# Athens Stock Exchange Company Filings  
The purpose of this Twitter bot is to check for investor announcements from listed companies in the Athens Stock Exchange and tweet whenever a new one has been issued.

##### Import all dependencies

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import os
import shutil
import re
import difflib
import csv
import tweepy

##### Mine the Athens Stock Exchange website for company announcements
For this implementation, we're interested in:  
- Company name
- Company Ticker
- Announcement Date
- Announcement Title
- Link to full Announcement

In [2]:
def mine_data():
    """Scrape the company announcements page of the Athens Stock Exchange.

    Returns:
        A 5-tuple of lists
        ``(company_names, timestamps, titles, company_quotes, links)``
        built from the elements found in the announcements container.

    Raises:
        requests.RequestException: if the page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
        IndexError: if the announcements container is not in the page.
    """
    # Use requests to grab the HTML code
    url = 'http://www.helex.gr/web/guest/companies-announcements'
    # A timeout keeps an unresponsive server from hanging the bot forever
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page
    response.raise_for_status()

    # Use BeautifulSoup to parse the HTML code and extract the elements we're interested in
    doc = BeautifulSoup(response.text, "html.parser")

    # BeautifulSoup reads the DOM of the HTML code and allows us to grab certain parts by their tags
    container = doc.find_all('div', attrs={'id': 'fullist_abstracts'})[0]

    # One list per variable, so we can use them for building a dataframe later
    company_names = [
        company.text
        for company in container.find_all('h1', attrs={'class': 'header-title'})
    ]

    ## List of the companies quote symbol
    ## First find the nesting div
    stock_link_div = container.find_all('div', attrs={'class': 'asset-date'})
    ## Extract quote symbol if exists, add an empty [] if it doesn't
    company_quotes = [adiv.a.text if adiv.a is not None else '[]' for adiv in stock_link_div]

    timestamps_all = container.find_all('div', attrs={'class': 'popup-an-date'})
    # Date for each announcement is in a HH:MM format when they're published
    # but it changes to a DD MMM YY format later in the day
    # We append a universal DD MMM YYYY date to all announcements, taken from the server's local time
    date = time.strftime('%d %b %Y') + " "
    # If the mined date already carries the date, append it to the list with no changes
    # If it's only a time (less than 10 characters), add today's date and append it
    timestamps = [stamp.text + date if len(stamp.text) < 10 else stamp.text for stamp in timestamps_all]

    # The same anchors carry both the title (their text) and the link,
    # so query the document for them only once
    anchors = container.find_all('a', attrs={'data-google-analytics-tag': 'Companies_Announcements'})
    titles = [anchor.text for anchor in anchors]
    # Some string manipulation to extract the plain URL of each announcement
    links = ['http://' + anchor['ontouchstart'].split('http://')[1].split('/exclusive')[0] for anchor in anchors]

    ## Return the collected data
    return company_names, timestamps, titles, company_quotes, links

##### Build a pandas dataframe with the mined information

In [3]:
def build_dataframe():
    """Build the announcements dataframe from freshly mined data.

    The dataframe is also stored in the module-level variable ``mined``,
    which ``create_CSVs`` reads.

    Returns:
        pandas.DataFrame with the columns ``link``, ``quote``, ``timestamp``
        and ``title``, in that order.
    """
    global mined

    ## The company names are mined as well, but are not part of the dataframe
    _company_names, timestamps, titles, company_quotes, links = mine_data()

    # Pin the column order explicitly. The CSV diff and the tweet builder
    # expect the link to be the first field of every row, while the order
    # pandas derives from a dict differs between pandas/Python versions.
    mined = pd.DataFrame(
        data={'quote': company_quotes, 'title': titles, 'timestamp': timestamps, 'link': links},
        columns=['link', 'quote', 'timestamp', 'title'],
    )
    # Regex on timestamp to get rid of blanks and separate time from date.
    # The result is assigned back (instead of a chained inplace=True call) so
    # the dataframe is really updated under pandas Copy-on-Write as well.
    mined['timestamp'] = mined['timestamp'].replace(
        r" (\d{2}[:]\d{2}) (\d{2}) ([A-Za-z]{3}) (\d{2})(\d{2}) ",
        r"[\g<1>][\g<2> \g<3> \g<5>]",
        regex=True,
    )
    return mined

# Build the dataframe; this also populates the global `mined` that create_CSVs reads
build_dataframe()

Unnamed: 0,link,quote,timestamp,title
0,http://www.helex.gr/en/web/guest/companies-ann...,[TPEIR],[20:00][17 Oct 17],PRESS RELEASE
1,http://www.helex.gr/en/web/guest/companies-ann...,[BELA],[19:20][17 Oct 17],SHAREHOLDERS INVITATION TO THE REGULAR ANNUAL ...
2,http://www.helex.gr/en/web/guest/companies-ann...,[NIR],[19:05][17 Oct 17],PRESS RELEASE 9M 2017
3,http://www.helex.gr/en/web/guest/companies-ann...,[INKAT],[18:37][17 Oct 17],Announcement 9997/2017 (no English translation...
4,http://www.helex.gr/en/web/guest/companies-ann...,[EXAE],[17:39][17 Oct 17],Transactions by Eurobank Equities (regulated i...
5,http://www.helex.gr/en/web/guest/companies-ann...,[EPSIL],[17:32][17 Oct 17],Announcement 9987/2017 (no English translation...
6,http://www.helex.gr/en/web/guest/companies-ann...,[PETRO],[17:28][17 Oct 17],Announcement 9986/2017 (no English translation...
7,http://www.helex.gr/en/web/guest/companies-ann...,[ELPE],[15:51][17 Oct 17],Announcement of Regulated Information
8,http://www.helex.gr/en/web/guest/companies-ann...,[MLAND],[15:51][17 Oct 17],Announcement 9983/2017 (no English translation...
9,http://www.helex.gr/en/web/guest/companies-ann...,[HTO],[14:30][17 Oct 17],Announcement


##### Save the dataframes to CSV, build a basefile, diff CSVs with basefile
- The first time we run the script, it will save a basefile B against which we will compare the data we mine the next time.  
- The second time we run the script, it will save a timestamped_file T and it will compare it with the basefile B. If and only if there are new rows in the timestamped_file T, a diff_timestamped file D will be created that will contain only the new rows.
- On every subsequent run, the existing T overwrites B and becomes the new basefile B. A new T is then generated and compared against B. Again, if and only if there are new rows in the new T file, a new D file is created.
- At any time there is only one basefile B and one timestamped_file T, which ensures our comparisons are accurate.
- The previously generated B and T files can be copied before overwriting, for keeping them in our records.
- We will be using the diff'ed files D to create the tweets

In [24]:
def create_CSVs(filename_base, filename_path):
    """Save the mined dataframe and rotate the basefile / timestamped file pair.

    Reads the module-level dataframe ``mined``.

    - If ``filename_base`` does not exist yet, the dataframe is written to it
      and a copy is kept as ``copy_basefile.csv``. Nothing else happens.
    - Otherwise, if a ``timestamped_file*`` from a previous run is present in
      the current directory, it is backed up as ``copy<suffix>`` and then
      becomes the new basefile.
    - In both "otherwise" cases the dataframe is written to ``filename_path``
      and the global ``filename_tmstmpd`` is set to that path.

    Args:
        filename_base: path of the basefile to compare against.
        filename_path: path of the timestamped file to write for this run.
    """
    # Declared once at the top: the assignment below applies to the global
    global filename_tmstmpd

    # First run: there is no basefile yet, create it from the dataframe
    if not os.path.exists(filename_base):
        mined.to_csv(path_or_buf=filename_base, index=False)
        # If you're testing the code, and you cannot wait for new announcements to get published
        # Use the following to exclude the first 10 rows of the dataframe:
        # mined[10:].to_csv(path_or_buf=filename_base, index=False)

        # Later we will be overwriting the original basefile, so keep a copy of it
        shutil.copy(filename_base, "copy_basefile.csv")
        return

    # Look for a timestamped file left by a previous run. One prefix test is
    # used for both detecting and selecting the file, so the two can never
    # disagree. If several are present the lexicographically last name wins,
    # which is the most recent one for '%Y-%m-%d_%H-%M-%S' style names.
    previous = sorted(name for name in os.listdir() if name.startswith("timestamped_file"))
    if previous:
        existing_timestamped = previous[-1]

        # Copy the existing timestamped file for our records
        copied_timestamped = "copy" + existing_timestamped.split("timestamped_file")[1]
        shutil.copy(existing_timestamped, copied_timestamped)

        # Promote it to basefile. os.replace overwrites the old basefile on
        # every platform, whereas os.rename fails on Windows if it exists.
        os.replace(existing_timestamped, filename_base)

    # Save a new timestamped file from the dataframe
    filename_tmstmpd = filename_path
    mined.to_csv(path_or_buf=filename_path, index=False)
    # If you're testing the code, and you cannot wait for new announcements to get published
    # Use the following to exclude the first 5 rows of the dataframe:
    # mined[5:].to_csv(path_or_buf=filename_path, index=False)

# Write this run's CSV files; the timestamped file name embeds the current local time
create_CSVs(filename_base = "basefile.csv", filename_path = "timestamped_file_" + time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time())) + ".csv")

In [25]:
def diff_CSVs(filename_base, filename_tmstmpd):
    """Diff two announcement CSV files and save the newly added rows.

    When new rows are found they are written to a ``diff_<timestamp>.csv``
    file in the current directory and the global ``diff_filename`` is set to
    that file's name.

    Args:
        filename_base: path of the older CSV file (the basefile).
        filename_tmstmpd: path of the newer CSV file.

    Returns:
        The added diff lines (each still carrying its leading ``+``), or
        ``None`` when there are no new rows; in that case no file is written.
    """
    global diff_filename

    # Open the basefile and the timestamped_file. pandas writes its CSV files
    # as UTF-8 by default, so read them back as UTF-8 on every platform.
    with open(filename_base, 'r', encoding='utf-8') as base_file, \
            open(filename_tmstmpd, 'r', encoding='utf-8') as new_file:
        # Perform a diff
        diff = difflib.unified_diff(
            base_file.readlines(),
            new_file.readlines(),
            fromfile='hosts0',
            tofile='hosts1',
        )
        # Keep only the lines added in the timestamped file. Every row starts
        # with its link, so an added row starts with "+http". The match is
        # anchored to the start so "+h" inside a title is never mistaken for it.
        new_lines = [line.strip() for line in diff if line.startswith("+http")]

    # If there are no new lines, no diff file is generated
    if not new_lines:
        return None

    # We use a timestamp in the filename so we keep the different diff files in our records
    diff_filename = "diff_" + time.strftime('%Y-%m-%d_%H-%M-%S') + ".csv"

    # newline='' is what the csv module requires for files it writes
    with open(diff_filename, "w", newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for line in new_lines:
            # Parse the row with the csv module instead of splitting on ","
            # so quoted fields that contain commas stay in one piece.
            fields = next(csv.reader([line[1:]]))
            # Keep the "+" marker on the first field, as before
            fields[0] = "+" + fields[0]
            writer.writerow(fields)

    return new_lines
                
# Compare the basefile with the timestamped file written by create_CSVs.
# NOTE(review): create_CSVs only sets filename_tmstmpd when a basefile already
# existed, so on a very first run this call would raise NameError - confirm.
diff_CSVs(filename_base = "basefile.csv", filename_tmstmpd = filename_tmstmpd)

['+http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9987-2017-no-english-translation-available-,[EPSIL],[17:32][17 Oct 17],Announcement 9987/2017 (no English translation available)',
 '+http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9986-2017-no-english-translation-available-,[PETRO],[17:28][17 Oct 17],Announcement 9986/2017 (no English translation available)',
 '+http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-of-regulated-informa-451,[ELPE],[15:51][17 Oct 17],Announcement of Regulated Information',
 '+http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9983-2017-no-english-translation-available-,[MLAND],[15:51][17 Oct 17],Announcement 9983/2017 (no English translation available)',
 '+http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announc-1

##### Create the tweets list
- First we create the function that generates the tweet format (taking into consideration that a tweet can be at most 140 characters long and that any link, regardless of its size, takes up 23 characters).  
- Then we create the function that will turn each row from the latest diff_timestamped.csv file to a tweet with the specified format.

In [26]:
def create_tweet(row, max_length=140, link_length=23):
    """Format one announcement row as a tweet.

    The tweet has the form ``<quote><timestamp> - <title> - <link>``. When it
    would exceed ``max_length`` the title is cut and followed by ``...``.

    Args:
        row: mapping-like object (e.g. a dataframe row) with the string
            fields ``'link'``, ``'quote'``, ``'timestamp'`` and ``'title'``.
        max_length: maximum number of characters of a tweet.
        link_length: number of characters a link counts for, whatever its
            real size.

    Returns:
        The tweet text.
    """
    link = row['link']
    quote = row['quote'].strip()
    timestamp = row['timestamp'].strip()
    title = row['title']
    blank = ' - '
    dots = '...'

    # Length of everything except the title; the link counts as link_length
    fixed_len = len(quote) + len(timestamp) + len(blank) * 2 + link_length

    # If all the pieces fit, just put the tweet together
    if fixed_len + len(title) <= max_length:
        return quote + timestamp + blank + title + blank + link

    # Otherwise give the title only the remaining space and mark the cut with
    # three dots. Clamp at 0: a negative width would slice the title from its
    # end instead of truncating it.
    title_len = max(0, max_length - fixed_len - len(dots))
    return quote + timestamp + blank + title[:title_len] + dots + blank + link

In [28]:
def tweets_list():
    """Turn the rows of the latest diff CSV file into a list of tweets.

    Reads the file named by the global ``diff_filename`` and stores the
    result in the global ``tweets`` as well.

    Returns:
        A list with one tweet text (built by ``create_tweet``) per row.
    """
    global tweets

    ## Build a dataframe from the diff_timestamped.csv file
    difference = pd.read_csv(
        filepath_or_buffer=diff_filename,
        delimiter=',',
        header=None,
        names=['link', 'quote', 'timestamp', 'title'],
    )
    ## Drop the leading "+" diff marker from the links.
    ## The result is assigned back (instead of a chained inplace=True call) so
    ## the dataframe is really updated under pandas Copy-on-Write as well.
    difference['link'] = difference['link'].replace(r"^\+", "", regex=True)

    # We are applying the create_tweet function to the difference dataframe
    # And we save the generated tweets in a list
    tweets = list(difference.apply(create_tweet, axis=1))

    return tweets

# Build the tweets from the latest diff file (diff_filename is only set by
# diff_CSVs when it found new rows)
tweets_list()

['[EPSIL][17:32][17 Oct 17] - Announcement 9987/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9987-2017-no-english-translation-available-',
 '[PETRO][17:28][17 Oct 17] - Announcement 9986/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9986-2017-no-english-translation-available-',
 '[ELPE][15:51][17 Oct 17] - Announcement of Regulated Information - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-of-regulated-informa-451',
 '[MLAND][15:51][17 Oct 17] - Announcement 9983/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9983-2017-no-english-translation-available-',
 '[HTO][14:30][17 Oct 17] - Announcement - http://www.helex.gr/en/web/guest/companies-announ

##### Connect to your Twitter account and post your Tweets
The most convenient way is to use the Twitter API with the Tweepy library. Make sure you have registered for API keys and tokens.

In [29]:
def connect_to_Twitter():
    """Authenticate with Twitter and expose the client as the global ``api``.

    The credentials below are placeholders and have to be replaced with your
    own API keys and tokens.
    """
    global api

    ## Credentials
    credentials = {
        'consumer_key': 'insert_your_key_here',
        'consumer_secret': 'insert_your_key_here',
        'access_token': 'insert_your_token_here',
        'access_token_secret': 'insert_your_token_here',
    }

    ## oAuth connect
    handler = tweepy.OAuthHandler(credentials['consumer_key'], credentials['consumer_secret'])
    handler.set_access_token(credentials['access_token'], credentials['access_token_secret'])
    api = tweepy.API(handler)

# Create the global Tweepy client `api` that post_tweets uses
connect_to_Twitter()

In [31]:
def post_tweets(tweets):
    """Post every tweet through the global ``api`` client.

    Args:
        tweets: sequence of tweet texts; it is walked from its last element
            to its first, so that the oldest announcement is posted first.

    Returns:
        The ``tweets`` argument, unchanged.
    """
    # Pause after each post if you don't want to flood your Twitter API use
    pause_seconds = 5

    # Last element first: from oldest to newest
    for status_text in reversed(tweets):
        api.update_status(status=status_text)
        time.sleep(pause_seconds)

    return tweets

# Post the global `tweets` list built by tweets_list, last element first
post_tweets(tweets = tweets)

['[EPSIL][17:32][17 Oct 17] - Announcement 9987/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9987-2017-no-english-translation-available-',
 '[PETRO][17:28][17 Oct 17] - Announcement 9986/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9986-2017-no-english-translation-available-',
 '[ELPE][15:51][17 Oct 17] - Announcement of Regulated Information - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-of-regulated-informa-451',
 '[MLAND][15:51][17 Oct 17] - Announcement 9983/2017 (no English translation available) - http://www.helex.gr/en/web/guest/companies-announcements/-/asset_publisher/ful1/content/announcement-9983-2017-no-english-translation-available-',
 '[HTO][14:30][17 Oct 17] - Announcement - http://www.helex.gr/en/web/guest/companies-announ

# Check your Twitter accounts - your tweets should have been posted!