Import modules

In [1]:
from joblib import dump, load
import pandas as pd
import numpy as np
import re
import string
import pytz
import datetime
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegressionCV
from text_unidecode import unidecode

# Load models
Define directory where the models are stored:

In [2]:
# Folder that holds the serialized vectorizers and models loaded in the next cells.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
dir_models = "D:/Project data/Data Project Sentiment Race/02_models/"

Load vectorizers:

In [3]:
# joblib.load unpickles the files, so only load artefacts from a trusted source.
# Bag-of-words vectorizer applied to the cleaned text:
vectorizer_bow = load(dir_models+'vectorizer_bow.joblib')
# TF-IDF step applied to the output of the bag-of-words vectorizer:
vectorizer_tfidf = load(dir_models+'vectorizer_tfidf.joblib')

Load models:

In [4]:
# Both classifiers are applied to the TF-IDF features further down.
# joblib.load unpickles the files, so only load artefacts from a trusted source.
naivebayes = load(dir_models+'naivebayes_tfidf.joblib') 
logistic = load(dir_models+'logistic_tfidf_cv.joblib')

Define function for dividing iterable into chunks:

In [5]:
def get_chunks(iterable, chunk_size):
    """Yield consecutive row-slices of ``iterable`` with at most ``chunk_size`` rows each.

    Parameters
    ----------
    iterable : object exposing ``shape[0]`` and supporting ``iterable[j:k]``
        For example a NumPy array or a SciPy sparse matrix.
    chunk_size : int
        Maximum number of rows per chunk; must be positive.

    Yields
    ------
    Slices of ``iterable`` in their original order; concatenating them
    reproduces the input. An empty input is yielded once, unchanged.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    size = iterable.shape[0]
    if size == 0:
        # Hand an empty input back once so callers still receive one chunk.
        yield iterable
        return
    # Stepping through the start offsets caps every chunk, including the last
    # one, at chunk_size rows (the remainder is no longer glued onto the
    # preceding chunk).
    for start in range(0, size, chunk_size):
        yield iterable[start:start + chunk_size]

Define a function that makes the Naive Bayes predictions chunk by chunk, so that only one chunk of the sparse feature matrix has to be converted to a dense array at a time (this avoids memory issues):

In [6]:
def predict_GaussianNB(model, X, chunk_size):
    """Predict with ``model`` on ``X`` one chunk of rows at a time.

    Each chunk is converted to a dense array with ``.toarray()`` before it is
    passed to ``model.predict``, so only one chunk is dense at any moment.

    Parameters
    ----------
    model : object with a ``predict`` method
        The function name suggests a Gaussian Naive Bayes model, but nothing
        beyond ``predict`` is used here.
    X : matrix exposing ``shape[0]`` and row slicing, whose slices offer ``.toarray()``
        For example a SciPy sparse matrix.
    chunk_size : int
        Number of rows handed to ``get_chunks`` as the chunk size.

    Returns
    -------
    numpy.ndarray
        The per-chunk predictions concatenated in row order.
    """
    # Starting from an empty float array keeps the result dtype the same as
    # when the predictions were appended to ``np.empty(0)`` one by one.
    predictions = [np.empty(0)]
    for X_i in get_chunks(X, chunk_size):
        predictions.append(model.predict(X_i.toarray()))
    # A single concatenation avoids re-copying the accumulated result on every
    # iteration.
    return np.concatenate(predictions)

# Data processing
In this section the functions and the data necessary for processing the tweets are defined/loaded.

## Data location
Define the directories where the data is located:

In [7]:
# NOTE(review): absolute, machine-specific paths; adjust them when running elsewhere.
# Folder with the auxiliary files (data issues, company mapping, emoji tables):
dir_raw_data = "D:/Project data/Data Project Sentiment Race/00_raw/"
# Root folder of the message files and the NYSE closing-day file:
dir_original_data = 'D:/Original data/'

## Data issues
Load the file that summarises the issues with the data:

In [8]:
# Tab-separated overview of known data issues, one row per company ID ('rpid').
data_issue = pd.read_csv(dir_raw_data + 'data_issue_info.tsv', sep='\t')
# IDs of the companies flagged for exclusion (exclude == 1), as a NumPy array.
flagged_for_exclusion = data_issue['exclude'] == 1
to_be_excluded = data_issue.loc[flagged_for_exclusion, 'rpid'].values

## Mapping file
Load mapping file:

In [9]:
# Tab-separated company mapping. Later cells read its 'rpid', 'taq_ticker',
# 'original_name' and 'cleaned_name' columns.
company_mapping = pd.read_csv(
    dir_raw_data + "SP500_Company_Mapping.tsv",
    delimiter="\t")

Lowercase company's ticker and name:

In [10]:
# The message text is lower-cased before tickers and names are matched, so the
# mapping columns used to build those patterns are lower-cased as well.
for column in ('taq_ticker', 'original_name', 'cleaned_name'):
    company_mapping[column] = company_mapping[column].map(lambda value: value.lower())

Remove observations for which we have data issues:

In [11]:
# Boolean mask: True for mapping rows whose 'rpid' appears in to_be_excluded.
to_remove = company_mapping['rpid'].map(lambda x: x in to_be_excluded)
# Keep only the rows that are not flagged (re-running this cell is harmless:
# the flagged rows are already gone).
company_mapping = company_mapping.loc[~to_remove, ]

## Emoticons
Load emoticons:

In [12]:
# Semicolon-separated emoji table indexed by the 'unicode' column; its 'ftu8'
# column is used below to build the emoji regular expressions.
emojis = pd.read_csv(dir_raw_data + 'emojis.csv', delimiter=';', index_col='unicode')

Load tagged emoticons:

In [13]:
# Semicolon-separated table indexed by the 'unicode' column; its 'tag' column
# is compared against 'positive' and 'negative' below.
emojis_tags = pd.read_csv(dir_raw_data + 'emojis_tags.csv', delimiter=';', index_col='unicode')

Define regular expressions for positive and negative emoticons:

In [14]:
def _emoji_alternation(tag):
    """Build a regex alternation of the 'ftu8' strings of all emojis carrying ``tag``.

    The tagged rows are inner-joined with the emoji table on the shared index,
    and every 'ftu8' value is wrapped in its own group.
    """
    tagged = emojis_tags.loc[emojis_tags['tag'] == tag]
    joined = pd.concat([tagged, emojis], join='inner', axis=1)
    return '|'.join('(' + joined['ftu8'] + ')')


emojis_positive = _emoji_alternation('positive')
emojis_negative = _emoji_alternation('negative')

## Text cleaning function
Define function that cleans the text of each tweet:

In [15]:
def clean_text(text, regex_cashtag, regex_ticker, regex_name, regex_cleanname, regex_posemoji, regex_negemoji, lemmer):
    """Normalise one message into a space-separated string of lemmatised tokens.

    The substitutions run in a fixed order and later steps rely on earlier ones
    (e.g. the text is lower-cased before usernames, quarters, tickers and names
    are matched, and punctuation is only stripped at the very end).

    Parameters
    ----------
    text : str
        Raw message text.
    regex_cashtag, regex_ticker : str
        Patterns for the company's cashtag and bare ticker; matches become
        ' companycashtag ' and ' companytickertag '.
    regex_name, regex_cleanname : str
        Patterns for the company's name; matches become ' companynametag '.
    regex_posemoji, regex_negemoji : str
        Patterns for positive / negative emojis; matches become
        ' emojipostag ' / ' emojinegtag '.
    lemmer : object with a ``lemmatize(word)`` method
        Applied to every token of the cleaned text.

    Returns
    -------
    str
        The cleaned text. Because the final split is on single spaces, a
        leading or trailing space left by the substitutions is preserved.

    NOTE(review): the cleaning presumably has to match the preprocessing used
    when the loaded vectorizers were fitted; confirm before changing any
    pattern.
    """
    # Transliterate the text with unidecode:
    text = unidecode(text)
    # Replace positive emojis:
    text = re.sub(regex_posemoji, ' emojipostag ', text)
    # ... and the text emoticons :) ;) :-) =) :D
    text = re.sub('(:[)])|(;[)])|(:-[)])|(=[)])|(:D)', ' emojipostag ', text)
    # Replace negative emojis:
    text = re.sub(regex_negemoji, ' emojinegtag ', text)
    # ... and the text emoticons :( :-( =(
    text = re.sub('(:[(])|(:-[(])|(=[(])', ' emojinegtag ', text)
    # Remove other emojis (any remaining <...> token made of lower-case letters and digits):
    text = re.sub('[<][a-z0-9]+[>]', ' ', text)
    # Remove HTML tags and HTML entities:
    cleanhtml = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = re.sub(cleanhtml, '', text)
    # Drop any remaining non-ASCII characters:
    text = text.encode("ascii", errors="ignore").decode()
    # Lower case all letters
    text = text.lower()
    # Remove "'s" when it is followed by whitespace
    text = re.sub(r"'s(?=\s)", ' ', text)
    # Replace usernames with "usernametag"
    text = re.sub(r'[@]\w+(?=\s|$)', ' usernametag ', text)
    # Replace Twitter pictures with "picturetag"
    # NOTE(review): the dots are unescaped, so they match any character.
    text = re.sub(r'pic.twitter.com/[0-9a-zA-Z]*(?=\s|$)', ' picturetag ', text)
    # Replace URLs with "urltag"
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' urltag ', text)
    # The quarter patterns have no word boundaries, so they also match inside
    # longer tokens.
    # Replace Q1 with first quarter tag:
    text = re.sub('q1', ' firstquartertag ', text)
    # Replace Q2 with second quarter tag:
    text = re.sub('q2', ' secondquartertag ', text)
    # Replace Q3 with third quarter tag:
    text = re.sub('q3', ' thirdquartertag ', text)
    # Replace Q4 with fourth quarter tag:
    text = re.sub('q4', ' fourthquartertag ', text)
    # Replace percent numbers with tag:
    text = re.sub(r'([+-]*\d+[.,:]\d+[%])|([+-]*\d+[%])', ' numbertag ', text)
    # Replace numbers with tag:
    text = re.sub(r'([+-]*\d+[.,:]\d+)|([+-]*\d+)', ' numbertag ', text)
    # Replace company cashtag
    text = re.sub(regex_cashtag, ' companycashtag ', text)
    # Replace company ticker
    text = re.sub(regex_ticker, ' companytickertag ', text)
    # Replace all other cashtags with a tag
    # NOTE(review): the range A-z also covers the characters between 'Z' and
    # 'a' ([ \ ] ^ _ `); A-Z was probably intended - confirm before changing.
    text = re.sub(r'[$]\b[a-zA-z]+\b', ' cashtag ', text)
    # Replace company name with tag:
    text = re.sub(regex_name, ' companynametag ', text)
    text = re.sub(regex_cleanname, ' companynametag ', text)
    # Characters repeated three or more times in a row are shortened to two (e.g. loooool -> lool):
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    # Remove remaining punctuation
    text = re.sub('['+string.punctuation+']', ' ', text)
    # Collapse every run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Lemmatize text:
    text = ' '.join([lemmer.lemmatize(word) for word in text.split(' ')])
    return text

Initialize the lemmatizer needed by the cleaning function:

In [16]:
# One shared lemmatizer instance, passed to clean_text for every message.
lemmer = WordNetLemmatizer()

## Holidays

In [17]:
# Tab-separated list of NYSE closing days.
holidays = pd.read_csv(dir_original_data + 'Miscellaneous/NYSE_closing_days.tsv', sep='\t')
# Give the three columns fixed names and discard the time column.
holidays.columns = ['Date', 'Time', 'Holiday']
holidays = holidays.drop(columns='Time')
# Holiday flag as a boolean (True where the file holds 1) and dates as timestamps.
holidays['Holiday'] = holidays['Holiday'].map(lambda flag: flag == 1)
holidays['Date'] = holidays['Date'].map(lambda raw_date: pd.Timestamp(raw_date))

In [18]:
# One row per calendar day from 2010-01-01 to 2019-01-01 (both inclusive).
closing_info = pd.DataFrame({
    'Date': pd.date_range(start=datetime.datetime(2010, 1, 1),
                          end=datetime.datetime(2019, 1, 1)),
})
# Saturday (5) and Sunday (6) count as weekend.
closing_info['Weekend'] = closing_info['Date'].map(lambda day: day.weekday() >= 5)
# Attach the holiday flag; days missing from the holiday file are not holidays.
closing_info = closing_info.merge(holidays, how='left', on='Date')
closing_info['Holiday'] = closing_info['Holiday'].fillna(False)
# The market is closed on weekends and on holidays.
closing_info['Closed'] = closing_info.apply(lambda row: row['Weekend'] or row['Holiday'], axis=1)
# Store plain dates so the table can be merged on the 'Date' of the messages.
closing_info['Date'] = closing_info['Date'].dt.date
closing_info = closing_info.drop(columns=['Weekend', 'Holiday'])


# Process messages
Define location where the processed data should be saved:

In [19]:
# Output folder for the aggregated daily sentiment files.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
dir_processed = "D:/Project data/Data Project Sentiment Race/01_processed/"

Define bullishness measure for aggregating intra-day sentiment ([Antweiler and Frank, 2004](https://onlinelibrary.wiley.com/doi/10.1111/j.1540-6261.2004.00662.x))

In [20]:
def bull(x):
    """Return log((1 + #positive) / (1 + #negative)) for the values in ``x``.

    Values equal to zero count towards neither side; an input without any
    positive or negative values gives 0.
    """
    n_positive = np.sum(x > 0)
    n_negative = np.sum(x < 0)
    return np.log((1 + n_positive) / (1 + n_negative))

Define whether the aggregation takes place close-to-close or open-to-open:

In [21]:
# Aggregation scheme: 'c2c' (close-to-close) or 'o2o' (open-to-open).
aggregation_tweets = 'c2c'
# (hours, minutes) added to each Eastern Time timestamp before its calendar
# date is taken:
#   'c2c': +8h     -> messages from 16:00 ET onwards are dated to the next day
#   'o2o': -9h30m  -> messages before 09:30 ET are dated to the previous day
# Any other value leaves the timestamps unshifted.
_shift_by_scheme = {
    'c2c': (8, 0),
    'o2o': (-9, -30),
}
hours_shift, minutes_shift = _shift_by_scheme.get(aggregation_tweets, (0, 0))

Filtering of the messages, i.e. which messages should we keep in the sample:

In [22]:
# Keep only messages which contain the company's cashtag:
has_cashtag = True
# Keep only messages which only contain the compnay's cashtag:
unique_cashtag = True
# Unique cashtags makes only sense if we keep only tweets which mention the company's cashtag:
unique_cashtag = unique_cashtag and has_cashtag

Define the file-name details used when saving the aggregated sentiment:

In [23]:
# File-name suffix: the aggregation scheme followed by one marker per active
# filter.
file_specifications = (
    aggregation_tweets
    + ('_cashtag_only' if has_cashtag else '')
    + ('_unique' if unique_cashtag else '')
)

## StockTwits
Load StockTwits messages, clean text, predict sentiment, aggregate sentiment on daily level:

In [24]:
# Dates on which the market is closed; messages dated to one of them are rolled
# forward to the next open day. Dates outside the calendar in closing_info are
# left untouched (a left merge gave them Closed = NaN, which is truthy and kept
# the former roll-forward loop running forever).
closed_dates = set(closing_info.loc[closing_info['Closed'].astype(bool), 'Date'])
one_day = datetime.timedelta(days=1)
# The per-company results are collected in a list and concatenated once after
# the loop: DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
frames_stocktwits = []
tz_NY = pytz.timezone('America/New_York')
for rpid_i in company_mapping['rpid'].unique():
    # Load data for the company with ID 'rpid_i':
    # NOTE(review): the "ANSI" codec is only available on Windows.
    data_i = pd.read_csv(
        dir_original_data + 'StockTwits SP500/' + rpid_i + '_tweets.tsv',
        encoding="ANSI", quotechar='"', delimiter="\t", engine='python')
    # Keep only relevant columns (as a copy, so the column assignments below
    # act on an independent frame):
    data_i = data_i[['text', 'tweet_datetime']].copy()
    # Mapping rows of this company:
    mapping_i = company_mapping.loc[company_mapping['rpid'] == rpid_i]
    # NOTE(review): tickers and names are inserted into the patterns without
    # re.escape, so regex metacharacters in them (e.g. '.') act as wildcards.
    # Define regular expression for the company's cashtag:
    cashtag_regex_i = '|'.join(r'([$]{1}\b' + mapping_i['taq_ticker'] + r'\b)')
    ticker_regex_i = '|'.join(r'(\b' + mapping_i['taq_ticker'] + r'\b)')
    # Define regular expression for the company's name:
    name_regex_i = '|'.join(r'(\b' + mapping_i['original_name'] + r'\b)')
    nameclean_regex_i = '|'.join(r'(\b' + mapping_i['cleaned_name'] + r'\b)')
    # Clean text data:
    data_i['text'] = data_i['text'].map(lambda x: clean_text(x,
                                                             cashtag_regex_i,
                                                             ticker_regex_i,
                                                             name_regex_i,
                                                             nameclean_regex_i,
                                                             emojis_positive,
                                                             emojis_negative,
                                                             lemmer))
    # Count number of company cashtags:
    data_i['num_companycashtag'] = data_i['text'].map(lambda x: len(re.findall(r'\bcompanycashtag\b', x)))
    # Count number of other cashtags:
    data_i['num_cashtag'] = data_i['text'].map(lambda x: len(re.findall(r'\bcashtag\b', x)))
    # If wanted, remove messages that do not mention the company's cashtag:
    if has_cashtag:
        data_i = data_i.loc[data_i['num_companycashtag']>0]
    # If wanted, remove messages that mention other cashtags:
    if unique_cashtag:
        data_i = data_i.loc[data_i['num_cashtag']==0]
    # Nothing left for this company: skip it, because scikit-learn estimators
    # reject inputs without any rows.
    if data_i.empty:
        continue
    data_i = data_i.copy()
    # Transform strings to timestamps:
    data_i['tweet_datetime'] = data_i['tweet_datetime'].map(lambda x: pd.Timestamp(x))
    # Change timezone to Eastern Time:
    data_i['tweet_datetime_ET'] = data_i['tweet_datetime'].map(lambda x: x.astimezone(tz_NY))
    # Shift time depending on the aggregation scheme chosen previously:
    data_i['tweet_datetime_ET_shifted'] = data_i['tweet_datetime_ET'].map(lambda x: x + datetime.timedelta(hours=hours_shift, minutes=minutes_shift))
    # Define date based on the shifted ET timestamp:
    data_i['Date'] = data_i['tweet_datetime_ET_shifted'].dt.date
    # Vectorize the text data:
    X_i = vectorizer_bow.transform(data_i['text'])
    X_i = vectorizer_tfidf.transform(X_i)
    # Predict sentiment of the messages using the logistic and the naive bayes
    # model; (prediction - 0.5) * 2 maps the labels 0/1 to -1/+1:
    data_i['Logistic'] = (logistic.predict(X_i) -0.5)*2
    data_i['NaiveBayes'] = (predict_GaussianNB(naivebayes, X_i, 10000) -0.5)*2
    # For the aggregation, shift the date of messages posted during holidays or
    # weekends to the next trading day. The loop terminates because
    # closed_dates is finite and every pass moves the affected dates forward.
    data_i = data_i[['Date', 'Logistic', 'NaiveBayes']].copy()
    while data_i['Date'].isin(closed_dates).any():
        data_i['Date'] = data_i['Date'].map(lambda d: d + one_day if d in closed_dates else d)
    # Aggregate sentiments on a daily basis:
    sentiment_i = data_i.groupby('Date').aggregate({'Logistic': [bull, np.mean], 'NaiveBayes': [bull, np.mean]})
    # Delete the raw data:
    del data_i
    # Transform multi-index column names to single level:
    sentiment_i.columns = ['_'.join(col).strip() for col in sentiment_i.columns.values]
    # Date (which acts as an index) to a column:
    sentiment_i = sentiment_i.reset_index()
    # Add information about RavenPack ID:
    sentiment_i['rpid'] = rpid_i
    # Collect the result:
    frames_stocktwits.append(sentiment_i)

# Combine the per-company results (an empty frame if no company produced any):
sentiment_stocktwits = (pd.concat(frames_stocktwits, ignore_index=True)
                        if frames_stocktwits else pd.DataFrame())

Save the StockTwits sentiment:

In [25]:
# With the default to_csv settings the row index is written as an unnamed
# first column.
sentiment_stocktwits.to_csv(dir_processed + 'StockTwits_daily_' + file_specifications + '.csv')

Delete data frame from memory:

In [26]:
# Drop the reference so the frame can be garbage-collected before the Twitter run.
del sentiment_stocktwits

## Twitter

In [27]:
# Dates on which the market is closed; messages dated to one of them are rolled
# forward to the next open day. Dates outside the calendar in closing_info are
# left untouched (a left merge gave them Closed = NaN, which is truthy and kept
# the former roll-forward loop running forever).
closed_dates = set(closing_info.loc[closing_info['Closed'].astype(bool), 'Date'])
one_day = datetime.timedelta(days=1)
# The per-company results are collected in a list and concatenated once after
# the loop: DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
frames_twitter = []
tz_NY = pytz.timezone('America/New_York')
for rpid_i in company_mapping['rpid'].unique():
    # Load data for the company with ID 'rpid_i':
    # NOTE(review): the "ANSI" codec is only available on Windows.
    data_i = pd.read_csv(
        dir_original_data + 'Twitter SP500/' + rpid_i + '_tweets.tsv',
        encoding="ANSI", quotechar='"', delimiter="\t", engine='python')
    # Keep only relevant columns (as a copy, so the column assignments below
    # act on an independent frame):
    data_i = data_i[['text', 'datetime']].copy()
    # Mapping rows of this company:
    mapping_i = company_mapping.loc[company_mapping['rpid'] == rpid_i]
    # NOTE(review): tickers and names are inserted into the patterns without
    # re.escape, so regex metacharacters in them (e.g. '.') act as wildcards.
    # Define regular expression for the company's cashtag:
    cashtag_regex_i = '|'.join(r'([$]{1}\b' + mapping_i['taq_ticker'] + r'\b)')
    ticker_regex_i = '|'.join(r'(\b' + mapping_i['taq_ticker'] + r'\b)')
    # Define regular expression for the company's name:
    name_regex_i = '|'.join(r'(\b' + mapping_i['original_name'] + r'\b)')
    nameclean_regex_i = '|'.join(r'(\b' + mapping_i['cleaned_name'] + r'\b)')
    # Clean text data:
    data_i['text'] = data_i['text'].map(lambda x: clean_text(x,
                                                             cashtag_regex_i,
                                                             ticker_regex_i,
                                                             name_regex_i,
                                                             nameclean_regex_i,
                                                             emojis_positive,
                                                             emojis_negative,
                                                             lemmer))
    # Count number of company cashtags:
    data_i['num_companycashtag'] = data_i['text'].map(lambda x: len(re.findall(r'\bcompanycashtag\b', x)))
    # Count number of other cashtags:
    data_i['num_cashtag'] = data_i['text'].map(lambda x: len(re.findall(r'\bcashtag\b', x)))
    # If wanted, remove tweets that do not mention the company's cashtag:
    if has_cashtag:
        data_i = data_i.loc[data_i['num_companycashtag']>0]
    # If wanted, remove tweets that mention other cashtags:
    if unique_cashtag:
        data_i = data_i.loc[data_i['num_cashtag']==0]
    # Nothing left for this company: skip it, because scikit-learn estimators
    # reject inputs without any rows.
    if data_i.empty:
        continue
    data_i = data_i.copy()
    # Transform strings to timestamps; the naive timestamps are interpreted as
    # Europe/Zurich local time, and ambiguous times are treated as DST:
    data_i['datetime'] = data_i['datetime'].map(lambda x: pd.Timestamp(x).tz_localize(tz='Europe/Zurich', ambiguous=True))
    # Change timezone to Eastern Time:
    data_i['datetime_ET'] = data_i['datetime'].map(lambda x: x.astimezone(tz_NY))
    # Shift time depending on the aggregation scheme chosen previously:
    data_i['datetime_ET_shifted'] = data_i['datetime_ET'].map(lambda x: x + datetime.timedelta(hours=hours_shift, minutes=minutes_shift))
    # Define date based on the shifted ET timestamp:
    data_i['Date'] = data_i['datetime_ET_shifted'].dt.date
    # Vectorize the text data:
    X_i = vectorizer_bow.transform(data_i['text'])
    X_i = vectorizer_tfidf.transform(X_i)
    # Predict sentiment of the messages using the logistic and the naive bayes
    # model; (prediction - 0.5) * 2 maps the labels 0/1 to -1/+1:
    data_i['Logistic'] = (logistic.predict(X_i) -0.5)*2
    data_i['NaiveBayes'] = (predict_GaussianNB(naivebayes, X_i, 10000) -0.5)*2
    # For the aggregation, shift the date of messages posted during holidays or
    # weekends to the next trading day. The loop terminates because
    # closed_dates is finite and every pass moves the affected dates forward.
    data_i = data_i[['Date', 'Logistic', 'NaiveBayes']].copy()
    while data_i['Date'].isin(closed_dates).any():
        data_i['Date'] = data_i['Date'].map(lambda d: d + one_day if d in closed_dates else d)
    # Aggregate sentiments on a daily basis:
    sentiment_i = data_i.groupby('Date').aggregate({'Logistic': [bull, np.mean], 'NaiveBayes': [bull, np.mean]})
    # Delete the raw data:
    del data_i
    # Transform multi-index column names to single level:
    sentiment_i.columns = ['_'.join(col).strip() for col in sentiment_i.columns.values]
    # Date (which acts as an index) to a column:
    sentiment_i = sentiment_i.reset_index()
    # Add information about RavenPack ID:
    sentiment_i['rpid'] = rpid_i
    # Collect the result:
    frames_twitter.append(sentiment_i)

# Combine the per-company results (an empty frame if no company produced any):
sentiment_twitter = (pd.concat(frames_twitter, ignore_index=True)
                     if frames_twitter else pd.DataFrame())

Save the Twitter sentiment:

In [28]:
# With the default to_csv settings the row index is written as an unnamed
# first column.
sentiment_twitter.to_csv(dir_processed + 'Twitter_daily_' + file_specifications + '.csv')

Delete data frame from memory:

In [29]:
# Drop the reference so the frame can be garbage-collected.
del sentiment_twitter