Import modules:

In [1]:
import pandas as pd
import numpy as np
import datetime
import pytz
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, matthews_corrcoef
from joblib import dump, load
from text_unidecode import unidecode

Raw data
=======

Define directory where the data is loacated:

In [2]:
dir_raw_data = "C:/Users/danie/Documents/Research Data/Project data/Data Project Sentiment Race/00_raw/"
dir_original_data = 'C:/Users/danie/Documents/Research Data/Original data/StockTwits SP500/'

## Data issues
Load file that summarise the issues with the data:

In [3]:
data_issue = pd.read_csv(
    dir_raw_data + 'data_issue_info.tsv', 
    delimiter = '\t')

In [4]:
to_be_excluded = data_issue.loc[data_issue['exclude']==1, 'rpid'].values

## Mapping file
Load mapping file:

In [5]:
company_mapping = pd.read_csv(
    dir_raw_data + "SP500_Company_Mapping.tsv",
    delimiter="\t")

Lowercase company's ticker and name:

In [6]:
company_mapping['taq_ticker'] = company_mapping['taq_ticker'].map(lambda ticker: ticker.lower())
company_mapping['original_name'] = company_mapping['original_name'].map(lambda name: name.lower())
company_mapping['cleaned_name'] = company_mapping['cleaned_name'].map(lambda name: name.lower())

Remove observations for which we have data issues:

In [7]:
to_remove = company_mapping['rpid'].map(lambda x: x in to_be_excluded)
company_mapping = company_mapping.loc[~to_remove, ]

## Emoticons
Load emoticons:

In [8]:
emojis = pd.read_csv(dir_raw_data + 'emojis.csv', delimiter=';', index_col='unicode')

Load tagged emoticons:

In [9]:
emojis_tags = pd.read_csv(dir_raw_data + 'emojis_tags.csv', delimiter=';', index_col='unicode')

Define regular expressions for positive and negative emoticons:

In [10]:
emojis_positive = '|'.join('(' + pd.concat([emojis_tags.loc[emojis_tags['tag'] == 'positive'], emojis],
                                           join='inner', axis=1)['ftu8'] + ')')
emojis_negative = '|'.join('(' + pd.concat([emojis_tags.loc[emojis_tags['tag'] == 'negative'], emojis],
                                           join='inner', axis=1)['ftu8'] + ')')

## Text cleaning function
Define function that cleans the text of each tweet:

In [11]:
def clean_text(text, regex_cashtag, regex_ticker, regex_name, regex_cleanname, regex_posemoji, regex_negemoji, lemmer):
    # Transform text to unicode:
    text = unidecode(text)
    # Replace positive emojis:
    text = re.sub(regex_posemoji, ' emojipostag ', text)
    text = re.sub('(:[)])|(;[)])|(:-[)])|(=[)])|(:D)', ' emojipostag ', text)
    # Replace negative emojis:
    text = re.sub(regex_negemoji, ' emojinegtag ', text)
    text = re.sub('(:[(])|(:-[(])|(=[(])', ' emojinegtag ', text)
    # Remove other emojis:
    text = re.sub('[<][a-z0-9]+[>]', ' ', text)
    # Remove HTML tags:
    cleanhtml = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = re.sub(cleanhtml, '', text)
    # Change encoding to remove non-english words
    text = text.encode("ascii", errors="ignore").decode()
    # Lower case all letters
    text = text.lower()
    # Remove "'s"
    text = re.sub(r"'s(?=\s)", ' ', text)
    # Replace usernames with "usernametag"
    text = re.sub(r'[@]\w+(?=\s|$)', ' usernametag ', text)
    # Replace Twitter picuters with picturetag
    text = re.sub(r'pic.twitter.com/[0-9a-zA-Z]*(?=\s|$)', ' picturetag ', text)
    # Replace URLs with "urltag"
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' urltag ', text)
    # Replace Q1 with first quarter tag:
    text = re.sub('q1', ' firstquartertag ', text)
    # Replace Q2 with first quarter tag:
    text = re.sub('q2', ' secondquartertag ', text)
    # Replace Q3 with first quarter tag:
    text = re.sub('q3', ' thirdquartertag ', text)
    # Replace Q4 with first quarter tag:
    text = re.sub('q4', ' fourthquartertag ', text)
    # Replace percent numbers with tag:
    text = re.sub(r'([+-]*\d+[.,:]\d+[%])|([+-]*\d+[%])', ' numbertag ', text)
    # Replace numbers with tag:
    text = re.sub(r'([+-]*\d+[.,:]\d+)|([+-]*\d+)', ' numbertag ', text)
    # Replace company cashtag
    text = re.sub(regex_cashtag, ' companycashtag ', text)
    # Replace company ticker
    text = re.sub(regex_ticker, ' companytickertag ', text)
    # Replace all other cashtags with a tag
    text = re.sub(r'[$]\b[a-zA-z]+\b', ' cashtag ', text)
    # Replace company name with tag:
    text = re.sub(regex_name, ' companynametag ', text)
    text = re.sub(regex_cleanname, ' companynametag ', text)
    # Characters that appear more two or more times are shortened (e.g. loooool -> lool):
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    # Remove remaining punctuation
    text = re.sub('['+string.punctuation+']', ' ', text)
    # Remove double spaces
    text = re.sub(r'\s+', ' ', text)
    # Lemmatize text:
    text = ' '.join([lemmer.lemmatize(word) for word in text.split(' ')])
    return text

Initialize the lemmatizer need for the cleaning function:

In [12]:
lemmer = WordNetLemmatizer()

## Load tweets
Load and clean tweets:

In [13]:
data_tweets = pd.DataFrame()
for rpid_i in company_mapping['rpid'].unique():
    # Load data for the company with ID 'rpid_i':
    data_i = pd.read_csv(
        dir_original_data + rpid_i + '_tweets.tsv',
        encoding="ANSI", quotechar='"', delimiter="\t", engine='python')
    # Keep only observations which have been classified by users:
    data_i = data_i.loc[data_i['StockTwits_sentiment'] != 'None', ['StockTwits_sentiment', 'text', 'tweet_datetime']]
    # Define regular expression for the company's cashtag:
    cashtag_regex_i = '|'.join(r'([$]{1}\b' + company_mapping.loc[company_mapping['rpid'] == rpid_i, 'taq_ticker'] + r'\b)')
    ticker_regex_i = '|'.join(r'(\b' + company_mapping.loc[company_mapping['rpid'] == rpid_i, 'taq_ticker'] + r'\b)')
    # Define regular expression for the company's name:
    name_regex_i = '|'.join(r'(\b' + company_mapping.loc[company_mapping['rpid'] == rpid_i, 'original_name'] + r'\b)')
    nameclean_regex_i = '|'.join(r'(\b' + company_mapping.loc[company_mapping['rpid'] == rpid_i, 'cleaned_name'] + r'\b)')
    # Clean text data:
    data_i['text'] = data_i['text'].map(lambda x: clean_text(x,
                                                             cashtag_regex_i,
                                                             ticker_regex_i,
                                                             name_regex_i,
                                                             nameclean_regex_i,
                                                             emojis_positive,
                                                             emojis_negative,
                                                             lemmer))
    # Add RavenPack ID:
    data_i['rpid'] = rpid_i
    # Append data:
    data_tweets = data_tweets.append(data_i, ignore_index=True)

Convert string to datetime-object and add New York time zone (Eastern time):

In [14]:
tz_utc = pytz.timezone('UTC')
tz_NY = pytz.timezone('America/New_York')
data_tweets['tweet_datetime'] = data_tweets['tweet_datetime'].map(lambda x: pd.Timestamp(x))
data_tweets['tweet_datetime_ET'] = data_tweets['tweet_datetime'].map(lambda x: x.astimezone(tz_NY))
data_tweets['tweet_date_ET'] = data_tweets['tweet_datetime_ET'].dt.date

# Feature construction
## Training and test set
Define index of training observations. For consistency with [Renault (2017, JBF)](https://www.sciencedirect.com/science/article/abs/pii/S0378426617301589), the training period starts in June 2013 and ends in August 2014:

In [15]:
training_date_range = pd.date_range(start=datetime.datetime(2013, 6, 1), end=datetime.datetime(2014, 8, 31))
idx_train = data_tweets['tweet_date_ET'].map(lambda x: x in training_date_range)

## Target
Define the outcome variable:

In [16]:
lb = preprocessing.LabelBinarizer()
y_train = lb.fit_transform(data_tweets.loc[idx_train, 'StockTwits_sentiment'])
y_test = lb.transform(data_tweets.loc[~idx_train, 'StockTwits_sentiment'])

## Feature construction
Define different vectorizations of the text data.
* Bag-of-words: consider uni- and bi-grams, only keep tokens which appear at least in 0.1% of the tweets

In [46]:
vectorizer_bow = CountVectorizer(stop_words=['a', 'an', 'the'], min_df=0.001, ngram_range=(1, 2))
X_bow_train = vectorizer_bow.fit_transform(data_tweets.loc[idx_train, 'text'])
X_bow_test = vectorizer_bow.transform(data_tweets.loc[~idx_train, 'text'])

In [47]:
print(vectorizer_bow.get_feature_names())

['able', 'about', 'about companycashtag', 'about numbertag', 'about to', 'above', 'above numbertag', 'ac', 'according', 'account', 'acquisition', 'acting', 'action', 'active', 'activity', 'actually', 'ad', 'add', 'add to', 'added', 'added to', 'adding', 'advice', 'aeur', 'after', 'after earnings', 'after er', 'after hour', 'after numbertag', 'afternoon', 'again', 'against', 'ago', 'agree', 'ah', 'ahead', 'ahead of', 'air', 'alert', 'all', 'all day', 'all in', 'all of', 'all time', 'almost', 'almost numbertag', 'aloha', 'along', 'along with', 'already', 'also', 'always', 'am', 'amazing', 'amazon', 'america', 'analysis', 'analyst', 'analyst rating', 'and', 'and buy', 'and cashtag', 'and companycashtag', 'and hold', 'and it', 'and more', 'and now', 'and numbertag', 'and then', 'and they', 'and this', 'and up', 'and we', 'and will', 'and you', 'android', 'announce', 'announced', 'announcement', 'another', 'another numbertag', 'any', 'anyone', 'anything', 'app', 'appl', 'apple', 'apr', 'apr

* TF-IDF: transform the count matrix to tf-idf values

In [48]:
vectorizer_tfidf = TfidfTransformer()
X_tfidf_train = vectorizer_tfidf.fit_transform(X_bow_train)
X_tfidf_test = vectorizer_tfidf.transform(X_bow_test)

# Model training
## Logistic regression
Train logistic regression using the vectorized features.
* Bag-of-words:

In [49]:
logistic_bow_cv = LogisticRegressionCV(random_state=0, max_iter=2000, cv=5, Cs=20, scoring='f1').fit(X_bow_train, y_train.flatten())
y_logistic_bow_test = logistic_bow_cv.predict(X_bow_test)

* TF-IDF:

In [50]:
logistic_tfidf_cv = LogisticRegressionCV(random_state=0, max_iter=2000, cv=5, Cs=20, scoring='f1').fit(X_tfidf_train, y_train.flatten())
y_logistic_tfidf_test = logistic_tfidf_cv.predict(X_tfidf_test)

## Naive Bayes
Define function for dividing iterable into chunks:

In [51]:
def get_chunks(iterable, chunk_size):
    size = iterable.shape[0]
    if size < chunk_size:
        yield iterable
    chunks_nb = int(size / chunk_size)
    iter_ints = range(0, chunks_nb)
    for i in iter_ints:
        j = i * chunk_size
        if i + 1 < chunks_nb:
            k = j + chunk_size
            yield iterable[j:k]
        else:
            yield iterable[j:]

Define function that makes predictions with Naive Bayes by making the forecasts iteratively to avoid memory issues:

In [52]:
def predict_GaussianNB(model, X, chunk_size):
    y = np.empty(0)
    for X_i in get_chunks(X, chunk_size):
        y_i = model.predict(X_i.toarray())
        y = np.concatenate((y, y_i))
    return y

Train naive bayes classifier using the vectorized features.
* Bag-of-words:

In [53]:
naivebayes_bow = MultinomialNB().fit(X_bow_train.toarray(), y_train.flatten())
y_naivebayes_bow_test = predict_GaussianNB(naivebayes_bow, X_bow_test, 10000)

* TF-IDF:

In [54]:
naivebayes_tfidf = MultinomialNB().fit(X_tfidf_train.toarray(), y_train.flatten())
y_naivebayes_tfidf_test = predict_GaussianNB(naivebayes_tfidf, X_tfidf_test, 10000)

# Model evaluation
Create data-frame that summarises the accuracy and F1-score of the different models:

In [55]:
data = {'Model': ['Logistic', 'Logistic', 'Naive-Bayes', 'Naive-Bayes'], 'Feature': ['BoW', 'TF-IDF', 'BoW', 'TF-IDF'], 
       'Accuracy': [accuracy_score(y_test, y_logistic_bow_test), accuracy_score(y_test, y_logistic_tfidf_test), 
                   accuracy_score(y_test, y_naivebayes_bow_test), accuracy_score(y_test, y_naivebayes_tfidf_test)], 
       'F1': [f1_score(y_test, y_logistic_bow_test), f1_score(y_test, y_logistic_tfidf_test), 
                   f1_score(y_test, y_naivebayes_bow_test), f1_score(y_test, y_naivebayes_tfidf_test)]}
model_performance = pd.DataFrame(data)

Print evaluation of the models:

In [56]:
print(model_performance)

         Model Feature  Accuracy        F1
0     Logistic     BoW  0.785563  0.867935
1     Logistic  TF-IDF  0.786946  0.869006
2  Naive-Bayes     BoW  0.769542  0.853529
3  Naive-Bayes  TF-IDF  0.765179  0.862610


# Save models
Define directory where the models are saved:

In [57]:
dir_models = "D:/Project data/Data Project Sentiment Race/02_models/"

Save Bag-of-Words vectorizer (we need to save also this vectorizer since we use the TF-IDF transformer which needs a BoW vectorized feature matrix):

In [58]:
dump(vectorizer_bow, dir_models+'vectorizer_bow.joblib') 

['D:/Project data/Data Project Sentiment Race/02_models/vectorizer_bow.joblib']

Save TF-IDF vectorizer:

In [59]:
dump(vectorizer_tfidf, dir_models+'vectorizer_tfidf.joblib') 

['D:/Project data/Data Project Sentiment Race/02_models/vectorizer_tfidf.joblib']

Save Logistic regression model:

In [60]:
dump(logistic_tfidf_cv, dir_models+'logistic_tfidf_cv.joblib')

['D:/Project data/Data Project Sentiment Race/02_models/logistic_tfidf_cv.joblib']

Save Naive-Bayes model:

In [61]:
dump(naivebayes_tfidf, dir_models+'naivebayes_tfidf.joblib')

['D:/Project data/Data Project Sentiment Race/02_models/naivebayes_tfidf.joblib']

# Robustness checks
Robustness checks are only done for the tf-idf feature matrix used with the logistic regression model, as this is clearly the best performing setting.
## Remove Apple Inc.
Repeat the analysis but without the tweets about Apple. Since Apple is the company with the largest amount of tweets, this robustness check, controls that the accuracy is not driven only by Apple.

Define data set without Apple Tweets:

In [62]:
apple_id = 'D8442A'
data_tweets_no_apple = data_tweets[data_tweets.rpid!=apple_id]
idx_train_no_apple = data_tweets_no_apple['tweet_date_ET'].map(lambda x: x in training_date_range)

Vectorize text to create the feature matrix:

In [63]:
vectorizer_tfidf_no_apple = TfidfVectorizer(stop_words=['a', 'an', 'the'], min_df=0.001, ngram_range=(1, 1))
X_tfidf_no_apple_train = vectorizer_tfidf_no_apple.fit_transform(data_tweets_no_apple.loc[idx_train_no_apple, 'text'])
X_tfidf_no_apple_test = vectorizer_tfidf_no_apple.transform(data_tweets_no_apple.loc[~idx_train_no_apple, 'text'])

Define target variables without Apple:

In [64]:
y_no_apple_train = lb.fit_transform(data_tweets_no_apple.loc[idx_train_no_apple, 'StockTwits_sentiment'])
y_no_apple_test = lb.transform(data_tweets_no_apple.loc[~idx_train_no_apple, 'StockTwits_sentiment'])

Estimate the logistic model without Apple:

In [65]:
logistic_tfidf_no_apple_cv = LogisticRegressionCV(random_state=0, max_iter=2000, cv=5, Cs=20, scoring='f1').fit(X_tfidf_no_apple_train, y_no_apple_train.flatten())
y_logistic_tfidf_no_apple_test = logistic_tfidf_no_apple_cv.predict(X_tfidf_no_apple_test)

Compute accuracy of the model:

In [66]:
print(accuracy_score(y_no_apple_test, y_logistic_tfidf_no_apple_test))
print(f1_score(y_no_apple_test, y_logistic_tfidf_no_apple_test))

0.7822233087987649
0.8633259916703908


## Only uni-grams
Consider only uni-grams when defining the feature matrix:

In [67]:
vectorizer_tfidf_uni = TfidfVectorizer(stop_words=['a', 'an', 'the'], min_df=0.001, ngram_range=(1, 1))
X_tfidf_uni_train = vectorizer_tfidf_uni.fit_transform(data_tweets.loc[idx_train, 'text'])
X_tfidf_uni_test = vectorizer_tfidf_uni.transform(data_tweets.loc[~idx_train, 'text'])

Fit the logistic model with the unigrams:

In [68]:
logistic_tfidf_uni_cv = LogisticRegressionCV(random_state=0, max_iter=2000, cv=5, Cs=20, scoring='f1').fit(X_tfidf_uni_train, y_train.flatten())
y_logistic_tfidf_uni_test = logistic_tfidf_uni_cv.predict(X_tfidf_uni_test)

Evaluate the performance of the logistic model trained on uni-grams:

In [69]:
print(accuracy_score(y_test, y_logistic_tfidf_uni_test))
print(f1_score(y_test, y_logistic_tfidf_uni_test))

0.7838767356063683
0.8668061459006281
