In [None]:
import pandas as pd
import os
import numpy as np
import json
from nltk import TweetTokenizer
import re
import matplotlib.pyplot as plt

In [None]:
# Data directory, addressed relative to the working directory: two levels up, then 'data'.
data_path = os.path.join(os.pardir, os.pardir, 'data')

# Raw data
The raw data can be found as [Data supplement/supporting information](https://www.pnas.org/content/114/52/13762/tab-figures-data) from the paper "Critical dynamics in population vaccinating behavior" by Pananos et al.

This raw data has been processed so that only tweets with 3 annotations, each either positive (1), negative (-1) or neutral (0), are present in the data. Furthermore, an agreement score was computed for each tweet. The cleaned data was then exported to the file `/data/vaccine_sentiment_data.csv`, which is the file loaded below.

The data consists of a total of 27'906 labelled tweets.

## Annotator agreement

In [None]:
# Read the annotation table from the CSV stored in the data directory.
labels_csv = os.path.join(data_path, 'vaccine_sentiment_data.csv')
df_labels = pd.read_csv(labels_csv)

In [None]:
# Number of rows for each distinct value of the 'agreement' column.
agreement_counts = df_labels['agreement'].value_counts()
agreement_counts

In [None]:
agg = df_labels['agreement'].values
# mean agreement and its standard deviation; the nan-aware reducers ignore missing values
agreement_mean = np.nanmean(agg)
agreement_std = np.nanstd(agg)
print('Mean agreement {} with std {}'.format(agreement_mean, agreement_std))

# Download tweets

Generate a set of Twitter API keys and download the tweets using the following command:
```
python download_tweets.py -i ./data/vaccine_sentiment_data.csv -o ./data/tweets.jsonl --consumerkey XXX --consumersecret XXX --accesstoken XXX  --accesssecret XXX
```


# Preprocess
Here we merge the data with the labels, tokenize the data and select only tweets with at least 3 words.

In [None]:
def preprocess_tweet(tweet):
    """Normalise a tweet: lowercase it and mask URLs and user mentions.

    Parameters
    ----------
    tweet : str
        Tweet text (in this notebook: the space-joined tokens of one tweet).

    Returns
    -------
    str
        The lowercased text in which every URL (``www.…``, ``http://…``,
        ``https://…``) is replaced by ``<url>`` and every ``@mention`` by
        ``<user>``.
    """
    tweet = tweet.lower()
    # Raw strings so that \. \s and \@ reach the regex engine untouched instead
    # of being treated as (invalid) Python string escapes.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '<url>', tweet)
    tweet = re.sub(r'(\@[^\s]+)', '<user>', tweet)
    try:
        # Best-effort ASCII folding for objects that offer .decode(); a value
        # without that method (AttributeError) is returned unchanged, exactly
        # as before.
        tweet = tweet.decode('unicode_escape').encode('ascii', 'ignore')
    except (AttributeError, UnicodeError):
        # Deliberately best-effort, but no longer swallows unrelated errors
        # (e.g. KeyboardInterrupt) the way a bare `except:` did.
        pass
    return tweet

In [None]:
def read_data(path=None):
    """Load the downloaded tweets from a JSON-lines file.

    Parameters
    ----------
    path : str, optional
        File to read, one JSON object per line. When omitted, the file
        ``tweets.jsonl`` inside ``data_path`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``tweet_id`` (``id_str`` converted
        to int) and ``text``. The two columns are present even when the file
        contains no tweets.
    """
    if path is None:
        path = os.path.join(data_path, 'tweets.jsonl')
    records = []
    # Explicit UTF-8 so that reading does not depend on the platform's default
    # text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            if not line.strip():
                continue
            tweet = json.loads(line)
            records.append({'tweet_id': int(tweet['id_str']), 'text': tweet['text']})
    return pd.DataFrame(records, columns=['tweet_id', 'text'])
# Build the tweet table (columns tweet_id and text) from the downloaded JSON-lines file.
df = read_data()

In [None]:
# Attach the annotation columns to the downloaded tweets; rows are matched on
# tweet_id using pandas' default join type.
df = pd.merge(df, df_labels, on='tweet_id')

In [None]:
# Tokenize every tweet, discard those that are too short/long or unparsable,
# and store the normalised text in the 'text_tokenized' column.
tknzr = TweetTokenizer()
indices = []  # positions of tweets which were used
indices_black_list = []  # positions of tweets which were discarded
tokenized_texts = []  # one entry per row of df, None for discarded tweets
total_count = len(df)
for i, text in enumerate(df['text'].values):
    if i % 1000 == 0:
        print('Tokenized {} out of {}'.format(i, total_count))
    try:
        cleaned = text.replace('\n', '').replace('\r', '').strip()
    except AttributeError:
        # Entry is not a string (e.g. a missing value), so it has no .replace().
        print("could not parse line.")
        indices_black_list.append(i)
        tokenized_texts.append(None)
        continue
    tokens = tknzr.tokenize(cleaned)
    # keep only tweets with at least 3 and fewer than 110 tokens
    if not 2 < len(tokens) < 110:
        indices_black_list.append(i)
        tokenized_texts.append(None)
        continue
    tokenized_texts.append(preprocess_tweet(' '.join(tokens)))
    indices.append(i)

# Assign the whole column at once: this is positional, so it does not depend on
# the frame's index labels, and it avoids one df.loc write per tweet.
df['text_tokenized'] = tokenized_texts
# This cell used to initialise a column called 'tweet_text_tokenized' that was
# never filled. Keep that name as a copy of the real result so that anything
# still reading it sees nulls for the discarded tweets.
df['tweet_text_tokenized'] = df['text_tokenized']

In [None]:
# Number of non-tokenizable tweets
# The tokenization loop writes its output to 'text_tokenized' and leaves rows it
# discarded null there, so that is the column to inspect.
non_tokenizable = df['text_tokenized'].isnull().sum()
tokenizable = len(df) - non_tokenizable
print('#tweets tokenizable:\t\t{}\n#tweets non-tokenizable:\t{}'.format(tokenizable, non_tokenizable))

In [None]:
# black listed tweets: first rows among the positions recorded as discarded
black_listed = df.iloc[indices_black_list]
black_listed.head()

# FastText

FastText was installed the following way:
```
git clone git@github.com:facebookresearch/fastText.git
cd fastText
pip install .
```

## Prepare data

In [None]:
# Export the labelled, tokenized tweets to all_data.csv.
# 1. Drop tweets without a tokenized text: those were discarded during
#    tokenization and must not end up in the exported data.
# 2. Rewrite each label as '__label__<value> ' (trailing space included).
#    .assign creates the column on a new frame instead of writing strings in
#    place into the existing label column.
df = (
    df.dropna(subset=['text_tokenized'])
      .assign(label=lambda d: d['label'].apply(lambda s: '__label__' + str(s) + ' '))
)
df = df[['label', 'text_tokenized']]
df.to_csv(os.path.join(data_path, 'all_data.csv'))

## Train
The following command runs a grid search through hyperparameters (ngrams, dimensions, epochs, learning rate).
A word of caution: The code produces very large output files (in total ~50 GB).
```
python ./code/fasttext/train.py
```


## Analyse results

In [None]:
# Load the results table from the data directory and preview its first rows.
results_csv = os.path.join(data_path, 'fasttext_results.csv')
results = pd.read_csv(results_csv)
results.head()

In [None]:
import ast

# The last column of each results row holds a Python dict literal as text.
# Parse it and add that row's precision / recall / f1 to the parsed dict,
# keyed by the row's position.
results_new = {}
for i, raw_params in enumerate(results.iloc[:, -1]):
    entry = ast.literal_eval(raw_params)
    for metric in ('precision', 'recall', 'f1'):
        entry[metric] = results.loc[i, metric]
    results_new[i] = entry

In [None]:
# Turn {position: {name: value}} into a table with one row per position.
# orient='index' builds the rows directly, so every column keeps its own
# inferred dtype; building column-wise and transposing instead leaves
# mixed-type data as object dtype throughout.
results_new = pd.DataFrame.from_dict(results_new, orient='index')
results_new.head()

In [None]:
# max precision: find the position of the largest precision value, then show that row
best_position = results_new['precision'].values.argmax()
results_new.iloc[best_position]

In [None]:
# Show the current working directory; the relative data_path is resolved against it.
os.getcwd()

In [None]:
import matplotlib.patches as mpatches

# Scatter of precision against dimension, one colour per ngrams value.
color_dict = {1: 'red', 2: 'green', 3: 'blue'}
colors = [color_dict[int(c)] for c in results_new['ngrams']]

# Same drawing as the pyplot shortcuts, spelled out on the current figure/axes.
fig = plt.gcf()
fig.clf()
ax = fig.gca()
ax.scatter(results_new['dim'], results_new['precision'], c=colors)
ax.set_xlabel('dimension')
ax.set_ylabel('precision')
ax.set_xlim(0, 900)
# Hand-built patches serve as legend entries, one per colour in color_dict.
leg = [mpatches.Circle((0.5, 0.5), color=color, label=k) for k, color in color_dict.items()]
ax.legend(handles=leg, title='ngrams')
plt.show()

In [None]:
# Scatter of recall against dimension, one colour per ngrams value.
color_dict = {1: 'red', 2: 'green', 3: 'blue'}
colors = [color_dict[int(c)] for c in results_new['ngrams']]

# Same drawing as the pyplot shortcuts, spelled out on the current figure/axes.
fig = plt.gcf()
fig.clf()
ax = fig.gca()
ax.scatter(results_new['dim'], results_new['recall'], c=colors)
ax.set_xlabel('dimension')
ax.set_ylabel('recall')
ax.set_xlim(0, 900)
# Hand-built patches serve as legend entries, one per colour in color_dict.
leg = [mpatches.Circle((0.5, 0.5), color=color, label=k) for k, color in color_dict.items()]
ax.legend(handles=leg, title='ngrams')
plt.show()

In [None]:
# Scatter of f1 against dimension, one colour per ngrams value.
color_dict = {1: 'red', 2: 'green', 3: 'blue'}
colors = [color_dict[int(c)] for c in results_new['ngrams']]

# Same drawing as the pyplot shortcuts, spelled out on the current figure/axes.
fig = plt.gcf()
fig.clf()
ax = fig.gca()
ax.scatter(results_new['dim'], results_new['f1'], c=colors)
ax.set_xlabel('dimension')
ax.set_ylabel('f1')
ax.set_xlim(0, 900)
# Hand-built patches serve as legend entries, one per colour in color_dict.
leg = [mpatches.Circle((0.5, 0.5), color=color, label=k) for k, color in color_dict.items()]
ax.legend(handles=leg, title='ngrams')
plt.show()

In [None]:
# Scatter of precision against learning rate, one colour per ngrams value.
color_dict = {1: 'red', 2: 'green', 3: 'blue'}
colors = [color_dict[int(c)] for c in results_new['ngrams']]

# Same drawing as the pyplot shortcuts, spelled out on the current figure/axes.
fig = plt.gcf()
fig.clf()
ax = fig.gca()
ax.scatter(results_new['l'], results_new['precision'], c=colors)
ax.set_xlabel('learning rate')
ax.set_ylabel('precision')
# No fixed x-limits here: the axis is left on autoscale.
# Hand-built patches serve as legend entries, one per colour in color_dict.
leg = [mpatches.Circle((0.5, 0.5), color=color, label=k) for k, color in color_dict.items()]
ax.legend(handles=leg, title='ngrams')
plt.show()

After some more testing, the optimal learning rate seems to be around 0.015. The selected model has the following hyperparameters:
```
    Dimensions: 100
    Epochs: 200
    ngrams: 3
    learning_rate: 0.015
```