## machine-learners

In this project we try out different models and feature sets to see what works best when trying to predict the sentiment of tweets about stocks. 

In [52]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [53]:
# Load the training and validation splits as NumPy arrays.
# Column 0 of each CSV is used as the text; column 1 is the label, cast to int.
train_examples = pd.read_csv('sent_train.csv').to_numpy()
X_train, y_train = train_examples[:, 0], train_examples[:, 1].astype('int')

test_examples = pd.read_csv('sent_valid.csv').to_numpy()
X_test, y_test = test_examples[:, 0], test_examples[:, 1].astype('int')

In [57]:
# Step I: Normalize
# Step II: Tokenize
# Step III: Get Features
# Step IV: Train Model
# Step V: Test to Get F1-Score & Other Measures. 

In [58]:
# The baseline. We don't try to do anything fancy here. We use this to compare with any improvements we make. 

def normalize_baseline(text):
    """Lowercase `text`, drop links, and tokenize the remainder.

    A link is "http" followed by one or more non-whitespace characters
    (matched after lowercasing). Returns whatever `word_tokenize` produces
    for the cleaned string.
    """
    lowered = text.lower()
    without_links = re.sub(r'http\S+', '', lowered)
    return word_tokenize(without_links)

# This returns:
#  (1) A map that maps a token to a spot in the feature array. 
def create_baseline_feature_dict(training_examples):
    """Build the token -> column-index map for the baseline features.

    Every example is run through `normalize_baseline`; each distinct token
    is assigned the next unused index, in order of first appearance, so the
    indices are 0-based and contiguous.

    Returns:
        dict mapping each token seen in `training_examples` to its index.
    """
    token_to_column = {}

    for text in training_examples:
        for tok in normalize_baseline(text):
            # The dict's current size is always the next free index, because
            # indices are handed out 0, 1, 2, ... one per new entry.
            # setdefault leaves tokens that were already seen untouched.
            token_to_column.setdefault(tok, len(token_to_column))

    return token_to_column

# Our baseline features are arrays with binary (0 or 1) values where we note whether or not a particular word has been seen. 
#
# The second argument is the token-to-index map; tokens that are not in it are ignored.
def featurize_baseline(examples, feature_dict):
    """Turn each example into a binary bag-of-words row.

    Args:
        examples: indexable collection of texts; each is passed through
            `normalize_baseline`.
        feature_dict: map from token to column index. Tokens missing from
            it are skipped.

    Returns:
        Array of shape (len(examples), len(feature_dict)) created with
        `np.zeros`, holding 1 where the example contains the column's token
        and 0 elsewhere.
    """
    n_rows, n_cols = len(examples), len(feature_dict)
    matrix = np.zeros((n_rows, n_cols))

    for row in range(n_rows):
        row_vec = matrix[row]  # view into `matrix`, so writes land in place
        for tok in normalize_baseline(examples[row]):
            if tok in feature_dict:
                # Presence only: repeated tokens just set the same cell again.
                row_vec[feature_dict[tok]] = 1

    return matrix

# Build the vocabulary from the training text only, then featurize both
# splits with that same map.
feature_dict = create_baseline_feature_dict(X_train)
X_train_baseline = featurize_baseline(X_train, feature_dict)
X_test_baseline = featurize_baseline(X_test, feature_dict)

# Fit multinomial Naive Bayes on the training features, then predict the test split.
mnb = MultinomialNB()
mnb.fit(X_train_baseline, y_train)
y_pred = mnb.predict(X_test_baseline)

# Number of training examples.
print(len(y_train))

# F1 with average=None (one score per class), followed by overall accuracy.
print(f1_score(y_test, y_pred, average=None))
print(accuracy_score(y_test, y_pred))

9543
[0.55083179 0.66666667 0.88155922]
0.8036013400335008
