## machine-learners

In this project we compare different models and feature sets to see which combination works best for predicting the sentiment of tweets about stocks.

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Load both CSV splits as 2-D arrays, then separate column 0 (the model input)
# from column 1 (the label, cast to int). The "valid" file is used as the test set.
train_examples = pd.read_csv('sent_train.csv').to_numpy()
test_examples = pd.read_csv('sent_valid.csv').to_numpy()

X_train, y_train = train_examples[:, 0], train_examples[:, 1].astype('int')
X_test, y_test = test_examples[:, 0], test_examples[:, 1].astype('int')

In [3]:
# Step I: Normalize
# Step II: Tokenize
# Step III: Get Features
# Step IV: Train Model
# Step V: Test to Get F1-Score & Other Measures. 

In [4]:
def token_one(texts):
    """Lowercase each text, strip URLs, and split it into tokens.

    Args:
        texts: Iterable of strings.

    Returns:
        A list holding one token list per input text, in input order.
    """
    def _clean(text):
        # Lowercasing happens first, so the pattern also catches "HTTP..." links.
        # Every run of non-whitespace starting with "http" is removed.
        return re.sub(r'http\S+', '', text.lower())

    return [word_tokenize(_clean(text)) for text in texts]

# Tokenizers to compare; each maps an iterable of texts to a list of token lists.
token_funcs = [token_one]

In [5]:
def build_feat_dict(training_examples):
    """Map every distinct token to its position in the feature array.

    Positions are assigned in first-seen order, starting at 0.

    Args:
        training_examples: Iterable of token sequences, one per example.

    Returns:
        dict mapping token -> integer position.
    """
    token_to_index = {}
    for tokens in training_examples:
        for token in tokens:
            # setdefault only inserts unseen tokens, and the next free position
            # is always the current vocabulary size.
            token_to_index.setdefault(token, len(token_to_index))
    return token_to_index

def featurize_one(X_train, X_test):
    """Baseline features: binary "word seen" indicators.

    The vocabulary is built from X_train only. Each example becomes a row of
    0/1 values, where a column is 1 when that vocabulary token occurs in the
    example. Tokens that are not in the vocabulary are ignored.

    Args:
        X_train: Sequence of token sequences; defines the vocabulary.
        X_test: Sequence of token sequences encoded with the same vocabulary.

    Returns:
        (train_features, test_features): two NumPy arrays of shape
        (number of examples, vocabulary size).
    """
    vocab = build_feat_dict(X_train)

    def _encode(examples):
        # One row per example, one column per vocabulary token.
        matrix = np.zeros((len(examples), len(vocab)))
        for row, tokens in enumerate(examples):
            for token in tokens:
                col = vocab.get(token)
                if col is not None:
                    matrix[row, col] = 1
        return matrix

    return _encode(X_train), _encode(X_test)

# Featurizers to compare; each takes (train_tokens, test_tokens) and returns
# (train_features, test_features). Append new featurizers to this list.
feature_funcs = [featurize_one]

In [6]:
def classify_mvb(X_train, X_test, y_train):
    """Fit a multinomial Naive Bayes model and predict labels for X_test.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to predict on.
        y_train: Training labels, aligned with the rows of X_train.

    Returns:
        The predictions returned by MultinomialNB.predict for X_test.
    """
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def classify_dt(X_train, X_test, y_train, random_state=0):
    """Fit a decision tree and predict labels for X_test.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to predict on.
        y_train: Training labels, aligned with the rows of X_train.
        random_state: Seed forwarded to DecisionTreeClassifier. The tree
            permutes features randomly at each split, so a fixed seed is
            needed for the reported score to be reproducible across runs.
            Pass None to get the unseeded behaviour.

    Returns:
        The predictions returned by DecisionTreeClassifier.predict for X_test.
    """
    clf = tree.DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

# Classifiers to compare; each takes (X_train, X_test, y_train) and returns
# predictions for X_test.
pred_funcs = [classify_mvb, classify_dt]


In [7]:
def get_score(token_func, feature_func, pred_func, X_train, y_train, X_test, y_test, metric=None):
    """Run one tokenize -> featurize -> classify pipeline and score it.

    Args:
        token_func: Callable mapping raw texts to token lists.
        feature_func: Callable taking (train_tokens, test_tokens) and
            returning (train_features, test_features).
        pred_func: Callable taking (train_features, test_features, y_train)
            and returning predictions for the test set.
        X_train: Raw training texts.
        y_train: Training labels.
        X_test: Raw test texts.
        y_test: True test labels.
        metric: Optional callable ``metric(y_true, y_pred)``. Defaults to
            accuracy_score, so existing callers are unaffected. To report F1
            instead, pass e.g.
            ``lambda t, p: f1_score(t, p, average='macro')``.

    Returns:
        Whatever ``metric`` returns for (y_test, predictions).
    """
    # Resolve the default lazily so the signature does not bind to
    # accuracy_score at definition time.
    if metric is None:
        metric = accuracy_score

    train_tokens = token_func(X_train)
    test_tokens = token_func(X_test)
    train_features, test_features = feature_func(train_tokens, test_tokens)
    y_pred = pred_func(train_features, test_features, y_train)
    return metric(y_test, y_pred)

# Evaluate every tokenizer x featurizer x classifier combination and print
# one score per combination, in nested-loop order.
for token_func in token_funcs:
    for feature_func in feature_funcs:
        for pred_func in pred_funcs:
            score = get_score(
                token_func=token_func,
                feature_func=feature_func,
                pred_func=pred_func,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
            )
            print(score)


0.8036013400335008
0.751675041876047
