## machine-learners

In this project we try out different models and feature sets to see what works best when trying to predict the sentiment of tweets about stocks. 

In [11]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielhettinger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# We read in the data and organize it into train & test.
train_examples = pd.read_csv('sent_train.csv').to_numpy()
X_train = train_examples[:,0]
y_train = train_examples[:,1].astype('int')

test_examples = pd.read_csv('sent_valid.csv').to_numpy()
X_test = test_examples[:,0]
y_test = test_examples[:,1].astype('int')

In [13]:
# Step I: Normalize
# Step II: Tokenize
# Step III: Get Features
# Step IV: Train Model
# Step V: Test to Get F1-Score & Other Measures. 

In [14]:
# This is the classification function that we use in all experiments. 
def classify_mvb(X_train, X_test, y_train):
    mnb = MultinomialNB()
    return mnb.fit(X_train, y_train).predict(X_test)

In [46]:
# Section 3.1

# Building the positive / negative lexicon. 
lines = []
with open('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt') as file:
    lines = [line.rstrip() for line in file]

sentiment_lexicon = {}
for line in lines:
    [word, sense, flag] = line.split()
    if sense == 'positive' and flag == '1':
        sentiment_lexicon[word] = 'positive'
    if sense == 'negative' and flag == '1':
        sentiment_lexicon[word] = 'negative'

# To tokenize we just lowercase and remove links. 
def tokenize_3_1(texts):
    tokenized_texts = []
    for text in texts: 
        tokenized_texts.append(word_tokenize(re.sub(r'http\S+', '', text.lower())))
    return tokenized_texts

def featurize_3_1(examples):
    featurized_examples = []
    for example in examples:
        neg_count = 0
        pos_count = 0

        for token in example:
            if token in sentiment_lexicon and sentiment_lexicon[token] == 'positive':
                pos_count += 1
            if token in sentiment_lexicon and sentiment_lexicon[token] == 'negative':
                neg_count += 1
        
        featurized_examples.append([neg_count, pos_count])
    
    return featurized_examples

train_tokens_3_1 = tokenize_3_1(X_train)
test_tokens_3_1 = tokenize_3_1(X_test)

train_features_3_1 = featurize_3_1(train_tokens_3_1)
test_features_3_1 = featurize_3_1(test_tokens_3_1)

y_pred_3_1 = classify_mvb(train_features_3_1, test_features_3_1, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_1)


0.6520100502512562

In [50]:
# Section 3.10

# Get a set of stop words. 
stopwords_set = set(stopwords.words('english'))
englishwords_set = set([word.lower() for word in words.words()])

# We tokenize, remove stop words & words that contain non-alphabetic characters. We then lemmatize all tokens. 
def tokenize_3_10(texts):
    tokenized_texts = []
    lemmatizer = WordNetLemmatizer()
    
    for text in texts:
        tokens = word_tokenize(re.sub(r'http\S+', '', text.lower()))
        #lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        #tokenized_texts.append([lemmatized_token for lemmatized_token in lemmatized_tokens if lemmatized_token not in stopwords_set and lemmatized_token in sentiment_lexicon])
        tokenized_texts.append([token for token in tokens if token not in stopwords_set and token in sentiment_lexicon])
    return tokenized_texts

def featurize_3_10(examples):
    featurized_examples = []
    for example in examples:
        neg_count = 0
        pos_count = 0

        for token in example:
            if token in sentiment_lexicon and sentiment_lexicon[token] == 'positive':
                pos_count += 1
            if token in sentiment_lexicon and sentiment_lexicon[token] == 'negative':
                neg_count += 1
        
        featurized_examples.append([neg_count, pos_count])
    
    return featurized_examples

train_tokens_3_10 = tokenize_3_10(X_train)
test_tokens_3_10 = tokenize_3_10(X_test)

train_features_3_10 = featurize_3_10(train_tokens_3_10)
test_features_3_10 = featurize_3_10(test_tokens_3_10)

y_pred_3_10 = classify_mvb(train_features_3_10, test_features_3_10, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_10)


0.6528475711892797

In [20]:
# Section 3.2

# Building the emotion lexicon. 
lines = []
with open('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt') as file:
    lines = [line.rstrip() for line in file]

emotion_lexicon = {}
for line in lines:
    [word, sense, flag] = line.split()

    if word not in emotion_lexicon:
        emotion_lexicon[word] = []

    if sense == 'anger' and flag == '1':
        emotion_lexicon[word].append(0)
    if sense == 'anticipation' and flag == '1':
        emotion_lexicon[word].append(1)
    if sense == 'disgust' and flag == '1':
        emotion_lexicon[word].append(2)
    if sense == 'fear' and flag == '1':
        emotion_lexicon[word].append(3)
    if sense == 'joy' and flag == '1':
        emotion_lexicon[word].append(4)
    if sense == 'sadness' and flag == '1':
        emotion_lexicon[word].append(5)
    if sense == 'surprise' and flag == '1':
        emotion_lexicon[word].append(6)
    if sense == 'trust' and flag == '1':
        emotion_lexicon[word].append(7)

# To tokenize we just lowercase and remove links. 
def tokenize_3_2(texts):
    tokenized_texts = []
    for text in texts: 
        tokenized_texts.append(word_tokenize(re.sub(r'http\S+', '', text.lower())))
    return tokenized_texts

def featurize_3_2(examples):
    featurized_examples = []
    for example in examples:
        features = [0] * 8

        for token in example:
            if token in emotion_lexicon:
                emotions = emotion_lexicon[token]
                for emotion in emotions:
                    features[emotion] += 1
        
        featurized_examples.append(features)
    
    return featurized_examples

train_tokens_3_2 = tokenize_3_2(X_train)
test_tokens_3_2 = tokenize_3_2(X_test)

train_features_3_2 = featurize_3_2(train_tokens_3_2)
test_features_3_2 = featurize_3_2(test_tokens_3_2)

y_pred_3_2 = classify_mvb(train_features_3_2, test_features_3_2, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_2)

0.647822445561139

In [49]:
# Section 3.11

# We tokenize, remove stop words & words that contain non-alphabetic characters. We then lemmatize all tokens. 
def tokenize_3_11(texts):
    tokenized_texts = []
    lemmatizer = WordNetLemmatizer()
    
    for text in texts:
        tokens = word_tokenize(re.sub(r'http\S+', '', text.lower()))
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokenized_texts.append([lemmatized_token for lemmatized_token in lemmatized_tokens if lemmatized_token not in stopwords_set and lemmatized_token in englishwords_set])
    
    return tokenized_texts

def featurize_3_11(examples):
    featurized_examples = []
    for example in examples:
        features = [0] * 8

        for token in example:
            if token in emotion_lexicon:
                emotions = emotion_lexicon[token]
                for emotion in emotions:
                    features[emotion] += 1
        
        featurized_examples.append(features)
    
    return featurized_examples

train_tokens_3_11 = tokenize_3_11(X_train)
test_tokens_3_11 = tokenize_3_11(X_test)

train_features_3_11 = featurize_3_11(train_tokens_3_11)
test_features_3_11 = featurize_3_11(test_tokens_3_11)

y_pred_3_11 = classify_mvb(train_features_3_11, test_features_3_11, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_11)

0.6432160804020101

In [33]:
# Section 3.3

# Get a set of stop words. 
stopwords_set = set(stopwords.words('english'))

# We tokenize and remove stop words. 
def tokenize_3_3(texts):
    tokenized_texts = []
    for text in texts:
        tokens = word_tokenize(re.sub(r'http\S+', '', text.lower()))
        tokenized_texts.append([token for token in tokens if token not in stopwords_set])
    
    return tokenized_texts

# Now we build a dictionary that maps a word to the position in the feature vector. 
train_tokens_3_3 = tokenize_3_3(X_train)

word_to_pos_3_3 = {}
curr_pos = 0

for tokens in train_tokens_3_3:
    for token in tokens:
        if token not in word_to_pos_3_3:
            word_to_pos_3_3[token] = curr_pos
            curr_pos += 1

print(word_to_pos_3_3)
print(len(word_to_pos_3_3))

# Feature vector is an array of binary values for if we've seen a word or not. 
def featurize_3_3(examples):
    featurized_examples = np.zeros((len(examples), len(word_to_pos_3_3)))

    for i in range(len(examples)):
        tokens = examples[i]
        for j in range(len(tokens)):
            if tokens[j] in word_to_pos_3_3:
                feature_idx = word_to_pos_3_3[tokens[j]]
                featurized_examples[i][feature_idx] = 1
    return featurized_examples

test_tokens_3_3 = tokenize_3_3(X_test)

train_features_3_3 = featurize_3_3(train_tokens_3_3)
test_features_3_3 = featurize_3_3(test_tokens_3_3)

y_pred_3_3 = classify_mvb(train_features_3_3, test_features_3_3, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_3)

17714


0.7977386934673367

In [29]:
# Section 3.4

# We tokenize and remove stop words. 
def tokenize_3_4(texts):
    tokenized_texts = []
    for text in texts:
        tokens = word_tokenize(re.sub(r'http\S+', '', text.lower()))
        tokenized_texts.append([token for token in tokens if token not in stopwords_set])
    
    return tokenized_texts

# Now we build a dictionary that maps a word to the position in the feature vector. 
train_tokens_3_4 = tokenize_3_3(X_train)

bigram_to_pos_3_4 = {}
curr_pos = 0

for tokens in train_tokens_3_3:
    for i in range(len(tokens)):
        token_one = '' 
        token_two = '' 

        if i == 0:
            token_one = 'START_TOKEN'
            token_two = tokens[i]
        elif i == (len(tokens) - 1):
            token_one = tokens[i]
            token_two = 'END_TOKEN'      
        else:
            token_one = tokens[i]
            token_two = tokens[i + 1]

        bigram = token_one + '///' + token_two

        if bigram not in bigram_to_pos_3_4:
            bigram_to_pos_3_4[bigram] = curr_pos
            curr_pos += 1

# Feature vector is an array of binary values for if we've seen a word or not. 
def featurize_3_4(examples):
    featurized_examples = np.zeros((len(examples), len(bigram_to_pos_3_4)))

    for i in range(len(examples)):
        tokens = examples[i]
        
        for j in range(len(tokens)):
            token_one = '' 
            token_two = '' 

            if j == 0:
                token_one = 'START_TOKEN'
                token_two = tokens[j]
            elif j == (len(tokens) - 1):
                token_one = tokens[j]
                token_two = 'END_TOKEN'      
            else:
                token_one = tokens[j]
                token_two = tokens[j + 1]

            bigram = token_one + '///' + token_two
            
            if bigram in bigram_to_pos_3_4:
                feature_idx = bigram_to_pos_3_4[bigram]
                featurized_examples[i][feature_idx] = 1
    return featurized_examples

test_tokens_3_4 = tokenize_3_4(X_test)

train_features_3_4 = featurize_3_4(train_tokens_3_4)
test_features_3_4 = featurize_3_4(test_tokens_3_4)

y_pred_3_4 = classify_mvb(train_features_3_4, test_features_3_4, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_4)

0.7855946398659966

In [44]:
# Section 3.5

# Get a set of stop words. 
stopwords_set = set(stopwords.words('english'))
englishwords_set = set([word.lower() for word in words.words()])

# We tokenize, remove stop words & words that contain non-alphabetic characters. We then lemmatize all tokens. 
def tokenize_3_5(texts):
    tokenized_texts = []
    lemmatizer = WordNetLemmatizer()
    
    for text in texts:
        tokens = word_tokenize(re.sub(r'http\S+', '', text.lower()))
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokenized_texts.append([lemmatized_token for lemmatized_token in lemmatized_tokens if lemmatized_token not in stopwords_set and lemmatized_token in englishwords_set])
    
    return tokenized_texts

# Now we build a dictionary that maps a word to the position in the feature vector. 
train_tokens_3_5 = tokenize_3_5(X_train)

print(train_tokens_3_5)
word_to_pos_3_5 = {}
curr_pos = 0

for tokens in train_tokens_3_5:
    for token in tokens:
        if token not in word_to_pos_3_5:
            word_to_pos_3_5[token] = curr_pos
            curr_pos += 1

print(word_to_pos_3_5)
print(len(word_to_pos_3_5))
# Feature vector is an array of binary values for if we've seen a word or not. 
def featurize_3_5(examples):
    featurized_examples = np.zeros((len(examples), len(word_to_pos_3_5)))

    for i in range(len(examples)):
        tokens = examples[i]
        for j in range(len(tokens)):
            if tokens[j] in word_to_pos_3_5:
                feature_idx = word_to_pos_3_5[tokens[j]]
                featurized_examples[i][feature_idx] = 1
    return featurized_examples

test_tokens_3_5 = tokenize_3_5(X_test)

train_features_3_5 = featurize_3_5(train_tokens_3_3)
test_features_3_5 = featurize_3_5(test_tokens_3_3)

y_pred_3_5 = classify_mvb(train_features_3_5, test_features_3_5, y_train)

# Get the results. 
accuracy_score(y_test, y_pred_3_5)

6539


0.7374371859296482