In [1]:
## Load all libraries


import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier


In [16]:
positive_words = [line.strip() for line in open('data/positive-words.txt', encoding='latin-1').readlines() if line.strip() and not line.startswith(';')]
negative_words = [line.strip() for line in open('data/negative-words.txt', encoding='latin-1').readlines() if line.strip() and not line.startswith(';')]

# Please download the word embeddings from https://nlp.stanford.edu/projects/glove/
# This file here is a symlink only

def get_embeddings():
    with open('data/glove.840B.300d.txt') as fp:
        labels = []
        rows = []
        for i, line in enumerate(fp):
            items = line.rstrip().split(' ')
            if len(items) == 2:
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))
        arr = np.vstack(rows)
        print(arr.shape)
        return pd.DataFrame(arr, index=labels, dtype='f')


In [17]:
word_vectors = get_embeddings()

(2196017, 300)


In [None]:
word_vectors[:3]

In [8]:
negative_words[:10]

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted']

In [19]:
positive_vectors = word_vectors.loc[positive_vectors.index.intersection(positive_words)].dropna()
negative_vectors = word_vectors.loc[negative_vectors.index.intersection(negative_words)].dropna()

In [110]:
data = pd.concat([positive_vectors, negative_vectors])
targets = ([1 for word in positive_vectors.index] + [-1 for word in negative_vectors.index])
labels = data.index
train_data, test_data, train_target, test_target, train_labels, test_labels = train_test_split(data, targets, labels, test_size=0.1)

In [111]:
test_data[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
talents,0.079738,0.22543,0.48147,-0.21501,-0.24233,0.24257,0.25378,0.2954,-0.61193,2.4873,...,-0.51137,0.061727,-0.23091,0.26796,-0.066753,0.12725,0.28856,-0.14957,-0.23165,-0.09176
compliment,-0.48248,0.000423,-0.26602,-0.28565,-0.059128,0.20516,0.14219,-0.31958,-0.54584,1.2688,...,0.21869,0.21234,-0.30248,0.17149,0.34338,-0.001801,0.1733,-0.15705,-0.097721,0.26514
authoritative,-0.3639,-0.26022,0.070223,0.62199,0.4956,0.45425,0.37644,0.26591,0.40669,1.2102,...,-0.34278,-0.33624,-0.031502,0.21786,0.29948,-0.24484,0.43877,-0.37458,0.18775,-0.28367
providence,0.042244,-0.43301,0.16692,-0.59343,0.10938,0.11396,-0.03035,0.717,0.41835,1.349,...,0.17381,-0.027388,-0.38737,0.47336,-0.013823,-0.022718,0.66873,-0.11685,-0.29087,-0.21707
well-mannered,-0.53095,0.14591,-0.31922,-0.17975,0.3144,0.35456,0.048244,-0.33542,0.15916,0.084073,...,-0.40621,-0.35861,0.11315,0.56635,0.69027,0.44813,0.28226,-0.31458,-0.12128,0.069098
dismaying,0.26318,0.39946,0.91933,0.12684,-0.10245,0.019753,-0.16081,0.51162,-0.1285,-0.12876,...,-0.029655,0.15286,0.25337,0.086322,0.28528,0.090775,-0.10952,-0.015017,-0.50069,-0.36652
inefficiency,-1.1057,0.46117,0.83948,0.015746,-0.91422,0.02702,0.13238,-0.080266,-0.4888,0.94953,...,0.2316,0.17747,0.72629,-0.057088,-0.50281,0.44869,0.28128,0.56426,0.041583,0.049635
concession,0.34479,-0.48761,0.9994,-0.004551,0.33673,0.2588,-0.007839,-0.066173,0.037484,1.7626,...,-0.42023,0.49476,0.23687,-0.029079,0.43959,0.019844,0.31214,-0.16646,0.31959,0.21391
anti-social,-0.20302,0.43107,-0.49637,0.27553,-0.27723,-0.36506,0.3432,-0.12708,-0.10139,0.97583,...,-0.37885,0.23782,0.48876,0.64117,0.41027,0.69265,0.094197,0.36678,-0.074061,0.064191
splendid,0.23717,0.025011,-0.031719,-0.27442,-0.27611,0.35515,0.42757,0.12422,-0.52821,1.7074,...,-0.34159,0.006202,-0.37265,0.25376,-0.003798,0.33476,0.51325,0.14104,-0.027448,0.044597


In [127]:
model = SGDClassifier(max_iter=200, loss='log')
model.fit(train_data, train_target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=200, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [128]:
accuracy_score(model.predict(test_data), test_target)

0.9622356495468278

In [164]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")

def get_sentiment(input_str):
    embeddings = [word_vectors.loc[token.casefold()] for token in TOKEN_RE.findall(input_str)]
    predictions = model.predict_log_proba(embeddings)
    return (predictions[:, 1] - predictions[:, 0]).mean()


In [173]:
#Testing on trivial words
print(get_sentiment("good"))
print(get_sentiment("bad"))
print(get_sentiment("ugly"))
print(get_sentiment("The good, the bad and the ugly"))

8.233151308586075
-9.688738095325608
-16.99371687959816
-1.9686040217022105


In [174]:
print(get_sentiment("My name is Emily"))
print(get_sentiment("My name is Yvonne"))
print(get_sentiment("My name is Shaniqua"))


0.4445063989124153
-0.4437736356852131
-2.4018470012600437


In [175]:
print(get_sentiment("My name is James"))
print(get_sentiment("My name is Mario"))
print(get_sentiment("My name is DeShawn"))


0.2332782816924801
0.08885950109471236
-0.4140725296787715
