In [3]:
import gensim, logging
# the model is organized like this: word = embeddings
model = gensim.models.KeyedVectors.load_word2vec_format('resources/small-embeddings.txt', binary=False)


In [34]:
import nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string
def text_embedding(text):
    
    #it depends if the words have been lowercased or not
    text = text.lower()
    
    text = nltk.word_tokenize(text)
        
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    for word in text:
            try:
                embed_word = small_model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue

    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    return avg


In [35]:
sentence = "Barack Obama was president of the USA"

embed_sentence = text_embedding(sentence)
print (embed_sentence)

[-0.26903926208615303, 0.7144050151109695, 0.6478350050747395, 0.3210650235414505, 0.07041849289089441, -0.021145001519471407, -0.7754549980163574, -0.2608862593770027, -0.2335975021123886, -0.44401000440120697, -0.7981760036200285, -0.2174225002527237, -0.2587737590074539, 0.020651994738727808, 0.16106024757027626, 0.11651949770748615, -0.40395849477499723, -0.11410426755901426, 0.19817999750375748, -0.18172174505889416, -0.09501974750310183, 0.13451825641095638, 0.24662524834275246, -0.6156899929046631, 0.17309998720884323, -2.1071474701166153, 0.201797503978014, 0.0012424960732460022, -0.7112424969673157, -0.06578348483890295, 1.8628499507904053, 0.3978800028562546, -1.0788562297821045, -0.5919300131499767, -0.5450749918818474, -1.0065224766731262, -0.4167025089263916, 0.18753925105556846, -0.6404455121737556, -0.8506049998104572, -0.3560800105333328, 0.3043750002980232, -0.3289024978876114, -0.748709999024868, 0.187304999679327, 0.5087025091052055, -0.7147175222635269, 0.2065749987

In [36]:
# YELP product reviews dataset

import codecs

sentiment_dataset = codecs.open("datasets/yelp-test.csv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[1])
print (" ")
print (sentiment_dataset[2])

"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back.  They seem overpriced, too."
 
"2","Friendly staff, same starbucks fair you get anywhere else.  Sometimes the lines can get long."


In [37]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:1000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
    corpus.append(text)
    labels.append(label)

In [38]:
# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn
#!pip install afinn

from afinn import Afinn

afinn = Afinn()

print (afinn.score("This is bad fake news"))

print (afinn.score("An exam in the mid of December? Oh, that's great!"))

print (afinn.score("That movie is horrible and beautiful at the same time"))


-6.0
3.0
0.0


In [46]:
pred = []

for review in corpus:
    score = afinn.score(review)
    
    if score < 0.0:
        pred.append("-1")
    else:
        pred.append("1")

In [47]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

print (precision_recall_fscore_support(labels, pred, average="macro"))

(0.7065650739683889, 0.6226429452570531, 0.5722449661809613, None)


Homework: Instead of taking the entire text, you can process it (e.g., remove the POS tagger, keep stopwords, etc) and see if you can improve the performance of the analysis

In [53]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:10000]: #<-- by adding more training data performance will improve (i hope!)
# however, it'll use lots of memory ;-)
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    doc_emb = text_embedding(text)
    
    if len(doc_emb)>0:
        corpus.append(doc_emb)
        labels.append(label)
print ("ready!")

ready!


In [54]:
import numpy as np

X = np.array(corpus)
y = np.array(labels)

In [55]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.6720529412238131, 0.6729801351296678, 0.6715878607412213, None)
(0.6545087265675501, 0.655017267376818, 0.6525417918573935, None)
(0.649571903557524, 0.6493384892261297, 0.6455733973392903, None)
(0.6566430626960261, 0.6574333601436383, 0.6565523747632627, None)
(0.6638046116504854, 0.6644873138511784, 0.6621883790862353, None)
(0.6361441742826437, 0.6368808241325241, 0.6356771212636145, None)
(0.623965900124458, 0.6238108004907099, 0.6238781582284656, None)
(0.6502592349938932, 0.6509257679527489, 0.6503011685156095, None)
(0.6402366432614981, 0.6406809817446197, 0.6381291824265348, None)
(0.6645062331152557, 0.6642456148902809, 0.6602929292929294, None)
 
0.6496722363514558


In [58]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import svm

SVM = svm.SVC(kernel = "linear", C=1) 

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = SVM.fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.671067221067221, 0.6697799085649553, 0.670148981083056, None)
(0.7010244758855726, 0.6972493798336494, 0.6979189111633735, None)
(0.6934955827776228, 0.6882995281871687, 0.6888746414090257, None)
(0.6806154608269348, 0.6718053084403715, 0.6717363672937087, None)
(0.6881176599993231, 0.6806303671386905, 0.6809388149435847, None)
(0.7136651878484019, 0.710507608439559, 0.7112334657706709, None)
(0.6650283301484516, 0.6606709075694428, 0.6609811165845649, None)
(0.6657761226252159, 0.6622003135993241, 0.6625912687314119, None)
(0.6847393267651889, 0.6828076888704737, 0.6832968790806999, None)
(0.6924376299376299, 0.6864981679624981, 0.6870374481941977, None)
 
0.6814757894254294
