In [1]:
# load the small file
# play around with the big file to see if you get an improvement in performance!

import gensim, logging
# the model is organized like this: word = embeddings
small_model = gensim.models.KeyedVectors.load_word2vec_format('../small-embeddings.txt', binary=False)


In [2]:
# homeworks!

import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string
def text_embedding(text):
    
    #it depends if the words have been lowercased or not
    text = text.lower()
    
    text = nltk.word_tokenize(text)
        
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    for word in text:
            try:
                embed_word = small_model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue

    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    return avg


In [3]:
sentence = "Barack Obama was president of the USA"

embed_sentence = text_embedding(sentence)
print (embed_sentence)

[-0.26903926208615303, 0.7144050151109695, 0.6478350050747395, 0.3210650235414505, 0.07041849289089441, -0.021145001519471407, -0.7754549980163574, -0.2608862593770027, -0.2335975021123886, -0.44401000440120697, -0.7981760036200285, -0.2174225002527237, -0.2587737590074539, 0.020651994738727808, 0.16106024757027626, 0.11651949770748615, -0.40395849477499723, -0.11410426755901426, 0.19817999750375748, -0.18172174505889416, -0.09501974750310183, 0.13451825641095638, 0.24662524834275246, -0.6156899929046631, 0.17309998720884323, -2.1071474701166153, 0.201797503978014, 0.0012424960732460022, -0.7112424969673157, -0.06578348483890295, 1.8628499507904053, 0.3978800028562546, -1.0788562297821045, -0.5919300131499767, -0.5450749918818474, -1.0065224766731262, -0.4167025089263916, 0.18753925105556846, -0.6404455121737556, -0.8506049998104572, -0.3560800105333328, 0.3043750002980232, -0.3289024978876114, -0.748709999024868, 0.187304999679327, 0.5087025091052055, -0.7147175222635269, 0.2065749987

In [4]:
# YELP product reviews dataset

import codecs

sentiment_dataset = codecs.open("yelp_review_polarity_csv/test.csv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[1])

"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back.  They seem overpriced, too."


In [5]:
# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn

from afinn import Afinn

afinn = Afinn()

print (afinn.score("This is bad fake news"))

print (afinn.score("An exam on the 20th of December? Oh, that's great!"))

print (afinn.score("That movie is horrible and beautiful at the same time"))


-6.0
3.0
0.0


In [6]:
# we need an NLP pipeline for Sentiment Analysis

import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string
def nlp_pipeline(text):
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token.lower(),"v")if "V" in pos else wordnet_lemmatizer.lemmatize(token.lower()) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = " ".join([token for token in text if token not in stop_word_list])

    return text


In [7]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:1000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
    text = nlp_pipeline(text)
    
    corpus.append(text)
    labels.append(label)

In [8]:
pred = []

for review in corpus:
    score = afinn.score(review)
    
    if score < 0.0:
        pred.append("-1")
    else:
        pred.append("1")

In [9]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

print (precision_recall_fscore_support(labels, pred, average="macro"))

(0.70287869412769965, 0.61031096300805843, 0.55213903743315518, None)


Homework! Change the text processing pipeline (e.g., remove the POS tagger, keep stopwords, etc) and see if you can improve the performance of the analysis

In [10]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
#for line in sentiment_dataset[:10000]: <-- by adding more training data performance will improve (i hope!)
# however, it'll use lots of memory ;-)
for line in sentiment_dataset[:20000]:
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    text = text_embedding(text)
    if len(text)>0:
        corpus.append(text)
        labels.append(label)
print ("ready!")

ready!


In [11]:
import numpy as np

X = np.array(corpus)
y = np.array(labels)

In [12]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.64175919097218714, 0.64182366734722751, 0.64088071654080614, None)




(0.65045851638640606, 0.65057561301016797, 0.65049716090272214, None)
(0.66465719131066781, 0.66486688633285485, 0.66468795078173581, None)
(0.62578580812861118, 0.62599405688410925, 0.6256495883595381, None)
(0.65175340336855281, 0.65199114597020358, 0.65171405052679721, None)
(0.66429393184042951, 0.66454799975742374, 0.66426991932308121, None)
(0.64304457953394123, 0.64283835394693578, 0.64290477907911137, None)
(0.64990841652165954, 0.64919788533406253, 0.64930166290308222, None)
(0.66486592270302747, 0.66509433962264142, 0.66488315331232584, None)
(0.68337613751263904, 0.68367514356029524, 0.68315981001776027, None)
 
0.653794879175
