In [1]:
"""
for checking the homeworks we need a few things:

- import embeddings
- load our functions (text_embeddings and nlp_pipeline)
- load the aFinn dictionary
"""

'\nfor checking the homeworks we need a few things:\n\n- import embeddings\n- load our functions (text_embeddings and nlp_pipeline)\n- load the aFinn dictionary\n'

In [2]:
# load the small file

import gensim

small_model = gensim.models.KeyedVectors.load_word2vec_format('../small-embeddings.txt', binary=False)

#large model
#small_model = gensim.models.KeyedVectors.load_word2vec_format('/Users/federiconanni/WordEmbeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)


In [3]:
import codecs, nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string, you convert text in a doc-embedding
def text_embedding(text):
    
    #it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step
    text = text.lower()
    
    text = nltk.word_tokenize(text)
    
    # remove numbers
    text = [token for token in text if token not in exclude and token.isalpha()]

    # remove stopwords (not essential)
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    # you take all embeddings
    for word in text:
            try:
                embed_word = small_model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue
    
    # then you average them
    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    
    return avg


In [4]:
# YELP product reviews dataset


# we are using only the "small" test-set, you can also train on the large training set if you'd like
sentiment_dataset = codecs.open("yelp_review_polarity_csv/test.csv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[1])

"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back.  They seem overpriced, too."


In [5]:
# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn

from afinn import Afinn

afinn = Afinn()


In [6]:
# we need an NLP pipeline for Sentiment Analysis

from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()


# input should be a string
def nlp_pipeline(text):
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token.lower(),"v")if "V" in pos else wordnet_lemmatizer.lemmatize(token.lower()) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = " ".join([token for token in text if token not in stop_word_list])

    return text


In [7]:
# first, we define two lists, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge (and this is only a small part of it - you can also use "train")!

#for line in sentiment_dataset:
for line in sentiment_dataset[:1000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
    text = nlp_pipeline(text)
    
    corpus.append(text)
    labels.append(label)

In [8]:
pred = []

for review in corpus:
    score = afinn.score(review)
    
    if score < 0.0:
        pred.append("-1")
    else:
        pred.append("1")

In [9]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

print (precision_recall_fscore_support(labels, pred, average="macro"))

(0.70287869412769965, 0.61031096300805843, 0.55213903743315518, None)


Homework! Change the text processing pipeline (e.g., remove the POS tagger, keep stopwords, etc) and see if you can improve the performance of the analysis

In [10]:
# solution n.1: assign neutral to negative ;-)

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:1000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
    text = nlp_pipeline(text)
    
    corpus.append(text)
    labels.append(label)

In [11]:
pred = []

for review in corpus:
    score = afinn.score(review)
    # assign neutral to negative!
    if score < 0.1:
        pred.append("-1")
    else:
        pred.append("1")

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

print (precision_recall_fscore_support(labels, pred, average="macro"))

(0.67040052665802363, 0.66898300197055383, 0.66667967917168403, None)


In [12]:
# solution n.2: no text processing at all!

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:1000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
#    text = nlp_pipeline(text)
    
    corpus.append(text)
    labels.append(label)

In [13]:
pred = []

for review in corpus:
    score = afinn.score(review)
    # assign neutral to negative!
    if score < 0.1:
        pred.append("-1")
    else:
        pred.append("1")

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

print (precision_recall_fscore_support(labels, pred, average="macro"))

(0.66953967642526968, 0.66922731860491202, 0.66796679667966785, None)


In [14]:
# let's train a naive bayes classifier for doing the same

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:

#for line in sentiment_dataset[:10000]: <-- by adding more training data performance will improve (i hope!)
# however, it'll use lots of memory ;-)
for line in sentiment_dataset[:10000]:
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    text = text_embedding(text)
    if len(text)>0:
        corpus.append(text)
        labels.append(label)
print ("ready!")

ready!


In [15]:
# numpy array is a different type of list: https://stackoverflow.com/questions/993984/why-numpy-instead-of-python-lists
# they are super efficient for storing information in compat ways

import numpy as np

# X and y are standard abbreviations for corpus and labels
X = np.array(corpus)
y = np.array(labels)

In [16]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB

final_f1 = []

# cross validation 10 folds
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.65815837581438807, 0.6589108710604038, 0.65815120491161827, None)
(0.66081263744023233, 0.66154725424388339, 0.66082567463104336, None)




(0.64388007168574513, 0.64448659954277931, 0.64242610042597126, None)
(0.65209825542854372, 0.65285326638881447, 0.65101799263960447, None)
(0.65538120036874703, 0.6561070624842591, 0.65408652485709884, None)
(0.62538521268593339, 0.62594140729727754, 0.62395309544212518, None)
(0.64612912685418245, 0.64574447342123864, 0.64589451164455447, None)
(0.65566198278796128, 0.65606034755904363, 0.65323130540192442, None)
(0.65478076482488001, 0.65478076482488001, 0.65125628140703506, None)
(0.65245495276915122, 0.65314980460974759, 0.65108570539270239, None)
 
0.649192839675


In [18]:
# let's compare it with a tf-idf bag-of-word approach

corpus = []
labels = []

# be careful with this, the dataset is huge!
for line in sentiment_dataset[:100]:
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    text = nlp_pipeline(text)
    if len(text)>0:
        corpus.append(text)
        labels.append(label)
print ("ready!")

ready!


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(norm='l2')

tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).A

X = np.array(tfidf_matrix)
y = np.array(labels)

print ("tf-idf, done!")

tf-idf, done!


In [26]:
print (X[50][:200])

[ 0.          0.1225629   0.          0.1225629   0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.1225629
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.1225629
  0.          0.1225629   0.1225629   0.11246635  0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.1225629   0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.1225629   0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          

In [None]:
final_f1 = []

# cross validation 10 folds
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

In [None]:
# let's train an svm using word embeddings
from sklearn import svm

corpus = []
labels = []

# be super careful here 
for line in sentiment_dataset[:5000]:
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    text = text_embedding(text)
    if len(text)>0:
        corpus.append(text)
        labels.append(label)
print ("ready!")

X = np.array(corpus)
y = np.array(labels)

In [None]:
# the role of the parameter C: https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    classifier = svm.SVC(kernel = "linear", C=10).fit(X_train, y_train) 
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

In [None]:
# let's train an svm using tfidf
corpus = []
labels = []

# be super careful here - import no more than 5000 for tfidf
for line in sentiment_dataset[:5000]:
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    text = line.split(",")[1].replace('"','')
    
    text = nlp_pipeline(text)
    if len(text)>0:
        corpus.append(text)
        labels.append(label)

print ("ready!")

tfidf_vectorizer = TfidfVectorizer(norm='l2')

tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).A

X = np.array(tfidf_matrix)
y = np.array(labels)

print ("tf-idf, done!")

In [None]:
final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    classifier = svm.SVC(kernel = "linear", C=1).fit(X_train, y_train) 
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

Homework: read online what is a Nearest Centroid Classifier and implement one with word-embeddings and tf-idf features. Hint - to implement it you just need to change a couple of lines from the previous classifiers.

In addition, if you want you can also check what a K-nearest Neighbors Classifier is. To implement it, you only have 1 parameter to tune - which one?