In [1]:
import gensim, logging
# the model is organized like this: word = embeddings
model = gensim.models.KeyedVectors.load_word2vec_format('../resources/small-embeddings.txt', binary=False)


In [2]:
import nltk, string
from nltk.corpus import stopwords

exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

# input should be a string
def text_embedding(text):
    
    #it depends if the words have been lowercased or not
    text = text.lower()
    
    text = nltk.word_tokenize(text)
        
    text = [token for token in text if token not in exclude and token.isalpha()]
    
    text = [token for token in text if token not in stop_word_list]

    article_embedd = []
    
    for word in text:
            try:
                embed_word = model[word]
                article_embedd.append(embed_word)
            except KeyError:
                continue

    avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]
    return avg


In [3]:
# YELP product reviews dataset

import codecs

sentiment_dataset = codecs.open("../datasets/yelp-test.csv","r","utf-8").read().strip().split("\n")

print (sentiment_dataset[1])
print (" ")
print (sentiment_dataset[2])

"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back.  They seem overpriced, too."
 
"2","Friendly staff, same starbucks fair you get anywhere else.  Sometimes the lines can get long."


In [4]:
# first, we define two folders, "corpus" - with the text and "labels", with the labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
#for line in sentiment_dataset:
for line in sentiment_dataset[:10000]:
    text = line.split(",")[1].replace('"','')
    label = line.split(",")[0].replace('"','').replace("1","-1").replace("2","1")
    
    emb_text = text_embedding(text)
    if len(emb_text) > 0:
        corpus.append(emb_text)
        labels.append(label)
    
print ("done!")

done!


In [5]:
import numpy as np

# we use np array as they are more efficient
X = np.array(corpus)
y = np.array(labels)

In [6]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.6646730237544434, 0.6653072783913906, 0.6628909551986475, None)
(0.659426601342769, 0.6602582810447979, 0.659115896597156, None)
(0.6537238507231404, 0.6544092611508342, 0.6537458717191031, None)
(0.663427326098559, 0.6641907756302452, 0.6621310242790268, None)
(0.6361441742826437, 0.6368808241325241, 0.6356771212636145, None)
(0.6792880115068848, 0.6802566477369665, 0.6789286866731203, None)
(0.6808125096062839, 0.6815910697311658, 0.6792673992673992, None)
(0.6106632562349481, 0.6112485477751509, 0.6103610281089712, None)
(0.627026229879113, 0.6277125122880565, 0.6265910431214033, None)
(0.6638889567703624, 0.6634575544127325, 0.659274456266459, None)
 
0.6527983482494901




In [None]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import svm

SVM = svm.SVC(kernel = "linear", C=1) 

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)

for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    final_C = 1
    classifier = SVM.fit(X_train , y_train)
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))

(0.6693838862559243, 0.6662863616134644, 0.666731890261302, None)
(0.7000496031746032, 0.6961671287513984, 0.696831525402954, None)
(0.6938606568455197, 0.6860499051510287, 0.6863966941468453, None)
(0.6612459175132527, 0.6596411481309967, 0.6600185022556637, None)
(0.6755101120647959, 0.6739359152469391, 0.6743555575040605, None)
(0.6949303901813519, 0.6901967713893424, 0.6908293262205158, None)
(0.6944317528006227, 0.6866931519981802, 0.6870943361986923, None)
(0.6938860971524288, 0.6871379593295799, 0.6876315580903916, None)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

final_f1 = []
# we set that we do 10 fold cross validation
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
# for each of the 10 round
for train, test in kf_total:
    # we define training and test embeddings and labels
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    # we train the classifier (using 50 neighbors, but you can change that)
    # we train on the training set, using embeddings and labels
    classifier = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train) 
    
    # then we test it on the test set, we provide the embeddings and we make the classifier predict the labels
    y_pred = classifier.predict(X_test)
    
    # then we compare the prediction with the true test-labels using precision, recall and f1 (ignore the last None column)
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
    
print (" ")
print (sum(final_f1)/len(final_f1))

In [None]:
# homework 2 - nearest centroid

from sklearn.neighbors.nearest_centroid import NearestCentroid

final_f1 = []

kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True)
for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    classifier = NearestCentroid().fit(X_train, y_train) 
    
    y_pred = classifier.predict(X_test)
    
    print (precision_recall_fscore_support(y_test, y_pred, average="macro"))
    f1_score = precision_recall_fscore_support(y_test, y_pred, average="macro")[2]
    final_f1.append(f1_score)
print (" ")
print (sum(final_f1)/len(final_f1))