In [None]:
import gensim, logging
# Load the word vectors from a text-format (binary=False) word2vec file.
# Afterwards `model[word]` gives the embedding stored for `word`.
EMBEDDINGS_PATH = '../resources/small-embeddings.txt'
model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDINGS_PATH, binary=False)


In [None]:
import nltk, string
from nltk.corpus import stopwords

# Tokens equal to one of these punctuation characters are dropped by text_embedding.
exclude = {symbol for symbol in string.punctuation}
# NLTK's English stop-word list; these tokens are dropped by text_embedding as well.
stop_word_list = stopwords.words('english')

# input should be a string
def text_embedding(text):
    """Return the average embedding of the content words in `text`.

    The text is lowercased, tokenized with NLTK, and stripped of punctuation,
    non-alphabetic tokens and English stop words. Each remaining token is
    looked up in the global `model`; tokens the model does not know are
    ignored.

    Parameters
    ----------
    text : str
        The raw text to embed.

    Returns
    -------
    list of float
        The per-dimension mean of the vectors that were found, or an empty
        list when no token of `text` has a vector in `model`.
    """
    # lowercase first: it depends if the words in the model have been lowercased or not
    tokens = nltk.word_tokenize(text.lower())

    content_words = [
        tok for tok in tokens
        if tok not in exclude and tok.isalpha() and tok not in stop_word_list
    ]

    vectors = []
    for tok in content_words:
        try:
            vectors.append(model[tok])
        except KeyError:
            # the model has no vector for this token
            pass

    # zip(*vectors) walks the vectors dimension by dimension; with no vectors
    # it yields nothing, so the result is an empty list.
    means = []
    for column in zip(*vectors):
        means.append(float(sum(column)) / len(column))
    return means


In [None]:
# YELP product reviews dataset

import codecs

# Read the whole file as UTF-8 and keep one entry per line. The context
# manager closes the file handle as soon as the contents have been read
# (the previous one-liner left it open).
with codecs.open("../datasets/yelp-test.csv","r","utf-8") as dataset_file:
    sentiment_dataset = dataset_file.read().strip().split("\n")

# Print two sample lines to check what the raw data looks like.
print (sentiment_dataset[1])
print (" ")
print (sentiment_dataset[2])

In [None]:
# first, we define two lists: "corpus" - with the text embeddings, and "labels" - with the matching labels

corpus = []
labels = []

# be careful with this, the dataset is huge!
# Only the first MAX_REVIEWS lines are processed; set it to None to use every line.
MAX_REVIEWS = 10000

for line in sentiment_dataset[:MAX_REVIEWS]:
    # Assumes each line has the form "label","text". Split on the FIRST comma
    # only: review text normally contains commas itself, and splitting on every
    # comma would keep just the part of the review before its first comma.
    # partition() never raises; a line without a comma gives an empty text,
    # which produces an empty embedding and is skipped below.
    label_field, _, text_field = line.partition(",")

    text = text_field.replace('"','')
    # Raw label "1" becomes "-1" and raw label "2" becomes "1".
    # The order matters: "1" must be rewritten before "2" is turned into "1".
    label = label_field.replace('"','').replace("1","-1").replace("2","1")

    emb_text = text_embedding(text)
    # Keep only reviews for which an embedding could be computed.
    if len(emb_text) > 0:
        corpus.append(emb_text)
        labels.append(label)

print ("done!")

In [None]:
import numpy as np

# Turn the Python lists into NumPy arrays (X: embeddings, y: labels);
# we use np arrays as they are more efficient.
X, y = np.array(corpus), np.array(labels)

In [None]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

try:
    # Legacy module, removed in scikit-learn 0.20. It is not used in this cell;
    # the import is kept only so that any cell still calling
    # cross_validation.StratifiedKFold keeps working on old installations.
    from sklearn import cross_validation
except ImportError:
    pass

# Gaussian Naive Bayes evaluated with stratified 10-fold cross-validation.
final_f1 = []

# The folds are shuffled; a fixed random_state makes the split (and therefore
# the scores) the same on every run.
kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train, test in kf_total.split(X, y):
    # train / test are arrays of row indices for this fold
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    classifier = GaussianNB().fit(X_train , y_train)
    y_pred = classifier.predict(X_test)

    # (precision, recall, f1, support) macro-averaged over the classes;
    # computed once and reused for both the printout and the running list
    fold_scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print (fold_scores)
    final_f1.append(fold_scores[2])
print (" ")
# mean macro-F1 over the folds
print (sum(final_f1)/len(final_f1))

In [None]:
#here's the documentation: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn import svm
from sklearn.model_selection import StratifiedKFold

# Linear-kernel SVM evaluated with stratified 10-fold cross-validation.
SVM = svm.SVC(kernel = "linear", C=1)

final_f1 = []

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides the same splitter. The folds are shuffled, so random_state is fixed
# to make the split the same on every run.
kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for train, test in kf_total.split(X, y):
    # train / test are arrays of row indices for this fold
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    # fit() is called again on the same estimator object for every fold
    classifier = SVM.fit(X_train , y_train)
    y_pred = classifier.predict(X_test)

    # (precision, recall, f1, support) macro-averaged over the classes;
    # computed once and reused for both the printout and the running list
    fold_scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print (fold_scores)
    final_f1.append(fold_scores[2])
print (" ")
# mean macro-F1 over the folds
print (sum(final_f1)/len(final_f1))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

final_f1 = []
# we set that we do 10 fold cross validation
# (sklearn.cross_validation was removed in scikit-learn 0.20, so the splitter
# comes from model_selection; random_state is fixed because the folds are
# shuffled and should be the same on every run)
kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# for each of the 10 rounds
for train, test in kf_total.split(X, y):
    # we define training and test embeddings and labels
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    # we train the classifier (using 50 neighbors, but you can change that)
    # we train on the training set, using embeddings and labels
    classifier = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)

    # then we test it on the test set, we provide the embeddings and we make the classifier predict the labels
    y_pred = classifier.predict(X_test)

    # then we compare the prediction with the true test-labels using precision, recall and f1 (ignore the last None column)
    # the scores are computed once and reused for the printout and the running list
    fold_scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print (fold_scores)
    final_f1.append(fold_scores[2])

print (" ")
# mean macro-F1 over the folds
print (sum(final_f1)/len(final_f1))

In [None]:
# homework 2 - nearest centroid

from sklearn.model_selection import StratifiedKFold
# NearestCentroid is imported from the public sklearn.neighbors package; the
# private sklearn.neighbors.nearest_centroid module path no longer exists in
# recent scikit-learn releases.
from sklearn.neighbors import NearestCentroid

final_f1 = []

# Stratified 10-fold cross-validation. sklearn.cross_validation was removed in
# scikit-learn 0.20, so the splitter comes from model_selection; random_state
# is fixed because the folds are shuffled and should be the same on every run.
kf_total = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train, test in kf_total.split(X, y):
    # train / test are arrays of row indices for this fold
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    classifier = NearestCentroid().fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # (precision, recall, f1, support) macro-averaged over the classes;
    # computed once and reused for both the printout and the running list
    fold_scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print (fold_scores)
    final_f1.append(fold_scores[2])
print (" ")
# mean macro-F1 over the folds
print (sum(final_f1)/len(final_f1))