In [None]:
# homeworks! 

import gensim

# Load pre-trained word vectors stored in the textual (binary=False) word2vec format.
# `small_model[word]` is used by text_embedding to look up the vector of a token.
small_model = gensim.models.KeyedVectors.load_word2vec_format('../small-embeddings.txt', binary=False)


In [None]:
import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# WordNet lemmatizer shared by nlp_pipeline.
wordnet_lemmatizer = WordNetLemmatizer()

# Punctuation characters and English stop words filtered out by both
# text_embedding and nlp_pipeline.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded - confirm.
exclude = set(string.punctuation)
stop_word_list = stopwords.words('english')

def text_embedding(text):
    """Return the average word embedding of ``text`` as a list of floats.

    The input string is lower-cased, tokenized with NLTK and stripped of
    punctuation, non-alphabetic tokens and English stop words.  Tokens that
    are missing from ``small_model`` are ignored.  When no token has an
    embedding the result is an empty list.
    """
    # lower-casing only helps if the embedding vocabulary is lower-cased too
    tokens = nltk.word_tokenize(text.lower())

    # keep alphabetic, non-punctuation tokens that are not stop words
    kept = [
        tok for tok in tokens
        if tok not in exclude and tok.isalpha() and tok not in stop_word_list
    ]

    vectors = []
    for tok in kept:
        try:
            vectors.append(small_model[tok])
        except KeyError:
            # out-of-vocabulary word: nothing to add
            pass

    # column-wise mean over the collected word vectors = the doc-embedding
    return [float(sum(column)) / len(column) for column in zip(*vectors)]


def nlp_pipeline(text):
    """Clean ``text`` and return it as one space-separated string.

    Steps: NLTK word tokenization, POS tagging, WordNet lemmatization of the
    lower-cased tokens (as verbs when the tag contains "V"), removal of
    punctuation / non-alphabetic tokens, and removal of English stop words.
    """
    # sentence splitting (nltk.sent_tokenize) is deliberately skipped here
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, pos in tagged:
        lowered = token.lower()
        if "V" in pos:
            # tag contains "V": lemmatize as a verb
            lemmas.append(wordnet_lemmatizer.lemmatize(lowered, "v"))
        else:
            lemmas.append(wordnet_lemmatizer.lemmatize(lowered))

    # drop punctuation, numbers and stop words - be careful with the stop-word step
    cleaned = [
        lemma for lemma in lemmas
        if lemma not in exclude and lemma.isalpha() and lemma not in stop_word_list
    ]

    # the output is text
    return " ".join(cleaned)

In [None]:
# YELP product reviews dataset

import codecs

# Read the whole Yelp polarity test split and keep one string per "\n"-separated line.
# The `with` block closes the file handle as soon as the content has been read.
with codecs.open("yelp_review_polarity_csv/test.csv", "r", "utf-8") as yelp_file:
    sentiment_dataset = yelp_file.read().strip().split("\n")

# show one raw line to check the format
print(sentiment_dataset[1])

In [None]:
from sklearn import cross_validation
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

corpus = []
labels = []

# the dataset uses 1 for negative and 2 for positive; we change that in -1 and 1
label_map = {"1": "-1", "2": "1"}

# be super careful here
for line in sentiment_dataset[:5000]:
    # split ONLY on the first comma: the review text itself usually contains
    # commas, and splitting on all of them would keep just the part of the
    # review that precedes its first comma
    raw_label, _sep, raw_text = line.partition(",")

    # we clean a bit the label (strip the quotes, then map it)
    raw_label = raw_label.replace('"', '')
    label = label_map.get(raw_label, raw_label)

    # we clean a bit the text, removing the quotation marks
    text = raw_text.replace('"', '')

    # we embed the text
    text = text_embedding(text)
    # if we have a doc-embedding, then we save doc-embedding and label
    if len(text) > 0:
        corpus.append(text)
        labels.append(label)
print("ready!")

# we use np arrays as they are more efficient
X = np.array(corpus)
y = np.array(labels)

In [None]:
# homework 1 - knn classifier

from sklearn.neighbors import KNeighborsClassifier

final_f1 = []
# we set that we do 10 fold cross validation; random_state fixes the shuffle so
# that re-running the cell gives the same folds (and therefore the same scores)
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=42)
# for each of the 10 rounds
for train, test in kf_total:
    # we define training and test embeddings and labels
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    # we train the classifier (using 50 neighbors, but you can change that)
    # we train on the training set, using embeddings and labels
    classifier = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)

    # then we test it on the test set, we provide the embeddings and we make the classifier predict the labels
    y_pred = classifier.predict(X_test)

    # then we compare the prediction with the true test-labels using precision, recall and f1
    # (ignore the last None column); the scores are computed once and reused
    scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print(scores)
    # index 2 of the returned tuple is the f1 score
    final_f1.append(scores[2])

print(" ")
# average macro-f1 over the folds
print(sum(final_f1) / len(final_f1))

In [None]:
# homework 2 - nearest centroid

from sklearn.neighbors.nearest_centroid import NearestCentroid

final_f1 = []

# 10 fold cross validation; random_state fixes the shuffle so that re-running
# the cell gives the same folds (and therefore the same scores)
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=42)
for train, test in kf_total:
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    # fit on the training fold, predict the labels of the test fold
    classifier = NearestCentroid().fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # precision, recall, f1 (and a None column to ignore), computed once and reused
    scores = precision_recall_fscore_support(y_test, y_pred, average="macro")
    print(scores)
    # index 2 of the returned tuple is the f1 score
    final_f1.append(scores[2])
print(" ")
# average macro-f1 over the folds
print(sum(final_f1) / len(final_f1))

In [None]:
# let's classify the articles from rt.com in topics!
# we can compare the performances of different classifiers

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# deprecation warnings raised by the libraries included.
warnings.filterwarnings("ignore")

from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

# read the tab-separated dataset; `with` closes the file once it has been read
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

# one example row split into its fields (handy for inspecting the columns)
article = dataset[4].split("\t")

corpus = []
labels = []

# we skip the first line (presumably the header) and load the lines up to
# index 5000 of our dataset (you can try to load it all at home)
for line in dataset[1:5000]:
    # split each row only once and reuse the fields
    fields = line.split("\t")
    # the topic, like "usa" "uk", etc is the label that we want to predict
    label = fields[2]
    text = fields[3]
    # as usual, we use embeddings
    text = text_embedding(text)

    # keep only the documents for which we obtained a doc-embedding
    if len(text) > 0:
        corpus.append(text)
        labels.append(label)
print("ready!")

# again, we use np arrays as they are more efficient
X = np.array(corpus)
y = np.array(labels)

# we set here 4 different types of classifier
kNN = KNeighborsClassifier(n_neighbors=5)
NearCentroid = NearestCentroid()
naiveBayes = GaussianNB()
SVM = svm.SVC(kernel="linear", C=1)

# (name to print, classifier) pairs, evaluated in this order on every fold.
# SVM stays last: the confusion-matrix cells further down read the `classifier`,
# `y_test` and `y_pred` left behind by the final iteration.
classifiers = [
    ("kNN", kNN),
    ("NearCentroid", NearCentroid),
    ("NaiveBayes", naiveBayes),
    ("SVM", SVM),
]

# reset the score list of the previous experiments (nothing is appended to it in this cell)
final_f1 = []
# then in the usual 10 fold cross validation setting; random_state fixes the
# shuffle so that re-running the cell gives the same folds
kf_total = cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=42)
for train, test in kf_total:

    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    # we test them all, one after the other, on the same fold
    for name, model in classifiers:
        classifier = model.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(name, precision_recall_fscore_support(y_test, y_pred, average="macro"))

    print(" ")

Homework 1: By default, multi-class SVM is one-vs-one. Learn the difference between one-vs-one and one-vs-rest SVMs, then implement a one-vs-rest SVM classifier (hint: it's not super obvious, but google around a bit).

Homework 2: Run the same experiments, but use tf-idf features instead of word embeddings (be careful with the number of docs that you use; start with something like 100-200 examples).


In [None]:
# this is a function that I copy-pasted from here: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

# these are visualizations to study the type of error that the classifier is making
# we use here the last y_test and y_pred in memory, so the last svm run

%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# this function draws a confusion matrix; it is only to show you a way of doing some error analysis
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, classes=None):
    """Draw ``cm`` as a colour-coded image on the current matplotlib figure.

    cm      -- confusion matrix (rows are shown as true labels, columns as predictions)
    title   -- title of the plot
    cmap    -- matplotlib colormap used for the cells
    classes -- tick labels, in the row/column order of ``cm``; when omitted,
               the ``classes_`` of the global ``classifier`` are used
    """
    if classes is None:
        # default: labels of the classifier currently in memory
        classes = classifier.classes_
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Compute confusion matrix (using the predictions of the SVM from before).
# `labels` pins the row/column order to classifier.classes_, which is what
# plot_confusion_matrix uses for the tick labels; without it the matrix only
# covers the labels that occur in y_test / y_pred and the ticks can be misaligned.
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes_)
np.set_printoptions(precision=2)
plt.figure()

plot_confusion_matrix(cm)


In [None]:
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class). Rows without any true sample are left at 0 instead of
# turning into NaN through a division by zero.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = np.divide(cm.astype('float'), row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

#plt.show()

In [None]:
# let's try to cluster articles

# i'm re-loading everything here, because I want to use the titles of the articles to interpret the clusters
# read the tab-separated dataset; `with` closes the file once it has been read
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

# one example row split into its fields (handy for inspecting the columns)
article = dataset[4].split("\t")

corpus = []
titles = []

# you can run with all data at home
for line in dataset[1:1000]:
    # split each row only once and reuse the fields
    fields = line.split("\t")
    # to better understand which clusters are created, let's keep the titles of the articles
    title = fields[1]
    text = fields[3]
    text = text_embedding(text)

    # keep only the documents for which we obtained a doc-embedding
    if len(text) > 0:
        corpus.append(text)
        titles.append(title)
print("ready!")

In [None]:
from sklearn.cluster import KMeans

# usual thing, np arrays
X = np.array(corpus)
y = np.array(titles) 

# we define kmeans, with 10 clusters (you can change this number and see how the results change)
# then we train it using only the documents
kmeans = KMeans(n_clusters=10).fit(X)


In [None]:
# these are the labels we obtain: the cluster index assigned to each document
# (the cells below pair kmeans.labels_[k] with titles[k])
kmeans.labels_

In [None]:
# to see which docs are in which clusters, we need to loop over all labels

# one pass per cluster; the count is read from the fitted model so this cell
# stays correct when n_clusters is changed
for i in range(kmeans.n_clusters):
    # print the title of the document if the doc is in this cluster
    print("this is cluster number", i)
    # then you loop over all titles together with their cluster label
    for title, label in zip(titles, kmeans.labels_):
        # does it belong to this cluster?
        if i == label:
            # if yes, then print it out!
            print(title)
    print(" ")

In [None]:
# let's count which are the most popular words in the titles of each cluster
from collections import Counter

# one pass per cluster; the count is read from the fitted model so this cell
# stays correct when n_clusters is changed
for i in range(kmeans.n_clusters):

    # we create a list where we put the words from the titles
    title_words = []

    print("this is cluster number", i)
    for raw_title, label in zip(titles, kmeans.labels_):
        # only titles of this cluster go through the (slow) cleaning pipeline
        if i == label:
            # split() without argument returns no words for a title that is
            # empty after cleaning, so no empty string ends up being counted
            title_words.extend(nlp_pipeline(raw_title).split())

    # then we count and print the 10 most common
    most_common = Counter(title_words).most_common(10)
    print(most_common)
    print(" ")