# Rocchio with tfidfs and word embeddings

In [1]:
import os
import csv
import re
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from gensim.models.word2vec import Word2Vec

In [2]:
def get_corpus_dfs(n):
    if n == 1:
        s = 'corpus1'
    elif n == 2:
        s = 'split_corpus2'
    elif n == 3:
        s = 'split_corpus3'

    train = pd.read_csv('data/{}_train.labels'.format(s),
                        delim_whitespace=True,
                        names=['path', 'clf'])
    
    test = pd.read_csv('data/{}_test.labels'.format(s),
                       delim_whitespace=True,
                       names=['path', 'clf'])
    
    train.loc[:, 'path'] = train.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    test.loc[:, 'path'] = test.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    
    return train, test

In [3]:
def rocchio_performance(X_train, y_train, X_test, y_test):
    '''
    Given a train and test split, measure the overall accuracy,
    precision, recall, F-1 score and support of the Rocchio classifier.
    '''
    rocchio_tfidf = NearestCentroid().fit(X_train, y_train)
    
    predictions = rocchio_tfidf.predict(X_test)
    
    acc =  accuracy_score(y_test, predictions)
    prfs = np.vstack(precision_recall_fscore_support(predictions, y_test))
    
    print('Overall accuracy: {:f}'.format(acc))
    print()
    print(pd.DataFrame(data=prfs,
                       index=['Precision', 'Recall', 'F-1', 'Support'],
                       columns=rocchio_tfidf.classes_))
    
    return acc, prfs

## Rocchio-tfidf

In [4]:
for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    vec = TfidfVectorizer(input='filename',
                          strip_accents='unicode',
                          stop_words='english',
                          max_df=0.90,
                          min_df=2,
                          norm='l2')

    tfidf_train = vec.fit_transform(train.loc[:, 'path'])
    tfidf_test = vec.transform(test.loc[:, 'path'])
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(tfidf_train, clf_train, tfidf_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.844244

                 Cri        Dis        Oth         Pol         Str
Precision   0.740000   0.977528   0.520000    0.791667    0.911111
Recall      0.948718   0.906250   0.812500    0.883721    0.754601
F-1         0.831461   0.940541   0.634146    0.835165    0.825503
Support    39.000000  96.000000  16.000000  129.000000  163.000000


Corpus 2:
Overall accuracy: 0.802013

                   I           O
Precision   0.626374    0.879227
Recall      0.695122    0.842593
F-1         0.658960    0.860520
Support    82.000000  216.000000


Corpus 3:
Overall accuracy: 0.886792

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.750000   0.857143   0.973684   0.964286   0.913580    0.846154
Recall      0.900000   0.923077   0.698113   0.964286   0.850575    0.980198
F-1         0.818182   0.888889   0.813187   0.964286   0.880952    0.908257
Support    10.000000  39.000000  53.000000  28.000000  87.000000  101.0000

## Rocchio-glove

In [9]:
glove = pd.read_table('embeddings/glove.6B.300d.txt',
                      delimiter=' ',
                      index_col=0,
                      header=None,
                      quoting=csv.QUOTE_NONE)

In [13]:
for i in range(1, 4):
    train, test = get_corpus_dfs(i)

In [19]:
with open(train.loc[0, 'path']) as f:
    words = re.findall(r"\w+|[^\w\s]", f.read().lower(), re.UNICODE)

In [22]:
np.mean(glove.loc[words])

1     -0.116733
2      0.054272
3     -0.003800
4     -0.116176
5     -0.059533
6      0.021118
7     -0.020799
8      0.081189
9      0.015573
10    -1.426618
11     0.062071
12     0.033273
13    -0.066472
14     0.100328
15     0.101966
16     0.099439
17    -0.120892
18     0.024492
19    -0.023814
20    -0.069807
21    -0.027901
22     0.070001
23     0.161916
24     0.062927
25    -0.198864
26     0.030220
27     0.004897
28    -0.004325
29    -0.053925
30     0.006438
         ...   
271   -0.038225
272   -0.089785
273    0.054914
274    0.077272
275    0.021087
276    0.070060
277   -1.805465
278    0.007462
279    0.409813
280    0.105462
281   -0.152356
282   -0.052127
283   -0.011477
284   -0.031917
285   -0.011390
286    0.166856
287   -0.059605
288    0.145775
289    0.028906
290   -0.011449
291   -0.014843
292   -0.204676
293   -0.054925
294    0.019694
295    0.079888
296    0.219609
297    0.022031
298   -0.229434
299   -0.140070
300    0.055752
Length: 300, dtype: floa

## Rocchio-word2vec

In [5]:
w2v = Word2Vec.load('embeddings/w2v.corpus1.300d')

In [15]:
w2v.wv['king']

array([ 1.9415584 , -1.3826284 , -1.0156057 ,  0.2602323 ,  0.56473917,
        3.6947465 , -0.24956742, -1.0787795 , -1.0018904 , -1.0129349 ,
        0.8157601 , -0.46081585, -0.194969  , -1.20733   , -0.46452352,
       -1.3305407 ,  0.08218218,  3.240386  ,  0.95941305,  0.0625281 ,
       -0.46261966,  1.5702417 , -0.33920467, -1.202168  ,  0.5328616 ,
        0.71855175, -2.6587431 ,  0.7789982 ,  0.2857616 , -1.273185  ,
       -0.27209488, -0.47511098, -2.0400639 ,  0.7335952 , -2.326173  ,
        0.407348  , -1.0390999 ,  0.59787387,  1.2568825 , -0.20323618,
        1.3961715 , -1.51667   , -0.93408954, -0.52933574, -0.74006504,
        0.42957413,  0.697001  ,  0.15387633,  0.31007656,  0.49997726,
        0.40808576,  0.3003306 , -1.1185716 , -1.4720756 , -1.3439126 ,
       -0.18884575, -0.52496725,  0.24829341, -0.5797517 , -1.0566502 ,
        0.27602664, -1.0743768 ,  0.5447892 , -0.85218406, -0.5409668 ,
        0.8108928 ,  0.47089043, -1.0315297 ,  0.12063687, -0.35

## Rocchio-fastText