# Rocchio with tfidfs and word embeddings

In [16]:
import os
import csv
import re
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from gensim.models.word2vec import Word2Vec

In [3]:
def get_corpus_dfs(n):
    if n == 1:
        s = 'corpus1'
    elif n == 2:
        s = 'split_corpus2'
    elif n == 3:
        s = 'split_corpus3'

    train = pd.read_csv('data/{}_train.labels'.format(s),
                        delim_whitespace=True,
                        names=['path', 'clf'])
    
    test = pd.read_csv('data/{}_test.labels'.format(s),
                       delim_whitespace=True,
                       names=['path', 'clf'])
    
    train.loc[:, 'path'] = train.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    test.loc[:, 'path'] = test.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    
    return train, test

In [4]:
def rocchio_performance(X_train, y_train, X_test, y_test):
    '''
    Given a train and test split, measure the overall accuracy,
    precision, recall, F-1 score and support of the Rocchio classifier.
    '''
    rocchio_tfidf = NearestCentroid().fit(X_train, y_train)
    
    predictions = rocchio_tfidf.predict(X_test)
    
    acc =  accuracy_score(y_test, predictions)
    prfs = np.vstack(precision_recall_fscore_support(predictions, y_test))
    
    print('Overall accuracy: {:f}'.format(acc))
    print()
    print(pd.DataFrame(data=prfs,
                       index=['Precision', 'Recall', 'F-1', 'Support'],
                       columns=rocchio_tfidf.classes_))
    
    return acc, prfs

In [34]:
def get_embeddeding_matrix(corpus_df, func, dim=300):
    X = np.zeros([len(corpus_df), dim])
    
    for j in range(len(corpus_df)):
        with open(corpus_df.loc[j, 'path']) as f:
            words = f.read().lower().split()
            #re.findall(r"\w+|[^\w\s]", f.read().lower(), re.UNICODE)
        X[j] = func(glove.loc[words])
    
    return X

## Rocchio-tfidf

In [4]:
for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    vec = TfidfVectorizer(input='filename',
                          strip_accents='unicode',
                          stop_words='english',
                          max_df=0.90,
                          min_df=2,
                          norm='l2')

    tfidf_train = vec.fit_transform(train.loc[:, 'path'])
    tfidf_test = vec.transform(test.loc[:, 'path'])
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(tfidf_train, clf_train, tfidf_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.844244

                 Cri        Dis        Oth         Pol         Str
Precision   0.740000   0.977528   0.520000    0.791667    0.911111
Recall      0.948718   0.906250   0.812500    0.883721    0.754601
F-1         0.831461   0.940541   0.634146    0.835165    0.825503
Support    39.000000  96.000000  16.000000  129.000000  163.000000


Corpus 2:
Overall accuracy: 0.802013

                   I           O
Precision   0.626374    0.879227
Recall      0.695122    0.842593
F-1         0.658960    0.860520
Support    82.000000  216.000000


Corpus 3:
Overall accuracy: 0.886792

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.750000   0.857143   0.973684   0.964286   0.913580    0.846154
Recall      0.900000   0.923077   0.698113   0.964286   0.850575    0.980198
F-1         0.818182   0.888889   0.813187   0.964286   0.880952    0.908257
Support    10.000000  39.000000  53.000000  28.000000  87.000000  101.0000

## Rocchio-glove

In [5]:
glove = pd.read_table('embeddings/glove.6B.300d.txt',
                      delimiter=' ',
                      index_col=0,
                      header=None,
                      quoting=csv.QUOTE_NONE)

In [35]:
# Mean

for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    X_train = get_embeddeding_matrix(train, np.mean)
    X_test = get_embeddeding_matrix(test, np.mean)
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(X_train, clf_train, X_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.835214

                 Cri        Dis        Oth         Pol         Str
Precision   0.860000   0.910112   0.600000    0.840278    0.814815
Recall      0.826923   0.870968   0.555556    0.870504    0.833333
F-1         0.843137   0.890110   0.576923    0.855124    0.823970
Support    52.000000  93.000000  27.000000  139.000000  132.000000


Corpus 2:
Overall accuracy: 0.775168

                   I           O
Precision   0.670330    0.821256
Recall      0.622449    0.850000
F-1         0.645503    0.835381
Support    98.000000  200.000000


Corpus 3:
Overall accuracy: 0.814465

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.833333   0.809524   0.868421   0.750000   0.777778    0.837607
Recall      0.312500   0.871795   0.785714   0.954545   0.768293    0.970297
F-1         0.454545   0.839506   0.825000   0.840000   0.773006    0.899083
Support    32.000000  39.000000  42.000000  22.000000  82.000000  101.0000

In [36]:
# Max

for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    
    X_train = get_embeddeding_matrix(train, lambda x: np.amax(x, axis=0))
    X_test = get_embeddeding_matrix(test, lambda x: np.amax(x, axis=0))
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(X_train, clf_train, X_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.749436

                 Cri        Dis        Oth         Pol         Str
Precision   0.740000   0.842697   0.480000    0.784722    0.703704
Recall      0.506849   0.862069   0.461538    0.882812    0.736434
F-1         0.601626   0.852273   0.470588    0.830882    0.719697
Support    73.000000  87.000000  26.000000  128.000000  129.000000


Corpus 2:
Overall accuracy: 0.788591

                   I           O
Precision   0.692308    0.830918
Recall      0.642857    0.860000
F-1         0.666667    0.845209
Support    98.000000  200.000000


Corpus 3:
Overall accuracy: 0.745283

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.916667   0.833333   0.684211   0.821429   0.691358    0.735043
Recall      0.440000   0.625000   0.764706   1.000000   0.708861    0.851485
F-1         0.594595   0.714286   0.722222   0.901961   0.700000    0.788991
Support    25.000000  56.000000  34.000000  23.000000  79.000000  101.0000

In [37]:
# Min

for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    X_train = get_embeddeding_matrix(train, lambda x: np.amin(x, axis=0))
    X_test = get_embeddeding_matrix(test, lambda x: np.amin(x, axis=0))
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(X_train, clf_train, X_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.702032

                 Cri        Dis        Oth         Pol         Str
Precision   0.700000   0.842697   0.440000    0.680556    0.681481
Recall      0.416667   0.903614   0.423077    0.823529    0.702290
F-1         0.522388   0.872093   0.431373    0.745247    0.691729
Support    84.000000  83.000000  26.000000  119.000000  131.000000


Corpus 2:
Overall accuracy: 0.768456

                    I           O
Precision    0.714286    0.792271
Recall       0.601852    0.863158
F-1          0.653266    0.826196
Support    108.000000  190.000000


Corpus 3:
Overall accuracy: 0.820755

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.916667   0.785714   0.868421   0.892857   0.765432    0.829060
Recall      0.343750   0.868421   0.825000   1.000000   0.784810    0.932692
F-1         0.500000   0.825000   0.846154   0.943396   0.775000    0.877828
Support    32.000000  38.000000  40.000000  25.000000  79.000000  104

In [38]:
# Concat max and min

for i in range(1, 4):
    train, test = get_corpus_dfs(i)


    X_train = get_embeddeding_matrix(train,
                                     lambda x: np.hstack([np.amax(x, axis=0),
                                                          np.amin(x, axis=0)]),
                                     dim=600)
    
    X_test = get_embeddeding_matrix(test,
                                    lambda x: np.hstack([np.amax(x, axis=0),
                                                         np.amin(x, axis=0)]),
                                    dim=600)
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(X_train, clf_train, X_test, clf_test)
    print('')
    print('')

KeyboardInterrupt: 

## Rocchio-word2vec

In [5]:
w2v = Word2Vec.load('embeddings/w2v.corpus1.300d')

In [15]:
w2v.wv['king']

array([ 1.9415584 , -1.3826284 , -1.0156057 ,  0.2602323 ,  0.56473917,
        3.6947465 , -0.24956742, -1.0787795 , -1.0018904 , -1.0129349 ,
        0.8157601 , -0.46081585, -0.194969  , -1.20733   , -0.46452352,
       -1.3305407 ,  0.08218218,  3.240386  ,  0.95941305,  0.0625281 ,
       -0.46261966,  1.5702417 , -0.33920467, -1.202168  ,  0.5328616 ,
        0.71855175, -2.6587431 ,  0.7789982 ,  0.2857616 , -1.273185  ,
       -0.27209488, -0.47511098, -2.0400639 ,  0.7335952 , -2.326173  ,
        0.407348  , -1.0390999 ,  0.59787387,  1.2568825 , -0.20323618,
        1.3961715 , -1.51667   , -0.93408954, -0.52933574, -0.74006504,
        0.42957413,  0.697001  ,  0.15387633,  0.31007656,  0.49997726,
        0.40808576,  0.3003306 , -1.1185716 , -1.4720756 , -1.3439126 ,
       -0.18884575, -0.52496725,  0.24829341, -0.5797517 , -1.0566502 ,
        0.27602664, -1.0743768 ,  0.5447892 , -0.85218406, -0.5409668 ,
        0.8108928 ,  0.47089043, -1.0315297 ,  0.12063687, -0.35

## Rocchio-fastText