In [22]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from gensim.models.word2vec import Word2Vec

In [23]:
def get_corpus_dfs(n):
    if n == 1:
        s = 'corpus1'
    elif n == 2:
        s = 'split_corpus2'
    elif n == 3:
        s = 'split_corpus3'

    train = pd.read_csv('data/{}_train.labels'.format(s),
                        delim_whitespace=True,
                        names=['path', 'clf'])
    
    test = pd.read_csv('data/{}_test.labels'.format(s),
                       delim_whitespace=True,
                       names=['path', 'clf'])
    
    train.loc[:, 'path'] = train.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    test.loc[:, 'path'] = test.loc[:, 'path'].map(lambda s: 'data' + s[1:])
    
    return train, test

In [68]:
def rocchio_performance(X_train, y_train, X_test, y_test):
    '''
    Given a train and test split, measure the overall accuracy,
    precision, recall, F-1 score and support of the Rocchio classifier.
    '''
    rocchio_tfidf = NearestCentroid().fit(X_train, y_train)
    
    predictions = rocchio_tfidf.predict(X_test)
    
    acc =  accuracy_score(y_test, predictions)
    prfs = np.vstack(precision_recall_fscore_support(predictions, y_test))
    
    print('Overall accuracy: {:f}'.format(acc))
    print()
    print(pd.DataFrame(data=prfs,
                       index=['Precision', 'Recall', 'F-1', 'Support'],
                       columns=rocchio_tfidf.classes_))
    
    return acc, prfs

## Rocchio-tfidf

In [76]:
for i in range(1, 4):
    train, test = get_corpus_dfs(i)

    vec = TfidfVectorizer(input='filename',
                          strip_accents='unicode',
                          stop_words='english',
                          max_df=0.90,
                          min_df=2,
                          norm='l2')

    tfidf_train = vec.fit_transform(train.loc[:, 'path'])
    tfidf_test = vec.transform(test.loc[:, 'path'])
    clf_train = train.loc[:, 'clf']
    clf_test = test.loc[:, 'clf']

    print('Corpus {}:'.format(i))
    acc, prfs = rocchio_performance(tfidf_train, clf_train, tfidf_test, clf_test)
    print('')
    print('')

Corpus 1:
Overall accuracy: 0.844244

                 Cri        Dis        Oth         Pol         Str
Precision   0.740000   0.977528   0.520000    0.791667    0.911111
Recall      0.948718   0.906250   0.812500    0.883721    0.754601
F-1         0.831461   0.940541   0.634146    0.835165    0.825503
Support    39.000000  96.000000  16.000000  129.000000  163.000000


Corpus 2:
Overall accuracy: 0.802013

                   I           O
Precision   0.626374    0.879227
Recall      0.695122    0.842593
F-1         0.658960    0.860520
Support    82.000000  216.000000


Corpus 3:
Overall accuracy: 0.886792

                 Ent        Fin        Sci        Spo        USN         Wor
Precision   0.750000   0.857143   0.973684   0.964286   0.913580    0.846154
Recall      0.900000   0.923077   0.698113   0.964286   0.850575    0.980198
F-1         0.818182   0.888889   0.813187   0.964286   0.880952    0.908257
Support    10.000000  39.000000  53.000000  28.000000  87.000000  101.0000

## Rocchio-glove

## Rocchio-word2vec

In [2]:
w2v = Word2Vec.load('embeddings/w2v.corpus1.300d')