# Exp7b: Voting #

In [10]:
from glob import glob

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import Normalizer

from glove_transformer import GloveVectorizer

# Load the tab-separated training data; the first column becomes the index.
# The path is handed to pandas directly (instead of an already-open file
# object) so that pandas opens the file, decodes it as UTF-8 and closes it
# again -- the previous open(...) handle was never closed.
data = pd.read_csv('semeval2016-task6-trainingdata-utf-8.txt',
                   sep='\t',
                   encoding='utf8',
                   index_col=0)

# Each distinct value of the Target column, plus the extra label 'All'.
targets = list(data.Target.unique()) + ['All']

# Scorer computing the macro-averaged F1 restricted to the AGAINST and FAVOR
# labels.
macro_f_scorer = make_scorer(fbeta_score,
                             beta=1.0,
                             labels=['AGAINST', 'FAVOR'],
                             average='macro')


# Pickled GloVe vector files. glob() returns matches in arbitrary order, so
# sort them to make the order of the experiments reproducible.
glove_fnames = sorted(glob('glove_vecs/*.pkl'))
# Identifier of each file: its base name up to the first underscore.
glove_ids = [fname.split('/')[-1].split('_')[0] for fname in glove_fnames]

In [13]:

    
# Cross-validated evaluation of a weighted hard-voting ensemble, repeated for
# every GloVe vector file and, within each, for every entry of `targets`.

# The two Naive Bayes pipelines do not depend on the GloVe vectors, so they
# are built once up front. They only serve as unfitted templates:
# cross_val_predict fits clones of the estimator it is given.

# Multinomial Naive Bayes over case-sensitive character trigrams that occur
# in at least 5 documents.
char_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                              lowercase=False,
                                              min_df=5,
                                              ngram_range=(3, 3),
                                              analyzer='char')),
                     ('clf', MultinomialNB())])

# Multinomial Naive Bayes over case-sensitive word bigrams.
word_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                              lowercase=False,
                                              ngram_range=(2, 2))),
                     ('clf', MultinomialNB())])

for fname, glove_id in zip(glove_fnames, glove_ids):
    print(80 * '-')
    print('GLOVE VECTORS: ' + glove_id)
    print(80 * '-')

    # NOTE(review): unpickling can execute arbitrary code -- only load vector
    # files from a trusted source.
    glove_vecs = pd.read_pickle(fname)

    # Linear SVM with balanced class weights on top of the GloVe features.
    glove_clf = Pipeline([('vect', GloveVectorizer(glove_vecs)),
                          ('clf', LinearSVC(C=0.1,
                                            class_weight='balanced'))])

    # Hard (majority) voting; the GloVe pipeline's vote counts double.
    vot_clf = VotingClassifier(estimators=[('char', char_clf),
                                           ('word', word_clf),
                                           ('glove', glove_clf)],
                               voting='hard',
                               weights=[1, 1, 2])

    for target in targets:
        print(80 * "=")
        print(target)
        print(80 * "=")

        # 'All' is not a value of the Target column: it selects every row.
        target_data = data[data.Target == target] if target != 'All' else data
        true_stances = target_data.Stance

        # Fixed seed so the shuffled stratified folds are reproducible.
        cv = StratifiedKFold(true_stances, n_folds=5, shuffle=True, random_state=1)

        pred_stances = cross_val_predict(vot_clf, target_data.Tweet, true_stances, cv=cv)
        print(classification_report(true_stances, pred_stances, digits=4))

        # Same metric as macro_f_scorer: macro-averaged F1 over AGAINST and
        # FAVOR only. `beta` is passed by keyword, matching the make_scorer
        # call and scikit-learn releases where it is keyword-only.
        macro_f = fbeta_score(true_stances, pred_stances, beta=1.0,
                              labels=['AGAINST', 'FAVOR'], average='macro')
        print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
                macro_f))


--------------------------------------------------------------------------------
GLOVE VECTORS: glove.42B.300d
--------------------------------------------------------------------------------
Atheism
             precision    recall  f1-score   support

    AGAINST     0.7310    0.8849    0.8006       304
      FAVOR     0.5325    0.4457    0.4852        92
       NONE     0.6912    0.4017    0.5081       117

avg / total     0.6863    0.6959    0.6773       513

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.6429

Climate Change is a Real Concern
             precision    recall  f1-score   support

    AGAINST     0.3846    0.3333    0.3571        15
      FAVOR     0.7034    0.8726    0.7789       212
       NONE     0.7983    0.5655    0.6620       168

avg / total     0.7317    0.7215    0.7132       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.5680

Feminist Movement
             precision    recall  f1-score   support

    AGAINST     0.6290    0.7805   