# Exp7: Voting #

In [3]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import Normalizer

from glove_transformer import GloveVectorizer

# Load the training data (tab-separated, first column used as the index).
# The file is opened in a ``with`` block so the handle is closed as soon as
# parsing finishes instead of being left to the garbage collector.
with open('semeval2016-task6-trainingdata.txt') as train_file:
    data = pd.read_csv(train_file, sep='\t', index_col=0)

# Restrict the experiment to tweets about a single target.
data = data[data.Target == 'Climate Change is a Real Concern']
true_stances = data.Stance

# Shuffled, stratified 5-fold split with a fixed seed; the same ``cv`` object
# is reused for every run below.
cv = StratifiedKFold(true_stances, n_folds=5, shuffle=True, random_state=1)

# Evaluate a soft-voting ensemble once per GloVe embedding dimensionality.
# Each run combines three pipelines over the raw tweets:
#   * character trigram counts  -> multinomial Naive Bayes
#   * word bigram counts        -> multinomial Naive Bayes
#   * GloveVectorizer features  -> SVC with probability estimates
# and reports cross-validated predictions against ``true_stances``.
for dim in 25, 50, 100, 200:
    print(80 * '=')
    print('DIMENSIONS: {}'.format(dim))

    # Pickled vectors for this dimensionality; the object is handed to
    # GloveVectorizer as-is (its expected structure is defined by that
    # project-local class, not visible here).
    glove_fname = 'semeval2016-task6-trainingdata_climate_glove.twitter.27B.{}d.pkl'
    glove_vecs = pd.read_pickle(glove_fname.format(dim))

    # probability=True is required so the SVC can take part in soft voting.
    glove_clf = Pipeline([('vect', GloveVectorizer(glove_vecs)),
                          ('clf', SVC(C=1, gamma=0.01, probability=True))])

    char_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                                  lowercase=False,
                                                  min_df=5,
                                                  ngram_range=(3, 3),
                                                  analyzer='char')),
                         ('clf', MultinomialNB())])

    word_clf = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                                  lowercase=False,
                                                  ngram_range=(2, 2))),
                         ('clf', MultinomialNB())])

    # Soft voting: the two n-gram models are weighted twice as heavily as
    # the GloVe model.
    vot_clf = VotingClassifier(estimators=[('char', char_clf),
                                           ('word', word_clf),
                                           ('glove', glove_clf)],
                               voting='soft',
                               weights=[2, 2, 1])

    pred_stances = cross_val_predict(vot_clf, data.Tweet, true_stances, cv=cv)
    print(classification_report(true_stances, pred_stances, digits=4))

    # Macro-average of F1 over the AGAINST and FAVOR labels only (NONE is
    # left out of the average).  ``beta`` is passed by keyword so the call
    # does not depend on positional argument order.
    macro_f = fbeta_score(true_stances, pred_stances, beta=1.0,
                          labels=['AGAINST', 'FAVOR'], average='macro')
    print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
            macro_f))


DIMENSIONS: 25
             precision    recall  f1-score   support

    AGAINST     0.8333    0.3333    0.4762        15
      FAVOR     0.7273    0.7925    0.7585       212
       NONE     0.7152    0.6726    0.6933       168

avg / total     0.7262    0.7241    0.7200       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.6173

DIMENSIONS: 50
             precision    recall  f1-score   support

    AGAINST     0.8333    0.3333    0.4762        15
      FAVOR     0.7296    0.8019    0.7640       212
       NONE     0.7244    0.6726    0.6975       168

avg / total     0.7313    0.7291    0.7248       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.6201

DIMENSIONS: 100
             precision    recall  f1-score   support

    AGAINST     0.8333    0.3333    0.4762        15
      FAVOR     0.7265    0.8019    0.7623       212
       NONE     0.7226    0.6667    0.6935       168

avg / total     0.7289    0.7266    0.7222       395

macro-average of F-score(F