# Exp4 #

Combining ordinary word features (bigram counts) with GloVe Twitter word vectors

In [12]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the SemEval-2016 Task 6 training data (tab-separated, first column used
# as the index) and keep only the tweets for the climate-change target.
# The handle is opened in a `with` block so it is closed as soon as parsing is
# done, and the separator is passed by keyword because newer pandas releases no
# longer accept it positionally.
with open('semeval2016-task6-trainingdata.txt') as train_file:
    data = pd.read_csv(train_file, sep='\t', index_col=0)
target_data = data[data.Target == 'Climate Change is a Real Concern']

In [3]:
# Bag-of-bigrams counts over the tweets of the selected target. Case is kept
# (lowercase=False) and bytes that cannot be decoded are ignored.
vectorizer = CountVectorizer(decode_error='ignore',
                             lowercase=False,
                             ngram_range=(2,2))
bigram_vecs = vectorizer.fit_transform(target_data.Tweet)
# Convert the sparse count matrix to a dense numpy.ndarray so it can be
# concatenated with the GloVe vectors further down.
# toarray() always returns a 2-D array; the former
# np.squeeze(np.asarray(...todense())) would silently drop an axis of length
# one and break the axis=1 concatenation.
bigram_vecs = bigram_vecs.toarray()

In [4]:
# Stratified 5-fold split over the stance labels, shuffled with a fixed seed so
# the folds are reproducible between runs.
cv = StratifiedKFold(
    target_data.Stance,
    n_folds=5,
    shuffle=True,
    random_state=1,
)

In [18]:
# Vectorization is applied to the whole data set rather than inside the pipeline,
# because it is difficult to concatenate the GloVe vectors to the training and testing folds.
# However, this introduces features (vocab terms) in the training data whose value is always zero.
# This seems to harm the classifier.
# Therefore, we use VarianceThreshold to remove these features, which have zero variance.
pipeline = Pipeline([('vect', VarianceThreshold()),
                     # The cell previously read `SVC(C=10, gamma=)`, which is a syntax
                     # error. The gamma value behind the recorded results is not
                     # known, so scikit-learn's default is used here.
                     # TODO: restore the intended gamma once it is confirmed.
                     ('clf', SVC(C=10))
                     #('clf', MultinomialNB())
                    ])

In [19]:
# Cross-validate the bigram + GloVe feature combination once per embedding size.
# Every print call takes a single pre-formatted string, so the cell emits the
# same text under Python 2 and Python 3.
for dim in 25, 50, 100, 200:
    print(80 * '=')
    print('DIMENSION: {}'.format(dim))
    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    glove_vecs = pd.read_pickle('semeval2016-task6-trainingdata_climate_glove.twitter.27B.{}d.pkl'.format(dim))
    # Guard against misaligned rows before gluing the two feature sets together.
    assert (glove_vecs.index == target_data.Stance).all(), \
        'index of the {}d GloVe vectors does not match target_data.Stance'.format(dim)
    # NB can not deal with negative feature values, so rescale between (0,1) 
    #scaler = MinMaxScaler(copy=False)
    #glove_vecs = scaler.fit_transform(glove_vecs)
    # One row per tweet: bigram counts followed by the GloVe vector.
    bigram_glove_vecs = np.concatenate((bigram_vecs, glove_vecs), axis=1)
    
    pred_stances = cross_val_predict(pipeline, bigram_glove_vecs, target_data.Stance, cv=cv)
    print(classification_report(target_data.Stance, pred_stances, digits=4))

    # Macro-average F1 over AGAINST and FAVOR only (NONE is left out of the
    # average). beta is passed by keyword because newer scikit-learn releases
    # make it keyword-only.
    macro_f = fbeta_score(target_data.Stance, pred_stances, beta=1.0,
                          labels=['AGAINST', 'FAVOR'], average='macro')
    print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(macro_f))

DIMENSION: 25
             precision    recall  f1-score   support

    AGAINST     0.0000    0.0000    0.0000        15
      FAVOR     0.6856    0.7406    0.7120       212
       NONE     0.6446    0.6369    0.6407       168

avg / total     0.6421    0.6684    0.6547       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.3560

DIMENSION: 50
             precision    recall  f1-score   support

    AGAINST     0.0000    0.0000    0.0000        15
      FAVOR     0.7040    0.7406    0.7218       212
       NONE     0.6570    0.6726    0.6647       168

avg / total     0.6573    0.6835    0.6701       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.3609

DIMENSION: 100
             precision    recall  f1-score   support

    AGAINST     0.0000    0.0000    0.0000        15
      FAVOR     0.7593    0.7736    0.7664       212
       NONE     0.7039    0.7500    0.7262       168

avg / total     0.7069    0.7342    0.7202       395

macro-average of F-score(FAVO

In [None]:
# Baseline: bigram counts only, classified with multinomial Naive Bayes.
# The vectorizer sits inside the pipeline here, so it is fitted on each
# training fold separately during cross-validation.
# NOTE: this rebinds `pipeline`; re-running the GloVe loop cell afterwards would
# use this estimator instead of the SVC one.
pipeline = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                              lowercase=False,
                                              ngram_range=(2,2))),
                     ('clf', MultinomialNB())])
# Single-argument print calls emit the same text under Python 2 and Python 3.
print(pipeline)

pred_stances = cross_val_predict(pipeline, target_data.Tweet, target_data.Stance, cv=cv)
print(classification_report(target_data.Stance, pred_stances, digits=4))

# Macro-average F1 over AGAINST and FAVOR only (NONE is left out of the average).
# beta is passed by keyword because newer scikit-learn releases make it keyword-only.
macro_f = fbeta_score(target_data.Stance, pred_stances, beta=1.0,
                      labels=['AGAINST', 'FAVOR'], average='macro')
print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(macro_f))