# Exp3 #

Experiments using tweet embeddings obtained by summing the GloVe vectors available for the words in each tweet.

This approach seems to perform barely better than random, and slightly better with higher dimensions (>50).

In [101]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MinMaxScaler, normalize


from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

In [102]:
# Grid search over C and gamma for an SVC trained on the summed GloVe features.
# The candidate values do not depend on the embedding size, so they are built
# once, ahead of the loop.
C_range = np.logspace(-2, 10, 13)      # 1e-2 ... 1e10, one value per decade
gamma_range = np.logspace(-9, 3, 13)   # 1e-9 ... 1e3, one value per decade
param_grid = {'gamma': gamma_range, 'C': C_range}

for dim in (25,):  # other embedding sizes tried: 50, 100, 200
    print(80 * '=')
    print('DIMENSIONS: {}'.format(dim))

    # One pickled frame per embedding size; the frame's index is used as the
    # stance labels and its columns as the features.
    data = pd.read_pickle('semeval2016-task6-trainingdata_climate_glove.twitter.27B.{}d.pkl'.format(dim))
    true_stances = data.index

    # Optional preprocessing, currently disabled:
    #   normalize(data, copy=False)
    #   data = scaler.fit_transform(data)
    # The scaler is still constructed, but nothing below uses it.
    scaler = MinMaxScaler(copy=False)

    # Five stratified random 80/20 splits with a fixed seed.
    cv = StratifiedShuffleSplit(true_stances, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    grid.fit(data, true_stances)

    print("The best parameters are %s with a score of %0.2f" % (
        grid.best_params_, grid.best_score_))

DIMENSIONS: 25
The best parameters are {'C': 100000.0, 'gamma': 9.9999999999999995e-08} with a score of 0.71


In [109]:
# Classifier under evaluation. Alternatives that were tried here:
#   LinearSVC(C=5), MultinomialNB(), KNeighborsClassifier()
# NOTE(review): C=10.0 with the default gamma is hand-picked; it is not the
# setting reported by the grid-search cell's output -- confirm this is intended.
clf = SVC(C=10.0)

# The folds are created lazily from the first data set loaded and then reused
# for every embedding size.
# NOTE(review): reusing the folds assumes every pickle lists the same tweets in
# the same order -- TODO confirm.
cv = None

for dim in 25, 50, 100, 200:
    print(80 * '=')
    print('DIMENSIONS: {}'.format(dim))

    # The frame's index is used as the stance labels, its columns as features.
    data = pd.read_pickle('semeval2016-task6-trainingdata_climate_glove.twitter.27B.{}d.pkl'.format(dim))
    true_stances = data.index

    # Optional preprocessing, currently disabled:
    #   normalize(data, copy=False)
    #   scaler = MinMaxScaler(copy=False)
    #   data = scaler.fit_transform(data)

    # Compare against the None sentinel explicitly instead of relying on the
    # truthiness of the fold object.
    if cv is None:
        cv = StratifiedKFold(true_stances, n_folds=5, shuffle=True, random_state=1)

    # Out-of-fold predictions for every tweet, then a per-class report.
    pred_stances = cross_val_predict(clf, data, true_stances, cv=cv)
    print(classification_report(true_stances, pred_stances, digits=4))

    # Macro-average restricted to the AGAINST and FAVOR labels; the third
    # positional argument (1.0) is beta.
    macro_f = fbeta_score(true_stances, pred_stances, 1.0,
                          labels=['AGAINST', 'FAVOR'], average='macro')
    print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(macro_f))

DIMENSIONS: 25
             precision    recall  f1-score   support

    AGAINST     1.0000    0.2000    0.3333        15
      FAVOR     0.5801    0.9057    0.7072       212
       NONE     0.6721    0.2440    0.3581       168

avg / total     0.6352    0.5975    0.5445       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.5203

DIMENSIONS: 50
             precision    recall  f1-score   support

    AGAINST     1.0000    0.2000    0.3333        15
      FAVOR     0.6221    0.9009    0.7360       212
       NONE     0.7412    0.3750    0.4980       168

avg / total     0.6871    0.6506    0.6195       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.5347

DIMENSIONS: 100
             precision    recall  f1-score   support

    AGAINST     1.0000    0.2000    0.3333        15
      FAVOR     0.6667    0.8491    0.7469       212
       NONE     0.7131    0.5179    0.6000       168

avg / total     0.6991    0.6835    0.6687       395

macro-average of F-score(F

In [51]:
# Baseline: score the same classifier on random features.
#
# This cell relies on `data`, `true_stances`, `clf` and `cv` already being
# defined by other cells. Only the shape of `data` is used here.
# NOTE(review): `data` is whichever frame was loaded last -- confirm that is
# the intended shape for the baseline.

# Draw the features from a seeded generator so the baseline numbers are
# reproducible from run to run (values are uniform in [0, 1)).
rnd_state = np.random.RandomState(42)
rnd_data = rnd_state.random_sample(data.shape)

pred_stances = cross_val_predict(clf, rnd_data, true_stances, cv=cv)
print(classification_report(true_stances, pred_stances, digits=4))

# Macro-average restricted to the AGAINST and FAVOR labels; the third
# positional argument (1.0) is beta.
macro_f = fbeta_score(true_stances, pred_stances, 1.0,
                      labels=['AGAINST', 'FAVOR'], average='macro')
print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(macro_f))

             precision    recall  f1-score   support

    AGAINST     0.0000    0.0000    0.0000        15
      FAVOR     0.5335    0.8255    0.6481       212
       NONE     0.4030    0.1607    0.2298       168

avg / total     0.4577    0.5114    0.4456       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.3241

