# Exp2: vectorizer variants #

Experiment with NB on Climate Change data only.

This was originally inspired by Wang & Manning (2012), "Baselines and bigrams: Simple, good sentiment and topic classification", ACL 2012. They report that simple Multinomial NB with binary word bigrams is competitive with more complicated approaches like SVMs for classification of short text snippets.

Results below show that word bigrams indeed work better than unigrams, but only without binarization!

However, character n-grams outperform words, in particular character trigrams without binarization and with a minimum document frequency of 5.

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
# Load the tab-separated training file, using its first column as the index.
# The file is opened in a `with` block so the handle is closed once parsing
# is done instead of being left open for the lifetime of the kernel.
with open('semeval2016-task6-trainingdata.txt') as training_file:
    data = pd.read_csv(training_file, sep='\t', index_col=0)

# Keep only the rows whose Target is the climate change topic.
target_data = data[data.Target == 'Climate Change is a Real Concern']

In [3]:
# Five shuffled folds stratified on the Stance labels; the fixed random_state
# keeps the fold assignment the same across runs. This splitter is reused for
# every configuration evaluated below.
cv = StratifiedKFold(target_data.Stance,
                     n_folds=5,
                     shuffle=True,
                     random_state=1)

In [4]:
# Pre-allocated results table: one row per evaluated vectorizer configuration.
# Rows that are never filled keep an empty `analyzer` value, which is how they
# are filtered out again after the grid search.
#
# dtype notes:
#   * 'S8' is a fixed-width 8-byte string (long enough for 'char_wb' and for
#     str(ngram_range) such as '(1, 2)').
#   * '?' is NumPy's boolean code ('b' would be a signed byte).
#   * 'f8' is float64; the macro-averaged F-score is a fraction in [0, 1] and
#     must not be stored in an integer column.
results = pd.DataFrame(np.zeros(10000,
                                dtype=[('analyzer', 'S8'),
                                       ('ngram_range', 'S8'),
                                       ('lowercase', '?'),
                                       ('binary', '?'),
                                       ('min_df', 'i'),
                                       ('macro_f', 'f8')]))

In [5]:
# Grid search over CountVectorizer settings: for every combination, fit a
# CountVectorizer + MultinomialNB pipeline under cross-validation, print the
# per-class report, and store the macro-averaged F-score of the AGAINST and
# FAVOR classes in the next free row of `results`.
i = 0  # index of the next row of `results` to fill

for analyzer in ('word', 'char', 'char_wb'):
    # The word analyzer is searched over a smaller grid than the two
    # character-based analyzers.
    is_word = (analyzer == 'word')
    ngram_ranges = ([(1, 1), (2, 2), (1, 2)] if is_word
                    else [(2, 2), (3, 3), (2, 3), (2, 4), (2, 5)])
    min_dfs = (1, 2) if is_word else (1, 2, 3, 5, 10)

    # Enumerate the grid with ngram_range varying slowest and min_df fastest.
    grid = ((span, lower, binarize, mdf)
            for span in ngram_ranges
            for lower in (True, False)
            for binarize in (True, False)
            for mdf in min_dfs)

    for ngram_range, lowercase, binary, min_df in grid:
        vectorizer = CountVectorizer(decode_error='ignore',
                                     binary=binary,
                                     lowercase=lowercase,
                                     min_df=min_df,
                                     ngram_range=ngram_range,
                                     analyzer=analyzer)
        pipeline = Pipeline([('vect', vectorizer),
                             ('clf', MultinomialNB())])
        print(pipeline)

        # Predictions come from cross_val_predict using the shared `cv`
        # splitter, so every configuration is scored on the same folds.
        pred_stances = cross_val_predict(pipeline,
                                         target_data.Tweet,
                                         target_data.Stance,
                                         cv=cv)
        print(classification_report(target_data.Stance, pred_stances, digits=4))

        # Macro-average restricted to AGAINST and FAVOR; the NONE class is
        # left out of the average by the `labels` argument.
        macro_f = fbeta_score(target_data.Stance, pred_stances, 1.0,
                              labels=['AGAINST', 'FAVOR'],
                              average='macro')
        summary = 'macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'
        print(summary.format(macro_f))

        results.iloc[i] = (analyzer, str(ngram_range), lowercase,
                           binary, min_df, macro_f)
        i += 1

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
             precision    recall  f1-score   support

    AGAINST     1.0000    0.2000    0.3333        15
      FAVOR     0.6779    0.8538    0.7557       212
       NONE     0.7360    0.5476    0.6280       168

avg / total     0.7148    0.6987    0.6854       395

macro-average of F-score(FAVOR) and F-score(AGAINST): 0.5445

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df

In [6]:
# Drop the pre-allocated rows that were never filled (their analyzer is still
# the empty string).
results = results[results.analyzer != '']

# Let pandas print every remaining row instead of truncating the table.
pd.set_option('display.max_rows', len(results))

# Rank configurations from best to worst macro F-score. The sorted frame is
# reassigned rather than sorted in place: an in-place sort on a filtered
# selection mutates a potential copy and can trigger SettingWithCopyWarning.
results = results.sort_values(by='macro_f', ascending=False)
print(results)

    analyzer ngram_range lowercase binary  min_df   macro_f
62      char      (3, 3)     False  False       5  0.603127
201  char_wb      (2, 4)     False  False       3  0.598788
221  char_wb      (2, 5)     False  False       3  0.595847
162  char_wb      (3, 3)     False  False       5  0.595571
101     char      (2, 4)     False  False       3  0.595314
80      char      (2, 3)     False  False       2  0.593824
180  char_wb      (2, 3)     False  False       2  0.593824
121     char      (2, 5)     False  False       3  0.591157
72      char      (2, 3)      True  False       5  0.589450
52      char      (3, 3)      True  False       5  0.583220
92      char      (2, 4)      True  False       5  0.582883
102     char      (2, 4)     False  False       5  0.581609
112     char      (2, 5)      True  False       5  0.581395
181  char_wb      (2, 3)     False  False       3  0.580827
122     char      (2, 5)     False  False       5  0.580195
81      char      (2, 3)     False  Fals

In [7]:
# Show only the word-analyzer configurations, in the current row order of
# `results`.
word_results = results.loc[results.analyzer == 'word']
print(word_results)

   analyzer ngram_range lowercase binary  min_df   macro_f
14     word      (2, 2)     False  False       1  0.575761
2      word      (1, 1)      True  False       1  0.550000
22     word      (1, 2)     False  False       1  0.545765
0      word      (1, 1)      True   True       1  0.544537
20     word      (1, 2)     False   True       1  0.543750
18     word      (1, 2)      True  False       1  0.542424
16     word      (1, 2)      True   True       1  0.539136
4      word      (1, 1)     False   True       1  0.536415
6      word      (1, 1)     False  False       1  0.534868
12     word      (2, 2)     False   True       1  0.531189
8      word      (2, 2)      True   True       1  0.524374
10     word      (2, 2)      True  False       1  0.522966
3      word      (1, 1)      True  False       2  0.434491
1      word      (1, 1)      True   True       2  0.432065
17     word      (1, 2)      True   True       2  0.431628
19     word      (1, 2)      True  False       2  0.4289