In [56]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, \
                                    GridSearchCV
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn import tree
import matplotlib
import matplotlib.pyplot as plt

Loading and concatenating datasets.

In [59]:
%%time
# Every source is a tab-separated file whose first column is used as the index.
tsv_options = {'sep': '\t', 'index_col': 0}

institutoliberal = pd.read_csv('datasets/institutoliberal.tsv', **tsv_options)
mercadopopular = pd.read_csv('datasets/mercadopopular.tsv', **tsv_options)
psol = pd.read_csv('datasets/solidariedadesocialista.tsv', **tsv_options)
tijolaco = pd.read_csv('datasets/tijolaco.tsv', **tsv_options)
ocafezinho = pd.read_csv('datasets/ocafezinho.tsv', **tsv_options)
midiasemmascara = pd.read_csv('datasets/midiasemmascara.tsv', **tsv_options)

# Stack all sources into a single frame with a fresh, continuous index.
blog_frames = [institutoliberal, mercadopopular, psol,
               tijolaco, ocafezinho, midiasemmascara]
poldata = pd.concat(blog_frames, ignore_index=True)

# Keep only the rows whose `body` column is not null.
has_body = poldata['body'].notnull()
poldata = poldata[has_body]

CPU times: user 1.4 s, sys: 148 ms, total: 1.55 s
Wall time: 1.55 s


In [66]:
# token count
import nltk
import re

# Matches any Unicode letter or digit (\w minus the underscore); compiled once
# instead of on every call.
_HAS_ALNUM = re.compile(r'[^\W_]')


def getLen(text):
    """Count the tokens in ``text`` that contain at least one letter or digit.

    Parameters
    ----------
    text : str or iterable of str
        A raw document, which is split on whitespace, or an already
        tokenized sequence, which is used as is.

    Returns
    -------
    int
        Number of tokens that are not made up solely of punctuation.
    """
    # Iterating over a raw string yields single characters, so a document has
    # to be split into tokens first; otherwise characters would be counted.
    tokens = text.split() if isinstance(text, str) else text
    # Unicode-aware test, so accented one-letter words such as "é" count too.
    return sum(1 for token in tokens if _HAS_ALNUM.search(token))

# One count per document, in the row order of `poldata`.
n_tokens = list(map(getLen, poldata['body']))

We hold out 20% of our data as test data.

In [58]:
# Hold out 20% of the documents as a test set.  The fixed seed makes the
# split, and every score computed from it, identical on each run.
data_train, data_test, pol_train, pol_test = train_test_split(
    poldata['body'], poldata['pol'], test_size=0.2, random_state=42)

Creating bags of words:

In [4]:
%%time
# Tokens that are excluded from the vocabulary.
stopwords = [
    'cafezinho', 'http', 'which',
    'il', 'he', 'we', 'dp', 'institute', 'instituto',
]

# Word-level bag-of-words model with a vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=5000,
    stop_words=stopwords,
)

# The vocabulary is learned from the training documents only; the test set
# and the full corpus are then projected onto that same vocabulary.
train_features = vectorizer.fit_transform(data_train)
test_features = vectorizer.transform(data_test)
pol_features = vectorizer.transform(poldata['body'])

CPU times: user 29.4 s, sys: 184 ms, total: 29.6 s
Wall time: 29.6 s


We are using a random forest classifier with 100 estimators and out-of-bag (OOB) scoring enabled.

In [0]:
# Build the forest in this cell so that it does not rely on a definition that
# only appears further down: 100 trees, with out-of-bag scoring enabled.
arboles = RandomForestClassifier(n_estimators=100, oob_score=True)
arboles.fit(train_features, pol_train)

# Vocabulary terms, in the same order as the feature-matrix columns.
# `get_feature_names` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out`, so use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()


In [30]:
# Pair every vocabulary term with the importance the forest assigned to it.
feat_import = pd.DataFrame(data={
    "feature": vocab,
    "importance": arboles.feature_importances_,
})

# Show the 100 terms with the highest importance.
print(
    feat_import
    .sort_values(by='importance', ascending=False)
    .head(100)
)

           feature  importance
2784       liberal    0.018783
974      comunista    0.007665
4656      tradução    0.007602
3632          pois    0.007454
3240         notas    0.007328
2783      liberais    0.006950
3147         mídia    0.006864
2407         idéia    0.006599
4518           tal    0.006431
2787     liberdade    0.006206
2012          farc    0.006017
1503         dilma    0.005970
4408    socialismo    0.005794
1838   esquerdista    0.005684
3744    presidenta    0.005675
2501    indivíduos    0.005396
2500     indivíduo    0.004936
2261         golpe    0.004586
3478        países    0.004552
2734          lava    0.004476
1839  esquerdistas    0.004403
1644       eduardo    0.004242
4409    socialista    0.004146
2646          jato    0.004054
2819         livre    0.004049
2258         globo    0.004022
4581   terroristas    0.004006
2462      impostos    0.003956
1073   constantino    0.003950
4261     salgueiro    0.003900
...            ...         ...
2380    

In [6]:
# Cross-validate on raw text: the vectorizer and the forest are wrapped in a
# single pipeline, and that pipeline is what gets scored on the 5 folds.
arboles = RandomForestClassifier(oob_score=True, n_estimators=100)
cvmodel = make_pipeline(vectorizer, arboles)
cvscores = cross_val_score(cvmodel, data_train, pol_train, cv=5)

print(cvscores)
# Mean score with a spread of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)"
      % (cvscores.mean(), 2 * cvscores.std()))

[ 0.93531501  0.93773373  0.93175019  0.93231114  0.92668786]
Accuracy: 0.93 (+/- 0.01)


In [9]:
# Fit on the whole training split and predict the held-out documents.
arboles = arboles.fit(train_features, pol_train)
result_arb = arboles.predict(test_features)

# Fraction of held-out documents whose prediction matches the true label.
is_correct = result_arb == pol_test
print(np.sum(is_correct) / len(result_arb))

0.935377711294


We want to plot how the OOB error evolves with the number of estimators

In [0]:
%%time
# `OrderedDict` is not part of the notebook's import cell; without this import
# the `error_rate` line below fails with a NameError.
from collections import OrderedDict

# NOTE(review): `important_polfeatures` is created in a later cell (the
# SelectFromModel transform of `pol_features`), so that cell has to be run
# before this one.

# Three forests that differ only in `max_features`.  With `warm_start=True`
# each refit below adds trees to the existing ensemble instead of growing a
# new forest from scratch.
ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(warm_start=True, oob_score=True,
                               max_features="sqrt", n_jobs=-1)),
    ("RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(warm_start=True, max_features='log2',
                               oob_score=True, n_jobs=-1)),
    ("RandomForestClassifier, max_features=None",
        RandomForestClassifier(warm_start=True, max_features=None,
                               oob_score=True, n_jobs=-1))
]

# Mapping a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 175

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(important_polfeatures, poldata['pol'])

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot, one line per forest.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.savefig('oob_important', dpi=300)

A multinomial Naive Bayes classifier performs worse on this dataset.

In [76]:
%%time
# 5-fold cross-validation on raw text, with multinomial Naive Bayes as the
# final step of the pipeline.
nb = MultinomialNB()
cvmodel_nb = make_pipeline(vectorizer, nb)
cvscores_nb = cross_val_score(cvmodel_nb, data_train, pol_train, cv=5)

print(cvscores_nb)
# Mean score with a spread of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)"
      % (cvscores_nb.mean(), 2 * cvscores_nb.std()))

[ 0.88913816  0.8921092   0.89846672  0.8894914   0.8973256 ]
Accuracy: 0.89 (+/- 0.01)
CPU times: user 1min 5s, sys: 436 ms, total: 1min 5s
Wall time: 1min 5s


In [79]:
# Fit Naive Bayes on the whole training split and predict the held-out set.
nb = nb.fit(train_features, pol_train)
result_nb = nb.predict(test_features)

# Fraction of held-out documents whose prediction matches the true label.
nb_is_correct = result_nb == pol_test
print(np.sum(nb_is_correct) / len(result_nb))

0.891697830965


Let's try to make a grid search for the best parameters.

In [0]:
# Selector that keeps the features whose importance in the forest reaches the
# 0.001 threshold; `fit` returns the selector itself, so the calls are chained.
sfm = SelectFromModel(arboles, threshold=0.001).fit(train_features, pol_train)

# Column indices of the retained features, and how many of them there are.
sup = sfm.get_support(indices=True)
print(len(sup))

In [0]:
# Print the vocabulary term behind each selected feature index, one per line.
for feat_index in sup:
    print(vocab[feat_index])

In [50]:
# Names of the selected features, in the same order as the indices in `sup`.
imp_tokens = []
for feat_index in sup:
    imp_tokens.append(vocab[feat_index])

print(imp_tokens)

['2014', '2016', 'aborto', 'americano', 'artigo', 'autor', 'aécio', 'bem', 'blog', 'brasil', 'brasileira', 'bush', 'capitalismo', 'castro', 'chávez', 'cidadãos', 'comunismo', 'comunista', 'comunistas', 'constantino', 'consumidores', 'crianças', 'cristã', 'cristãos', 'cuba', 'cultural', 'cunha', 'dessa', 'desse', 'desses', 'dilma', 'economia', 'eduardo', 'esquerda', 'esquerdista', 'esquerdistas', 'estado', 'farc', 'fato', 'federal', 'feira', 'fidel', 'folha', 'foto', 'gay', 'george', 'globo', 'golpe', 'golpista', 'governo', 'graça', 'homens', 'ideologia', 'idéia', 'idéias', 'igreja', 'imagem', 'impeachment', 'impostos', 'imprensa', 'individual', 'indivíduo', 'indivíduos', 'intelectual', 'jato', 'juiz', 'lava', 'liberais', 'liberal', 'liberalismo', 'liberdade', 'livre', 'livro', 'marxismo', 'marxista', 'menos', 'michel', 'ministério', 'mises', 'modo', 'moral', 'moro', 'muitos', 'mídia', 'nação', 'notas', 'ocidental', 'olavo', 'org', 'outros', 'países', 'pessoas', 'pois', 'porque', 'presi

Subset with most important features:

In [52]:
# Reduce the full-corpus feature matrix to the columns kept by `sfm`.
important_polfeatures = sfm.transform(pol_features)

In [40]:
# Project both splits onto the features retained by the selector.
important_train, important_test = (
    sfm.transform(train_features),
    sfm.transform(test_features),
)

# Fresh 128-tree forest for the reduced feature set.  It is not fitted here;
# only the cross-validation below trains (copies of) it.
clf_important = RandomForestClassifier(n_jobs=-1, n_estimators=128)

# 5-fold cross-validation on the reduced training matrix.
imp_cvmodel = make_pipeline(clf_important)
imp_cvscores = cross_val_score(imp_cvmodel, important_train, pol_train, cv=5)

print(imp_cvscores)
# Mean score with a spread of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)"
      % (imp_cvscores.mean(), 2 * imp_cvscores.std()))

[ 0.93400636  0.93362004  0.93474196  0.9315632   0.9251917 ]
Accuracy: 0.93 (+/- 0.01)


In [43]:
# Fit the reduced-feature forest on the whole training split, then report its
# accuracy on the held-out documents.
clf_important.fit(important_train, pol_train)
y_important_pred = clf_important.predict(important_test)
print(accuracy_score(pol_test, y_important_pred))

# Held-out accuracy of the full-vocabulary forest, for comparison.
y_pred = arboles.predict(test_features)
print(accuracy_score(pol_test, y_pred))

0.935377711294
0.934480179506


In [12]:
# Fit the tf-idf transformer on the training counts, then re-weight them.
tf_transformer = TfidfTransformer()
tf_transformer.fit(train_features)
tfidf_features = tf_transformer.transform(train_features)
print(tfidf_features.shape)

# Replace the transformed matrix with its dense array form.
tfidf_features = tfidf_features.toarray()

(10088, 5000)


In [12]:
print(train_features.shape)

# Column sums of the dense training matrix: one total per vocabulary term.
train_features_array = train_features.toarray()
dist = np.sum(train_features_array, axis=0)

# `get_feature_names` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out`, so use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# (term, total) pairs.  In Python 3 `zip` is a one-shot iterator, so `counts`
# can be consumed only once.
counts = zip(vocab, dist)

In [13]:
# Grand total of the per-term sums in `dist`.
dist.sum()

2383641

In [0]:
# Another 100-tree forest with out-of-bag scoring, fitted on the training
# bag-of-words matrix.
forest = RandomForestClassifier(oob_score=True, n_estimators=100)
forest = forest.fit(train_features, pol_train)

# Re-vectorize the held-out documents and keep them as a dense array.
test_features = vectorizer.transform(data_test).toarray()

Features sorted by their mean impurity decrease.

Some words in this set are suspicious. I should try running the model with them added to the stopwords: [il, institute, blog, pois, desses, dessa, nesse]

In [51]:
import pydotplus
# Export the first tree of the reduced-feature forest as a PDF.
# `export_graphviz` is documented to take the fitted tree estimator, so the
# estimator is passed rather than its low-level `tree_` structure.
first_tree = clf_important.estimators_[0]
atree = first_tree.tree_  # kept under its old name in case it is used elsewhere
dot_data = tree.export_graphviz(
    first_tree,
    max_depth=10,
    out_file=None,  # return the DOT source as a string
    feature_names=imp_tokens,
    # NOTE(review): class names have to follow the ascending order of the
    # labels in `pol` -- confirm that the first class really is 'right'.
    class_names=['right', 'left'],
    # Must be the string 'all'; the bare name `all` is the Python builtin
    # function, which is not a valid option for this parameter.
    label='all',
    filled=True,
    proportion=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("poltree.pdf")

True