# Baselines

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import numpy as np
import pandas as pd
import helper
import sklearn
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
import w2v_d2v
import dataset_helper
import preprocessing

In [3]:
from notebook.services.config import ConfigManager
ConfigManager().update('notebook', {'limit_output': 1000});

### Fetch subset of newsgroup-20 dataset

In [4]:
if 0 == 1:
    REMOVE = ('headers', 'footers', 'quotes')
    CATEGORIES = ('talk.politics.misc', 'comp.graphics')

    newsgroup_dataset_train = sklearn.datasets.fetch_20newsgroups(subset = 'train', remove = REMOVE, categories = CATEGORIES)
    newsgroup_dataset_test = sklearn.datasets.fetch_20newsgroups(subset = 'test', remove = REMOVE, categories = CATEGORIES)

In [38]:
ana_datasets = {dataset: dataset_helper.get_dataset_dict_preprocessed(dataset, use_cached = True) for dataset in 'cade-ana mini20-ana ng20-ana r52-ana r8-ana webkb-ana'.split()}
X, Y, classes = ana_datasets['ng20-ana']


alt atheism faq atheist resources archive name atheism resources alt atheism archive name resources last modified december version atheist resources addresses of atheist organizations usa freedom from religion foundation darwin fish bumper stickers and assorted other atheist paraphernalia are available from the freedom from religion foundation in the us write to ffrf p o box madison wi telephone evolution designs evolution designs sell the darwin fish it s a fish symbol like the ones christians stick on their cars but with feet and the word darwin written inside the deluxe moulded d plastic fish is postpaid in the us write to evolution designs laurel canyon north hollywood ca people in the san francisco bay area can get darwin fish from lynn gold try mailing figmo netcom com for net people who go to lynn directly the price is per fish american atheist press aap publish various atheist books critiques of the bible lists of biblical contradictions and so on one such book is the bible han

In [6]:
def convert_target_names(data, target_names):
    return [target_names[x] for x in data]

data_train_X, data_train_Y = newsgroup_dataset_train.data, convert_target_names(newsgroup_dataset_train.target, newsgroup_dataset_train.target_names)
data_test_X, data_test_Y = newsgroup_dataset_test.data, convert_target_names(newsgroup_dataset_test.target, newsgroup_dataset_test.target_names)
classes = newsgroup_dataset_train.target_names

NameError: name 'newsgroup_dataset_train' is not defined

## Training classifiers on vectorized docs

### Feature extraction
Remove stopwords and vectorize with Tfidf

In [None]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(stop_words = 'english')
vectorizer.fit(data_train_X)

vectors_train = vectorizer.transform(data_train_X)
vectors_test = vectorizer.transform(data_test_X)

tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
tfidf_transformer.fit(vectors_train)
vectors_trans_train = tfidf_transformer.transform(vectors_train)
vectors_trans_test = tfidf_transformer.transform(vectors_test)

### Train different classifiers on tfidf vectors of docs

In [None]:
clfs = helper.get_classifiers()

results = {}
for clf_name, clf in clfs.items():
    print(clf_name)
    # Fit
    clf.fit(vectors_train, data_train_Y)
    
    # Predict
    pred_train = clf.predict(vectors_trans_train)
    pred_test = clf.predict(vectors_trans_test)
    
    # Metric
    f1_score_train = metrics.f1_score(data_train_Y, pred_train, average='macro')
    f1_score_test = metrics.f1_score(data_test_Y, pred_test, average='macro')
    results[clf_name] = {'train': f1_score_train, 'test': f1_score_test, 'acc': metrics.accuracy_score(data_test_Y, pred_test)}
    
    # Confusion matrix
    confusion_matrix = metrics.confusion_matrix(data_test_Y, pred_test, labels = classes)
    plt.figure(figsize = (8, 8))
    helper.plot_confusion_matrix(confusion_matrix, classes, title = '{}\n(f1 macro test-set: {:.2f})'.format(clf_name, f1_score_test), normalize = True)
print('F1 Scores')
pd.DataFrame(results)
plt.show()

## Train Doc2Vec and Word2Vec classifier

### Pre-process docs for d2v and w2v

In [None]:
w2v_data = [w2v_d2v.w2v_preproess(doc) for doc in data_train_X]
w2v_data_test = [w2v_d2v.w2v_preproess(doc) for doc in data_test_X]

### Train Word2Vec

In [None]:
model_w2v = w2v_d2v.train_w2v(w2v_data, iterations=1)

### Train Doc2Vec

In [None]:
model_d2v = w2v_d2v.train_d2v(w2v_data, data_train_Y, iterations = 1)

In [None]:
scores = w2v_d2v.score_d2v(clfs, data_train_Y, data_test_Y, data_test_Y_names, model_d2v, w2v_data, w2v_data_test, steps = 1)

In [None]:
scores