# Baseline model

## Load 20 Newsgroups data, keeping only 4 categories

In [1]:
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation and convergence warnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

# Keep only these four newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Both splits are fetched with identical options; only `subset` differs.
twenty_train, twenty_test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        shuffle=True,
        random_state=42,
    )
    for split in ('train', 'test')
)

### A basic text-processing pipeline: bag-of-words features with Logistic Regression as the classifier

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

# Baseline: token counts feeding a cross-validated logistic regression.
# `vec` and `clf` keep their own names because later cells pass them to eli5.
vec, clf = CountVectorizer(), LogisticRegressionCV(max_iter=200)
pipe = make_pipeline(vec, clf)
# Left as the final expression so the fitted pipeline's repr is displayed.
pipe.fit(
    twenty_train.data,
    twenty_train.target,
)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))])

In [3]:
from sklearn import metrics

def print_report(pipe):
    """Print test-set classification metrics for a fitted pipeline.

    Predicts on the module-level ``twenty_test`` split (looked up at call
    time), prints the per-class classification report, and then prints the
    overall accuracy formatted to three decimals.

    Parameters
    ----------
    pipe : object exposing ``predict``
        Called with ``twenty_test.data`` to obtain the predicted labels.
    """
    expected = twenty_test.target
    predicted = pipe.predict(twenty_test.data)
    print(metrics.classification_report(
        expected, predicted, target_names=twenty_test.target_names))
    accuracy = metrics.accuracy_score(expected, predicted)
    print("accuracy: {:0.3f}".format(accuracy))

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.80      0.86       319
         comp.graphics       0.87      0.96      0.91       389
               sci.med       0.94      0.81      0.87       396
soc.religion.christian       0.85      0.97      0.91       398

             micro avg       0.89      0.89      0.89      1502
             macro avg       0.90      0.89      0.89      1502
          weighted avg       0.90      0.89      0.89      1502

accuracy: 0.890


In [4]:
import eli5
# Inspect the classifier's weights (top=10), with features named through
# `vec` and classes labelled by newsgroup name.
eli5.show_weights(
    clf,
    vec=vec,
    top=10,
    target_names=twenty_test.target_names,
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+1.667,keith,,
+1.660,mathew,,
+1.560,atheism,,
+1.509,okcforum,,
+1.418,go,,
+1.404,psuvm,,
+1.350,believing,,
+1.322,psu,,
… 9771 more positive …,… 9771 more positive …,,
… 26008 more negative …,… 26008 more negative …,,

Weight?,Feature
+1.667,keith
+1.660,mathew
+1.560,atheism
+1.509,okcforum
+1.418,go
+1.404,psuvm
+1.350,believing
+1.322,psu
… 9771 more positive …,… 9771 more positive …
… 26008 more negative …,… 26008 more negative …

Weight?,Feature
+1.702,graphics
+0.825,images
+0.798,files
+0.786,software
+0.779,file
+0.773,image
+0.729,package
+0.724,card
+0.702,3d
… 11379 more positive …,… 11379 more positive …

Weight?,Feature
+2.016,pitt
+1.951,doctor
+1.759,information
+1.696,disease
+1.654,treatment
+1.521,msg
+1.518,health
… 13545 more positive …,… 13545 more positive …
… 22234 more negative …,… 22234 more negative …
-1.765,god

Weight?,Feature
+1.190,rutgers
+1.029,church
+1.022,christians
+0.944,clh
+0.898,christ
+0.794,christian
… 10322 more positive …,… 10322 more positive …
… 25457 more negative …,… 25457 more negative …
-0.855,graphics
-0.896,posting


In [5]:
# Explain the prediction for the first test document, for every class.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
)

Contribution?,Feature
2.35,Highlighted in text (sum)
-9.842,<BIAS>

Contribution?,Feature
-1.379,<BIAS>
-3.213,Highlighted in text (sum)

Contribution?,Feature
8.864,Highlighted in text (sum)
-4.91,<BIAS>

Contribution?,Feature
-0.114,<BIAS>
-7.046,Highlighted in text (sum)


# Baseline model, improved data

In [6]:
# Reload both splits with headers and footers stripped; every other option
# matches the first load.
twenty_train, twenty_test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=['headers', 'footers'],
    )
    for split in ('train', 'test')
)

# Refit the same baseline pipeline on the cleaned text.
vec, clf = CountVectorizer(), LogisticRegressionCV(max_iter=200)
pipe = make_pipeline(vec, clf)
# Left as the final expression so the fitted pipeline's repr is displayed.
pipe.fit(
    twenty_train.data,
    twenty_train.target,
)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))])

In [7]:
# Score the baseline pipeline refit on the header- and footer-stripped data.
print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.84      0.77      0.81       319
         comp.graphics       0.83      0.95      0.89       389
               sci.med       0.90      0.79      0.84       396
soc.religion.christian       0.86      0.90      0.88       398

             micro avg       0.86      0.86      0.86      1502
             macro avg       0.86      0.85      0.85      1502
          weighted avg       0.86      0.86      0.86      1502

accuracy: 0.858


In [8]:
# Explain the first test document's score, restricted to the 'sci.med' class.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['sci.med'],
)

Contribution?,Feature
1.747,Highlighted in text (sum)
-1.716,<BIAS>


# Pipeline improvements

In [9]:
# Same count-based pipeline, now dropping English stop words.
# NOTE(review): max_iter is left at its default here, whereas the earlier
# baselines set max_iter=200 — confirm the difference is intentional.
vec, clf = CountVectorizer(stop_words='english'), LogisticRegressionCV()
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.87      0.76      0.81       319
         comp.graphics       0.85      0.95      0.90       389
               sci.med       0.93      0.85      0.89       396
soc.religion.christian       0.85      0.89      0.87       398

             micro avg       0.87      0.87      0.87      1502
             macro avg       0.87      0.87      0.87      1502
          weighted avg       0.87      0.87      0.87      1502

accuracy: 0.871


In [10]:
# Explain the first test document's 'sci.med' score with stop words removed.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['sci.med'],
)

Contribution?,Feature
2.184,Highlighted in text (sum)
-1.674,<BIAS>


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Swap the count features for TF-IDF features; the classifier is
# LogisticRegressionCV with default settings.
vec, clf = TfidfVectorizer(), LogisticRegressionCV()
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.91      0.79      0.85       319
         comp.graphics       0.83      0.97      0.90       389
               sci.med       0.95      0.87      0.91       396
soc.religion.christian       0.90      0.91      0.91       398

             micro avg       0.89      0.89      0.89      1502
             macro avg       0.90      0.89      0.89      1502
          weighted avg       0.90      0.89      0.89      1502

accuracy: 0.892


In [12]:
# Explain the first test document's 'sci.med' score under the TF-IDF model.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['sci.med'],
)

Contribution?,Feature
6.783,Highlighted in text (sum)
-5.205,<BIAS>


In [13]:
# TF-IDF features with English stop words removed.
vec, clf = TfidfVectorizer(stop_words='english'), LogisticRegressionCV()
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.77      0.84       319
         comp.graphics       0.84      0.97      0.90       389
               sci.med       0.95      0.89      0.92       396
soc.religion.christian       0.88      0.92      0.90       398

             micro avg       0.89      0.89      0.89      1502
             macro avg       0.90      0.89      0.89      1502
          weighted avg       0.90      0.89      0.89      1502

accuracy: 0.893


In [14]:
# Explain the first test document's 'sci.med' score for the TF-IDF model
# trained with stop words removed.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['sci.med'],
)

Contribution?,Feature
5.488,Highlighted in text (sum)
-3.578,<BIAS>


# Char-based pipeline

In [15]:
# Character n-grams of length 3 to 5 instead of word tokens.
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer='word', so combining it with analyzer='char' had no effect and
# wrongly suggested that stop words were being filtered.
vec = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.79      0.85       319
         comp.graphics       0.81      0.97      0.89       389
               sci.med       0.95      0.86      0.90       396
soc.religion.christian       0.89      0.91      0.90       398

             micro avg       0.89      0.89      0.89      1502
             macro avg       0.90      0.88      0.89      1502
          weighted avg       0.89      0.89      0.89      1502

accuracy: 0.888


In [16]:
# Explain the first test document under the character n-gram model, for
# every class.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
)

Contribution?,Feature
-0.836,Highlighted in text (sum)
-6.481,<BIAS>

Contribution?,Feature
0.934,<BIAS>
-6.052,Highlighted in text (sum)

Contribution?,Feature
4.483,Highlighted in text (sum)
-5.142,<BIAS>

Contribution?,Feature
0.601,Highlighted in text (sum)
-5.647,<BIAS>


In [17]:
# Switch the analyzer to 'char_wb', keeping the 3-5 character n-gram range.
vec, clf = (
    TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5)),
    LogisticRegressionCV(),
)
pipe = make_pipeline(vec, clf).fit(twenty_train.data, twenty_train.target)

print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.93      0.79      0.85       319
         comp.graphics       0.87      0.96      0.91       389
               sci.med       0.91      0.90      0.90       396
soc.religion.christian       0.89      0.91      0.90       398

             micro avg       0.89      0.89      0.89      1502
             macro avg       0.90      0.89      0.89      1502
          weighted avg       0.90      0.89      0.89      1502

accuracy: 0.894


In [18]:
# Explain the first test document under the 'char_wb' model, for every class.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
)

Contribution?,Feature
-2.56,Highlighted in text (sum)
-6.318,<BIAS>

Contribution?,Feature
0.974,<BIAS>
-6.981,Highlighted in text (sum)

Contribution?,Feature
2.134,Highlighted in text (sum)
-2.573,<BIAS>

Contribution?,Feature
3.269,Highlighted in text (sum)
-5.774,<BIAS>


# Debugging HashingVectorizer

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashed unigram + bigram features with English stop words removed.
vec = HashingVectorizer(stop_words='english', ngram_range=(1,2))
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21; `max_iter`
# is its replacement. `tol=None` disables the early-stopping criterion so that
# exactly 10 passes over the data are made, as n_iter=10 did.
clf = SGDClassifier(max_iter=10, tol=None, random_state=42)
pipe = make_pipeline(vec, clf)
pipe.fit(twenty_train.data, twenty_train.target)

print_report(pipe)



                        precision    recall  f1-score   support

           alt.atheism       0.90      0.80      0.85       319
         comp.graphics       0.88      0.96      0.92       389
               sci.med       0.93      0.90      0.92       396
soc.religion.christian       0.89      0.91      0.90       398

             micro avg       0.90      0.90      0.90      1502
             macro avg       0.90      0.89      0.90      1502
          weighted avg       0.90      0.90      0.90      1502

accuracy: 0.899


In [20]:
# Explain the first test document's 'sci.med' score for the hashing-based
# model.
eli5.show_prediction(
    clf,
    twenty_test.data[0],
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['sci.med'],
)

Contribution?,Feature
0.678,Highlighted in text (sum)
-0.581,<BIAS>


In [21]:
# Weights of the hashing-based model. As the output shows, features are
# listed by column id (x199378, ...) rather than by token.
eli5.show_weights(
    clf,
    vec=vec,
    top=10,
    target_names=twenty_test.target_names,
)


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.836,x199378,,
+2.378,x938889,,
+1.776,x718537,,
+1.625,x349126,,
+1.554,x242643,,
+1.509,x71928,,
… 50341 more positive …,… 50341 more positive …,,
… 50567 more negative …,… 50567 more negative …,,
-1.634,x683213,,
-1.795,x741207,,

Weight?,Feature
+2.836,x199378
+2.378,x938889
+1.776,x718537
+1.625,x349126
+1.554,x242643
+1.509,x71928
… 50341 more positive …,… 50341 more positive …
… 50567 more negative …,… 50567 more negative …
-1.634,x683213
-1.795,x741207

Weight?,Feature
+3.737,x580586
+2.056,x342790
+1.956,x771885
+1.787,x363686
+1.717,x111283
… 32081 more positive …,… 32081 more positive …
… 31710 more negative …,… 31710 more negative …
-1.760,x857427
-1.779,x85557
-1.813,x693269

Weight?,Feature
+2.209,x988761
+2.194,x337555
+2.162,x154565
+1.818,x806262
… 44124 more positive …,… 44124 more positive …
… 43892 more negative …,… 43892 more negative …
-1.704,x790864
-1.750,x580586
-1.851,x34701
-2.085,x85557

Weight?,Feature
+3.034,x641063
+3.016,x199709
+2.977,x741207
+2.092,x396081
+1.901,x274863
… 51475 more positive …,… 51475 more positive …
… 51717 more negative …,… 51717 more negative …
-1.963,x672777
-2.096,x199378
-2.143,x443433


In [22]:
from eli5.sklearn import InvertableHashingVectorizer
import numpy as np

In [23]:
ivec = InvertableHashingVectorizer(vec)

# Fit the inverse mapping on a tenth of the training documents.
sample_size = len(twenty_train.data) // 10

# A seeded generator makes the sample, and therefore the recovered feature
# names, identical on every run. Indices are sampled rather than the documents
# themselves, which avoids converting the corpus into a fixed-width NumPy
# string array, and replace=False prevents the same document being drawn twice.
rng = np.random.RandomState(42)
sample_idx = rng.choice(len(twenty_train.data), size=sample_size, replace=False)
X_sample = [twenty_train.data[i] for i in sample_idx]

# Left as the final expression so the fitted vectorizer's repr is displayed.
ivec.fit(X_sample)

InvertableHashingVectorizer(unkn_template='FEATURE[%d]',
              vec=HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=False,
         norm='l2', preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None))

In [24]:
# Same weights view, now passing the fitted InvertableHashingVectorizer.
# The output shows token names, with a FEATURE[...] placeholder for a
# feature it did not name.
eli5.show_weights(
    clf,
    vec=ivec,
    top=20,
    target_names=twenty_test.target_names,
)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.836,atheism,,
+2.378,writes,,
+1.634,morality,,
+1.625,FEATURE[349126],,
+1.554,religion,,
+1.509,islam,,
+1.489,keith,,
+1.476,religious,,
+1.439,objective,,
+1.414,wrote,,

Weight?,Feature
+2.836,atheism
+2.378,writes
+1.634,morality
+1.625,FEATURE[349126]
+1.554,religion
+1.509,islam
+1.489,keith
+1.476,religious
+1.439,objective
+1.414,wrote

Weight?,Feature
+3.737,graphics
+2.447,image
+2.056,code
+2.021,files
+1.956,images
+1.813,3d
+1.787,software
+1.717,file
+1.701,ftp
+1.587,video

Weight?,Feature
+2.209,health
+2.194,msg
+2.162,doctor
+2.150,disease
+2.147,treatment
+1.851,medical
+1.818,com
+1.704,pain
+1.663,effects
+1.616,cancer …

Weight?,Feature
+3.245,church
+3.034,christians
+3.016,christ
+2.977,rutgers
+2.963,rutgers edu
+2.143,christian
+2.092,heaven
+1.963,love
+1.901,athos rutgers
+1.901,athos


In [25]:
# Take the first training document that mentions "rutgers" (case-insensitive).
# next() stops at the first match instead of lowercasing and filtering the
# whole corpus just to keep element 0.
rutgers_example = next(
    doc for doc in twenty_train.data if 'rutgers' in doc.lower()
)
print(rutgers_example)

In article <Apr.8.00.57.41.1993.28246@athos.rutgers.edu> REXLEX@fnal.gov writes:
>In article <Apr.7.01.56.56.1993.22824@athos.rutgers.edu> shrum@hpfcso.fc.hp.com
>Matt. 22:9-14 'Go therefore to the main highways, and as many as you find
>there, invite to the wedding feast.'...

>hmmmmmm.  Sounds like your theology and Christ's are at odds. Which one am I 
>to believe?


In [26]:
# Explain the 'soc.religion.christian' score for the rutgers example.
eli5.show_prediction(
    clf,
    rutgers_example,
    vec=vec,
    target_names=twenty_test.target_names,
    targets=['soc.religion.christian'],
)

Contribution?,Feature
2.706,Highlighted in text (sum)
-0.662,<BIAS>
