In [33]:
import joblib
import numpy
# Seed NumPy's global random number generator once, up front, so that a fresh
# "Restart Kernel -> Run All" draws the same random numbers every time.
numpy.random.seed(42)

In [34]:
# Load the pickle files produced in the previous chapter (text learning):
# the per-email word data and the matching author labels.
# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# files that you generated yourself or otherwise trust.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

# Open each file in a `with` block so the handle is closed as soon as loading
# finishes, instead of leaking it for the lifetime of the kernel.
with open(words_file, "rb") as words_handle:
    word_data = joblib.load(words_handle)
with open(authors_file, "rb") as authors_handle:
    authors = joblib.load(authors_handle)

In [35]:
### test_size is the fraction of events held out for the test set
### (0.1 -> 10% test); the remainder go into training.
### random_state is fixed so the same split is produced on every run.
### (The features are converted to dense arrays in later cells, not here.)
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    word_data,
    authors,
    test_size=0.1,
    random_state=42,
)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw email text into TF-IDF features. The vocabulary is fitted on
# the training emails only and then reused, unchanged, for the test emails.
# NOTE: this cell overwrites features_train / features_test in place, so
# re-run the train/test split cell before re-running this one.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words="english",
)
features_train = vectorizer.fit_transform(features_train)
# Only the test matrix is densified here; the training matrix is densified
# after it has been cut down to a small subset.
features_test = vectorizer.transform(features_test).toarray()

In [37]:
### Overfitting is easy to provoke with few data points and many features,
### so keep only the first 150 training events to land in that regime.
### NOTE: features_train is overwritten here, so this cell should not be
### re-run on its own without re-running the vectorizer cell first.
n_train_events = 150  # same cut-off for features and labels so they stay aligned
features_train = features_train[:n_train_events].toarray()
labels_train = labels_train[:n_train_events]

In [38]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with scikit-learn's default hyperparameters on the
# small training subset; fit() returns the fitted estimator itself.
clf = DecisionTreeClassifier().fit(features_train, labels_train)

# Accuracy on the held-out test set (displayed as the cell output).
clf.score(features_test, labels_test)

0.8168373151308305

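The test accuracy (about 0.82) is much higher than expected. We trained on only 150 points with tens of thousands of features, so the tree should be overfitting, and an overfit model normally performs poorly on the test set. A score this high suggests that a few features are unusually powerful, so we inspect the feature importances next.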

In [39]:
import numpy as np

# Importance of every feature as reported by the fitted tree.
features_importance_list = clf.feature_importances_
# Feature indices ordered from most to least important.
indices = np.argsort(features_importance_list)[::-1]

# Print the ten highest-ranked features as "rank. feature no.<index>: (<importance>)".
row_template = "{}. feature no.{}: ({})"
for i in range(10):
    feature_no = indices[i]
    print(row_template.format(i + 1, feature_no, features_importance_list[feature_no]))

1. feature no.21323: (0.36363636363636365)
2. feature no.18849: (0.1869272434489826)
3. feature no.11975: (0.10537857900318125)
4. feature no.22546: (0.08406920992286854)
5. feature no.29690: (0.047580525890385035)
6. feature no.16267: (0.047407407407407405)
7. feature no.18095: (0.04266666666666666)
8. feature no.13080: (0.026280193236714978)
9. feature no.25675: (0.02552933057280883)
10. feature no.24320: (0.02481019450033535)


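A feature whose importance exceeds roughly 0.2 is typically an outlier worth investigating. Here feature no. 21323 (importance ≈ 0.36) clearly stands out.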

In [41]:
# Find the word behind the most discriminating feature (column 21323, the top
# entry of the importance ranking printed above).
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Fall back to the
# old method only when running on a release that predates the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# str() keeps the displayed value a plain Python string under either API.
str(feature_names[21323])

'houectect'

In [42]:
# Accuracy on the held-out test set, shown again for comparison with the
# training accuracy.
clf.score(X=features_test, y=labels_test)

0.8168373151308305

In [43]:
# Accuracy on the same 150 training events the tree was fitted on.
clf.score(X=features_train, y=labels_train)

1.0