In [1]:
#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

# Load the pickled inputs inside `with` blocks so both file handles are
# closed deterministically instead of being left to the garbage collector.
# The "r" mode is kept exactly as before so the files are read the same way.
# NOTE(review): pickle.load can execute arbitrary code; only point these
# paths at files you produced yourself.
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)



### test_size is the fraction of events assigned to the test set (the
### remainder go into training)
### feature matrices are converted to dense representations for compatibility
### with classifier functions in versions 0.15.2 and earlier
try:
    from sklearn import cross_validation
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # train_test_split lives in sklearn.model_selection. Bind it to the old
    # name so every `cross_validation.train_test_split` call keeps working.
    from sklearn import model_selection as cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vocabulary / idf weights on the training split only, then reuse the
# fitted vectorizer to transform the test split.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train   = labels_train[:150]



### your code goes here




In [2]:
# Exercise: number of features and overfitting
# Show (number of training events, number of features) of the dense matrix.
print(features_train.shape)

(150, 37863)


In [3]:
# Exercise: accuracy of the overfit decision tree
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training are chained.
clf = DecisionTreeClassifier().fit(features_train, labels_train)

# Mean accuracy on the held-out test split (displayed as the cell output).
clf.score(features_test, labels_test)

0.94766780432309439

In [4]:
# Exercise: identify the most powerful feature
# Print "<index> <importance>" for every feature whose importance is above 0.2.
for idx, importance in enumerate(clf.feature_importances_):
    if not (importance > 0.2):
        continue
    print("%s %s" % (idx, importance))

33614 0.764705882353


In [5]:
# Exercise: use TfIdf to get the most important word
# 33614 is the feature index printed by the previous cell; map it back to the
# vocabulary term of the fitted vectorizer.
vectorizer.get_feature_names()[33614]

u'sshacklensf'

In [6]:
# Exercise: remove, repeat
# Strip the token identified above from every email, then rebuild the
# split, the TfIdf features and the tree exactly as in the first cell.
word_data = [line.replace('sshacklensf', '') for line in word_data]

# ---- re-split, re-vectorize, re-train -----------------------
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

vectorizer = TfidfVectorizer(
    sublinear_tf=True, max_df=0.5, stop_words='english')
# Fit on the full training split, then keep only the first 150 events (dense).
features_train = vectorizer.fit_transform(features_train)[:150].toarray()
labels_train = labels_train[:150]
features_test = vectorizer.transform(features_test).toarray()

clf = DecisionTreeClassifier().fit(features_train, labels_train)
# -------------------------------------------------------------

# Two spaces after the colon reproduce the original print-statement spacing.
print("Score:  %s" % clf.score(features_test, labels_test))

# Report every feature whose importance is above 0.2.
for idx, importance in enumerate(clf.feature_importances_):
    if not (importance > 0.2):
        continue
    print("Importance: %s %s" % (idx, importance))

Score:  0.970420932878
Importance: 14343 0.666666666667


In [7]:
# 14343 is the feature index printed by the previous cell; look up the
# corresponding vocabulary term in the re-fitted vectorizer.
vectorizer.get_feature_names()[14343]

u'cgermannsf'

In [8]:
# Strip the second token identified above from every email, then rebuild
# the split, the TfIdf features and the tree once more.
word_data = [line.replace('cgermannsf', '') for line in word_data]

# ---- re-split, re-vectorize, re-train -----------------------
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

vectorizer = TfidfVectorizer(
    sublinear_tf=True, max_df=0.5, stop_words='english')
# Fit on the full training split, then keep only the first 150 events (dense).
features_train = vectorizer.fit_transform(features_train)[:150].toarray()
labels_train = labels_train[:150]
features_test = vectorizer.transform(features_test).toarray()

clf = DecisionTreeClassifier().fit(features_train, labels_train)
# -------------------------------------------------------------

# Two spaces after the colon reproduce the original print-statement spacing.
print("Score:  %s" % clf.score(features_test, labels_test))

# Report every feature whose importance is above 0.2.
for idx, importance in enumerate(clf.feature_importances_):
    if not (importance > 0.2):
        continue
    print("Importance: %s %s" % (idx, importance))

Score:  0.816268486917
Importance: 21323 0.363636363636


In [9]:
# 21323 is the feature index printed by the previous cell; look up the
# corresponding vocabulary term in the re-fitted vectorizer.
vectorizer.get_feature_names()[21323]

u'houectect'