In [26]:
import pandas as pd
spam_raw = pd.read_csv('spam.csv',encoding = 'UTF-16')

In [27]:
# remove empty colomns & rename colomns
spam_raw = spam_raw.loc[:,'v1':'v2']
spam_raw = spam_raw.rename(columns	= {'v1':'label','v2':'text'})	
# lower case
spam_raw['text'] = spam_raw['text'].str.lower()

In [28]:
import seaborn as sns
%matplotlib inline

In [29]:
# convert label to a numerical variable
spam_raw['label_num'] = spam_raw.label.map({'ham':0, 'spam':1})

In [30]:
spam_raw.head()

Unnamed: 0,label,text,label_num
0,ham,"go until jurong point, crazy.. available only ...",0
1,ham,ok lar... joking wif u oni...,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor... u c already then say...,0
4,ham,"nah i don't think he goes to usf, he lives aro...",0


In [31]:
# vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,encoding='utf-16',stop_words='english')

# tfidf transformation
features_transformed = vectorizer.fit_transform(spam_raw['text'])

# feature reduction: select best proportion(5%) of features
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile = 15)
selector.fit(features_transformed, spam_raw['label_num'])
features_transformed = selector.transform(features_transformed).toarray()

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [33]:
depth_range = range(1,20)
parameters = dict(max_depth=depth_range)
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters,cv = 10)

In [34]:
# fit grid with data
clf.fit(features_transformed, spam_raw['label_num'])

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 20)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [35]:
print(clf.cv_results_)

{'mean_fit_time': array([ 0.07000396,  0.08490479,  0.1019058 ,  0.1183068 ,  0.13520763,
        0.1434082 ,  0.15470881,  0.16620941,  0.18221042,  0.19121094,
        0.20371163,  0.21911256,  0.23521347,  0.25041432,  0.25921485,
        0.26421502,  0.27401564,  0.29241664,  0.30161731]), 'std_fit_time': array([ 0.00126495,  0.0009435 ,  0.00225623,  0.00110006,  0.00552839,
        0.00458721,  0.00161565,  0.00177767,  0.00188706,  0.00193919,
        0.00357937,  0.00448249,  0.00285683,  0.0038525 ,  0.00299346,
        0.00203977,  0.0040003 ,  0.0137718 ,  0.00593667]), 'mean_score_time': array([ 0.00380023,  0.00360031,  0.00400026,  0.00420017,  0.00460036,
        0.0042002 ,  0.00360022,  0.00390041,  0.00420027,  0.00380023,
        0.00410028,  0.00410025,  0.00460024,  0.00460019,  0.00460026,
        0.00410032,  0.0041003 ,  0.00430031,  0.00470023]), 'std_score_time': array([ 0.0004001 ,  0.00048984,  0.00044723,  0.00040003,  0.00066341,
        0.00040002,  0.000

In [36]:
print(clf.best_score_)

0.95944005743


In [37]:
print(clf.best_index_)

18
