In [3]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# NLTK helpers shared by the text-cleaning step: a Porter stemmer and the
# English stop-word list.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Load the tab-separated SMS file and name its two columns.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is treated as a header and then overwritten by these names --
# confirm the file really starts with a header row; if it starts with a
# message, that message is dropped.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
)
data.columns = ['label', 'body_text']

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    "Non-space" means every character except the plain space ``' '``; punctuation
    is any character in ``string.punctuation``.

    Args:
        text: the message as a string.

    Returns:
        A float percentage: the punctuation ratio is rounded to 3 decimals and
        then multiplied by 100. Returns 0.0 when ``text`` is empty or consists
        only of spaces.
    """
    non_space = len(text) - text.count(' ')
    if non_space == 0:
        # Nothing to measure; dividing would raise ZeroDivisionError.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100

# Message length with space characters excluded.
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
# Punctuation share of each message, as computed by count_punct.
data['punct%'] = data['body_text'].apply(count_punct)

def clean_text(text):
    """Turn a raw message into a list of stemmed tokens.

    Steps: drop every character found in ``string.punctuation``, lowercase the
    rest, split on runs of non-word characters, discard tokens present in the
    module-level ``stopwords`` list, and stem what remains with the
    module-level ``ps`` stemmer.

    Args:
        text: the message as a string.

    Returns:
        A list of stemmed token strings. Empty strings produced by the split
        are not filtered out; they are passed through like any other token.
    """
    no_punct = ''.join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
    tokens = re.split(r'\W+', no_punct)
    return [ps.stem(token) for token in tokens if token not in stopwords]

# Fit TF-IDF over the messages, using clean_text as the analyzer.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Put the two hand-built features in front of the densified TF-IDF matrix.
X_features = pd.concat(
    [data[['body_len', 'punct%']], pd.DataFrame(X_tfidf.toarray())],
    axis=1,
)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 50 trees, each capped at depth 20; n_jobs=-1 requests all cores.
rf = RandomForestClassifier(
    n_jobs=-1,
    max_depth=20,
    n_estimators=50,
)


In [11]:
# Inspect the estimator: the class's attribute names, then the repr of a
# default-constructed instance.
for inspected in (dir(RandomForestClassifier), RandomForestClassifier()):
    print(inspected)

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_estimator_type', '_get_param_names', '_make_estimator', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0,

In [12]:
from sklearn.model_selection import KFold, cross_val_score

In [14]:
# 5-fold cross-validated accuracy for an otherwise-default random forest.
k_fold = KFold(n_splits=5)
rf = RandomForestClassifier(n_jobs=-1)
cross_val_score(
    estimator=rf,
    X=X_features,
    y=data['label'],
    scoring='accuracy',
    cv=k_fold,
    n_jobs=-1,
)

array([0.96858169, 0.97396768, 0.9703504 , 0.96495957, 0.96765499])

In [15]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data['label'],
    test_size=0.2,
)


In [19]:
# Fit the forest held in `rf` on the training split.
rf_model = rf.fit(X=X_train, y=y_train)

In [22]:
# Ten most important features as (importance, column) pairs, highest first.
# Sort on the importance alone: the columns mix str and int labels, so letting
# tuple comparison fall through to the label on a tie would raise TypeError.
sorted(zip(rf_model.feature_importances_, X_train.columns), key=lambda pair: pair[0], reverse=True)[0:10]

[(0.04612667758561504, 7350),
 (0.045827773087150706, 4796),
 (0.03663372636848508, 1803),
 (0.025758942386370217, 3134),
 (0.02068615830338519, 'body_len'),
 (0.018489117843075675, 7218),
 (0.01718975338107227, 6971),
 (0.016626705849019326, 5724),
 (0.016409898849284524, 295),
 (0.015965431120102472, 1361)]

In [25]:
# Predict on the held-out rows and score with 'spam' as the positive label.
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(
    y_test,
    y_pred,
    average='binary',
    pos_label='spam',
)

In [26]:
# Accuracy is the fraction of predictions that match the true labels.
# round() must wrap the accuracy expression itself; a stray trailing argument
# to format() is silently ignored and leaves the value unrounded.
print('Precision:{} / Recall:{} /Accuracy: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round((y_pred == y_test).sum() / len(y_pred), 3)))

Precision:1.0 / Recall:0.566 /Accuracy: 0.9470377019748654


In [33]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data['label'],
    test_size=0.2,
)

In [36]:
def train_RF(n_est, depth):
    """Fit a random forest on the train split and print its test-set metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        n_est: number of trees, passed as ``n_estimators``.
        depth: maximum tree depth, passed as ``max_depth`` (``None`` is
            forwarded unchanged).

    Returns:
        None. Prints one line with precision and recall for the 'spam' label
        and the overall accuracy, each rounded to 3 decimals.
    """
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    accuracy = (y_pred == y_test).sum() / len(y_pred)
    # "Est" = number of estimators (the label was misspelled "Eat").
    print('Est: {} / Depth: {} ==== Precision: {} / Recall {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [37]:
# Try every (n_estimators, max_depth) combination, tree count varying slowest.
for n_est, depth in [(n, d) for n in [10, 50, 100] for d in [10, 20, 30, None]]:
    train_RF(n_est, depth)

Eat: 10 / Depth: 10 ==== Precision: 1.0 / Recall 0.281 / Accuracy: 0.901
Eat: 10 / Depth: 20 ==== Precision: 1.0 / Recall 0.556 / Accuracy: 0.939
Eat: 10 / Depth: 30 ==== Precision: 0.99 / Recall 0.66 / Accuracy: 0.952
Eat: 10 / Depth: None ==== Precision: 0.991 / Recall 0.725 / Accuracy: 0.961
Eat: 50 / Depth: 10 ==== Precision: 1.0 / Recall 0.248 / Accuracy: 0.897
Eat: 50 / Depth: 20 ==== Precision: 1.0 / Recall 0.556 / Accuracy: 0.939
Eat: 50 / Depth: 30 ==== Precision: 1.0 / Recall 0.647 / Accuracy: 0.952
Eat: 50 / Depth: None ==== Precision: 0.992 / Recall 0.797 / Accuracy: 0.971
Eat: 100 / Depth: 10 ==== Precision: 1.0 / Recall 0.248 / Accuracy: 0.897
Eat: 100 / Depth: 20 ==== Precision: 1.0 / Recall 0.542 / Accuracy: 0.937
Eat: 100 / Depth: 30 ==== Precision: 1.0 / Recall 0.68 / Accuracy: 0.956
Eat: 100 / Depth: None ==== Precision: 0.984 / Recall 0.81 / Accuracy: 0.972


In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

# NLTK helpers shared by the text-cleaning step: a Porter stemmer and the
# English stop-word list.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Load the tab-separated SMS file and name its two columns.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is treated as a header and then overwritten by these names --
# confirm the file really starts with a header row; if it starts with a
# message, that message is dropped.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
)
data.columns = ['label', 'body_text']

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    "Non-space" means every character except the plain space ``' '``; punctuation
    is any character in ``string.punctuation``.

    Args:
        text: the message as a string.

    Returns:
        A float percentage: the punctuation ratio is rounded to 3 decimals and
        then multiplied by 100. Returns 0.0 when ``text`` is empty or consists
        only of spaces.
    """
    non_space = len(text) - text.count(' ')
    if non_space == 0:
        # Nothing to measure; dividing would raise ZeroDivisionError.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100

# Message length with space characters excluded.
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
# Punctuation share of each message, as computed by count_punct.
data['punct%'] = data['body_text'].apply(count_punct)

def clean_text(text):
    """Turn a raw message into a list of stemmed tokens.

    Steps: drop every character found in ``string.punctuation``, lowercase the
    rest, split on runs of non-word characters, discard tokens present in the
    module-level ``stopwords`` list, and stem what remains with the
    module-level ``ps`` stemmer.

    Args:
        text: the message as a string.

    Returns:
        A list of stemmed token strings. Empty strings produced by the split
        are not filtered out; they are passed through like any other token.
    """
    no_punct = ''.join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
    tokens = re.split(r'\W+', no_punct)
    return [ps.stem(token) for token in tokens if token not in stopwords]

# TF-IDF weights: fit on the messages with clean_text as the analyzer, then
# prepend the two hand-built features to the densified matrix.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat(
    [data[['body_len', 'punct%']], pd.DataFrame(X_tfidf.toarray())],
    axis=1,
)

# Raw term counts: same analyzer and the same two leading feature columns.
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat(
    [data[['body_len', 'punct%']], pd.DataFrame(X_count.toarray())],
    axis=1,
)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Grid-search forest size and depth with 5-fold CV on the TF-IDF feature set.
param = {
    "n_estimators": [10, 150, 300],
    "max_depth": [30, 60, 90, None],
}
rf = RandomForestClassifier()
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['label'])

# Five best parameter combinations by mean test score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
8,55.68513,1.360486,0.619445,0.089435,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.975741,0.974843,...,0.974313,0.003255,1,0.999326,0.999326,0.999102,0.999326,0.998877,0.999192,0.00018
11,54.509203,7.013257,0.540091,0.090461,,300,"{'max_depth': None, 'n_estimators': 300}",0.975785,0.978437,0.974843,...,0.973954,0.003598,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,29.620848,0.777243,0.457138,0.03997,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978475,0.973046,0.974843,...,0.972876,0.004426,3,0.999326,0.998877,0.999326,0.999102,0.998877,0.999102,0.000201
10,33.056481,0.783319,0.476447,0.067291,,150,"{'max_depth': None, 'n_estimators': 150}",0.979372,0.973944,0.975741,...,0.972696,0.004819,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,25.288928,1.014422,0.38378,0.06043,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.974888,0.974843,0.973944,...,0.971978,0.003482,5,0.993711,0.993264,0.994387,0.994163,0.993264,0.993758,0.000458


In [4]:
# Grid-search forest size and depth with 5-fold CV on the count-vector feature set.
param = {
    "n_estimators": [10, 150, 300],
    "max_depth": [30, 60, 90, None],
}
rf = RandomForestClassifier()
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])

# Five best parameter combinations by mean test score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
11,53.547554,7.146727,0.560279,0.149701,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.973046,0.974843,...,0.972876,0.003197,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
10,30.91051,0.425368,0.534693,0.087512,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.972147,0.974843,...,0.972517,0.003974,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,28.617822,0.304787,0.387979,0.016464,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977578,0.972147,0.973046,...,0.972337,0.003441,3,0.998652,0.998877,0.998653,0.999326,0.998653,0.998832,0.000262
8,54.418655,1.62117,0.672015,0.110184,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.976682,0.973944,0.974843,...,0.972157,0.003822,4,0.998877,0.998653,0.998877,0.999326,0.998877,0.998922,0.00022
4,23.577107,0.475266,0.3496,0.033313,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.977578,0.972147,0.971249,...,0.97072,0.004437,5,0.993261,0.991693,0.994163,0.993489,0.99304,0.993129,0.000811
