In [659]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re, os, random

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

%matplotlib inline

# reproducible results
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',150)

#root = '/Users/schwalmdaniel/github/kaggle/ml_training/session2'
root = 'e:/kaggle/ml_training/session3'

# data explanation here: https://rstudio-pubs-static.s3.amazonaws.com/155304_cc51f448116744069664b35e7762999f.html

train=pd.read_csv(root + "/spam.csv", usecols=['v1','v2'])

# have a look at the ds
train.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [660]:
# first let's see what is the shape of the data (cols, rows)

train.shape

(5572, 2)

In [661]:
train['v1'] = train['v1'].apply(lambda x: 1 if x == 'spam' else 0)

In [662]:
train['v1'].value_counts()

# 15% of the rows are spam

0    4825
1     747
Name: v1, dtype: int64

In [663]:
train['v2'] = train['v2'].str.lower()

In [664]:
train[train['v1']==1]

Unnamed: 0,v1,v2
2,1,free entry in 2 a wkly comp to win fa cup fina...
5,1,freemsg hey there darling it's been 3 week's n...
8,1,winner!! as a valued network customer you have...
9,1,had your mobile 11 months or more? u r entitle...
11,1,"six chances to win cash! from 100 to 20,000 po..."
12,1,urgent! you have won a 1 week free membership ...
15,1,"xxxmobilemovieclub: to use your credit, click ..."
19,1,england v macedonia - dont miss the goals/team...
34,1,thanks for your subscription to ringtone uk yo...
42,1,07732584351 - rodger burns - msg = we tried to...


In [665]:
train['no_of_num'] =  train['v2'].apply(lambda x : sum([1 for ch in x if ch in string.digits]))
#train['no_of_punc'] =  train['v2'].apply(lambda x : sum([1 for ch in x if ch in ['?','!']]))
#train['no_of_punc'] =  train['v2'].apply(lambda x : sum([int(ch) for ch in x if ch in string.]))

In [666]:
train.head()

Unnamed: 0,v1,v2,no_of_num
0,0,"go until jurong point, crazy.. available only ...",0
1,0,ok lar... joking wif u oni...,0
2,1,free entry in 2 a wkly comp to win fa cup fina...,25
3,0,u dun say so early hor... u c already then say...,0
4,0,"nah i don't think he goes to usf, he lives aro...",0


In [667]:
corpus = train['v2'].tolist()
corpus[:10]

['go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say so early hor... u c already then say...',
 "nah i don't think he goes to usf, he lives around here though",
 "freemsg hey there darling it's been 3 week's now and no word back! i'd like some fun you up for it still? tb ok! xxx std chgs to send, ĺł1.50 to rcv",
 'even my brother is not like to speak with me. they treat me like aids patent.',
 "as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your callertune for all callers. press *9 to copy your friends callertune",
 'winner!! as a valued network customer you have been selected to receivea ĺł900 prize reward! to claim call 09061701461. claim code kl341. valid 12 hours only.',
 'had y

In [668]:
n_features = 450
mystopwords = (stopwords.words('english')) + list(stop_words.ENGLISH_STOP_WORDS)
tfidf = TfidfVectorizer(max_features=n_features,stop_words=mystopwords)
tfidf.fit(corpus)

tfidf_train = np.array(tfidf.transform(train['v2']).todense(), dtype=np.float16)

for i in range(n_features):
    train['v2_tfidf_' + str(i)] = tfidf_train[:, i]
    
del tfidf_train

In [669]:
count_vect = CountVectorizer(max_features=n_features,stop_words=mystopwords)
X_train_counts = count_vect.fit_transform(corpus)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
weights = np.asarray(X_train_tfidf.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': count_vect.get_feature_names(), 'weight': weights})
weights_df = weights_df.sort_values(ascending=False, by=['weight'])
weights_df.head(100)

Unnamed: 0,term,weight
264,ok,0.02424
393,ur,0.01903
190,know,0.018186
78,come,0.017554
204,like,0.017435
150,gt,0.017269
216,lt,0.017237
146,good,0.016654
147,got,0.0163
370,time,0.015068


In [670]:
train['v2_len'] = train['v2'].apply(lambda x: len(x.strip()))

In [672]:
train.head()

Unnamed: 0,v1,v2,no_of_num,v2_tfidf_0,v2_tfidf_1,v2_tfidf_2,v2_tfidf_3,v2_tfidf_4,v2_tfidf_5,v2_tfidf_6,v2_tfidf_7,v2_tfidf_8,v2_tfidf_9,v2_tfidf_10,v2_tfidf_11,v2_tfidf_12,v2_tfidf_13,v2_tfidf_14,v2_tfidf_15,v2_tfidf_16,v2_tfidf_17,v2_tfidf_18,v2_tfidf_19,v2_tfidf_20,v2_tfidf_21,v2_tfidf_22,v2_tfidf_23,v2_tfidf_24,v2_tfidf_25,v2_tfidf_26,v2_tfidf_27,v2_tfidf_28,v2_tfidf_29,v2_tfidf_30,v2_tfidf_31,v2_tfidf_32,v2_tfidf_33,v2_tfidf_34,v2_tfidf_35,v2_tfidf_36,v2_tfidf_37,v2_tfidf_38,v2_tfidf_39,v2_tfidf_40,v2_tfidf_41,v2_tfidf_42,v2_tfidf_43,v2_tfidf_44,v2_tfidf_45,v2_tfidf_46,v2_tfidf_47,v2_tfidf_48,v2_tfidf_49,v2_tfidf_50,v2_tfidf_51,v2_tfidf_52,v2_tfidf_53,v2_tfidf_54,v2_tfidf_55,v2_tfidf_56,v2_tfidf_57,v2_tfidf_58,v2_tfidf_59,v2_tfidf_60,v2_tfidf_61,v2_tfidf_62,v2_tfidf_63,v2_tfidf_64,v2_tfidf_65,v2_tfidf_66,v2_tfidf_67,v2_tfidf_68,v2_tfidf_69,v2_tfidf_70,v2_tfidf_71,...,v2_tfidf_376,v2_tfidf_377,v2_tfidf_378,v2_tfidf_379,v2_tfidf_380,v2_tfidf_381,v2_tfidf_382,v2_tfidf_383,v2_tfidf_384,v2_tfidf_385,v2_tfidf_386,v2_tfidf_387,v2_tfidf_388,v2_tfidf_389,v2_tfidf_390,v2_tfidf_391,v2_tfidf_392,v2_tfidf_393,v2_tfidf_394,v2_tfidf_395,v2_tfidf_396,v2_tfidf_397,v2_tfidf_398,v2_tfidf_399,v2_tfidf_400,v2_tfidf_401,v2_tfidf_402,v2_tfidf_403,v2_tfidf_404,v2_tfidf_405,v2_tfidf_406,v2_tfidf_407,v2_tfidf_408,v2_tfidf_409,v2_tfidf_410,v2_tfidf_411,v2_tfidf_412,v2_tfidf_413,v2_tfidf_414,v2_tfidf_415,v2_tfidf_416,v2_tfidf_417,v2_tfidf_418,v2_tfidf_419,v2_tfidf_420,v2_tfidf_421,v2_tfidf_422,v2_tfidf_423,v2_tfidf_424,v2_tfidf_425,v2_tfidf_426,v2_tfidf_427,v2_tfidf_428,v2_tfidf_429,v2_tfidf_430,v2_tfidf_431,v2_tfidf_432,v2_tfidf_433,v2_tfidf_434,v2_tfidf_435,v2_tfidf_436,v2_tfidf_437,v2_tfidf_438,v2_tfidf_439,v2_tfidf_440,v2_tfidf_441,v2_tfidf_442,v2_tfidf_443,v2_tfidf_444,v2_tfidf_445,v2_tfidf_446,v2_tfidf_447,v2_tfidf_448,v2_tfidf_449,v2_len
0,0,"go until jurong point, crazy.. available only ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.490967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120044
1,0,ok lar... joking wif u oni...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.660645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029736
2,1,free entry in 2 a wkly comp to win fa cup fina...,0.531915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168502
3,0,u dun say so early hor... u c already then say...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051762
4,0,"nah i don't think he goes to usf, he lives aro...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064978


In [673]:
X = train.drop(['v1','v2'], axis=1)
y = train['v1']

In [674]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, shuffle=True)
print ('Training shape: %s, test shape: %s' % (X_train.shape, X_test.shape))

Training shape: (5014, 452), test shape: (558, 452)


In [675]:
forest = RandomForestClassifier(n_estimators=1000)
forest.fit(X_train, y_train)
predictions = forest.predict(X_test)

In [676]:
forest.score(X_test, y_test)

0.9910394265232975

In [80]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [81]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [1, 5, 10],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [1.0],
    'n_estimators': [100,500, 900]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
best_grid = grid_search.best_estimator_
