In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
sns.set_context('notebook') 
sns.set_style('ticks')
import nltk

In [2]:
data= pd.read_csv('ytube_spam_trainset.csv')


In [3]:
#WE FIRST TOKENIZE THE CONTENT INTO WORDS FOR BAG OF WORDS REPRESENTATION

from nltk.tokenize import word_tokenize

#Now remove stop words 
from nltk.corpus import stopwords


#convert words to root words -stemming
from nltk.stem.porter import PorterStemmer

#create a function to apply the above process on all of the dataframe

def process_text(text):
    tokenized = word_tokenize(text)
    tokenized_no_punctuation=[word.lower() for word in tokenized if word.isalpha()]
    tokenized_no_stopwords=[word for word in tokenized_no_punctuation if word not in stopwords.words('english')]
    tokens = [PorterStemmer().stem(word) for word in tokenized_no_stopwords]
    return tokens

data['tokens']=data['CONTENT'].apply(process_text)



In [4]:
from sklearn.feature_extraction.text import CountVectorizer
#Step 1
bow_transformer= CountVectorizer(analyzer=process_text, stop_words='english',min_df=2,max_df=0.01,max_features=1000).fit(data['CONTENT'])
print(len(bow_transformer.vocabulary_))

len(bow_transformer.get_feature_names())

#Now transform bow_transformer into a sparse matrix by applying .transform method
messages_bow= bow_transformer.transform(data['CONTENT'])
print(messages_bow.shape)

#convert to dataframe

import pandas as pd
X=pd.DataFrame(messages_bow.toarray(), columns=bow_transformer.get_feature_names())
#Step 2

#Need to remove top 1% words before this step
from sklearn.feature_extraction.text import TfidfTransformer
tfid_transformer= TfidfTransformer().fit_transform(X)
#messages_tfid= tfid_transformer.transform(X)


357
(750, 357)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
Y= data.iloc[:,4].values

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

parameters= {'max_depth': np.arange(1,50,1)}
dtc= RandomForestClassifier(criterion='entropy')
dtc_clv= GridSearchCV(dtc, param_grid= parameters, cv=3)
print(dtc_clv)
dtc_clv.fit(X_train,y_train)
best_depth= dtc_clv.best_estimator_

print(best_depth)


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=33, max_features='

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
Y= data.iloc[:,4].values
X= data.iloc[:,3]
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

#Build a pipeline
pipeline= Pipeline([
        ('bow',CountVectorizer(analyzer= process_text, stop_words='english',min_df=2,max_features=1000)),
        ('tfid',TfidfTransformer()),
        ('classifier',RandomForestClassifier(criterion='entropy',max_depth=27))])
    
pipeline.fit(X_train,y_train)    

prediction_NB= pipeline.predict(X_test)

print(classification_report(y_test,prediction_NB))

print(confusion_matrix(y_test,prediction_NB))


             precision    recall  f1-score   support

          0       0.96      0.99      0.98       122
          1       0.96      0.82      0.88        28

avg / total       0.96      0.96      0.96       150

[[121   1]
 [  5  23]]


In [8]:
test_data= pd.read_csv('ytube_spam_testset.csv')

y_test = test_data.iloc[:,4].values
X_test= test_data.iloc[:,3]

pipeline= Pipeline([
        ('bow',CountVectorizer(analyzer= process_text, stop_words='english',min_df=2,max_features=1000)),
        ('tfid',TfidfTransformer()),
        ('classifier',RandomForestClassifier(max_depth=27))])
    
pipeline.fit(X_test,y_test)    

y_pred_final= pipeline.predict(X_test)

print(classification_report(y_test,y_pred_final))

print(confusion_matrix(y_test,y_pred_final))




             precision    recall  f1-score   support

          0       0.99      1.00      0.99       350
          1       0.99      0.95      0.97        75

avg / total       0.99      0.99      0.99       425

[[349   1]
 [  4  71]]
