# Library

In [1]:
import pandas as pd
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import numpy as np

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

pd.set_option('max_columns', 1000)
pd.set_option('max_rows', 1000)

english_stemmer = SnowballStemmer("english", ignore_stopwords=True)
en_stops = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data import

In [2]:
from google.colab import files

uploaded = files.upload()
df_data = pd.read_excel(uploaded['Comment Spam.xls'])
print(df_data.shape[0])
df_data.head()

Saving Comment Spam.xls to Comment Spam.xls
1300


Unnamed: 0,No,Comment,Class
0,1,this song is racist,0
1,2,and how many subscribers compared to her over ...,1
2,3,HI! CHECK OUT OUR AWESOME COVERS! AND SAY WHAT...,1
3,4,well done shakira,0
4,5,:D subscribe to me for daily vines,1


In [3]:
df_data_processed = df_data[['Comment', 'Class']]
df_data_processed.tail()

Unnamed: 0,Comment,Class
1295,Awsome<br />﻿,0
1296,https://www.tsu.co/KodysMan plz ^^﻿,1
1297,Sign up for free on TSU and start making money...,1
1298,MEGAN FOX AND EMINEM TOGETHER IN A VIDEO DOES...,0
1299,Great.This is a song﻿,0


In [4]:
df_data_processed['Class'].value_counts(dropna=False) # the data is already balanced

1    669
0    631
Name: Class, dtype: int64

# Preprocessing

In [5]:
'''
Function to clean the data, which includes:
1. Lowercasing
2. Punctuation removal
3. Digit removal
''' 
def cleaning(text):
    # lowercase
    normal = text.lower()
    # remove punctuation
    normal = re.sub(r'[^\w\s]', '', normal) 
    # remove numbers
    normal = re.sub(r'\d+', ' ', normal)
    return normal


'''
Function to normalize the form of the token (lemmatization)
and to remove stopwords
'''
def normalize_and_remove_stopwords(text):
    tokens = nlp(text)
    token_new = []
    
    for k in tokens:
        if k.lemma_ not in en_stops:
            token_new.append(k.lemma_)

    str_clean = ' '.join(token_new)
    return str_clean


'''
Function to do stemming, in this case, we use lemmatization
instead of stemming
'''
def stemming(text):
    token = nltk.word_tokenize(text)
    stem_sentence = []
    for k in token:
        stem_word = english_stemmer.stem(k)
        stem_sentence.append(stem_word)

    stem_sentence_str = ' '.join(stem_sentence)
    return stem_sentence_str

'''
Data preprocessing function, which includes:
1. Text cleaning,
2. Text normalization, and
3. Stopword removal
'''
def preprocessing(list_text):
    text_clean = []
    for t in list_text:
        normal = cleaning(t)
#         normal = stemming(normal)
        normal = normalize_and_remove_stopwords(normal)
        text_clean.append(normal)
    return text_clean

In [6]:
raw_text = df_data_processed['Comment']

clean_text = preprocessing(raw_text) # do the preprocessing
clean_text[:3]

['song racist',
 'many subscriber compare -PRON- million',
 'hi check -PRON- awesome cover say -PRON- think']

In [7]:
# save the clean comments to csv, so we can use it later on
df_clean_comment = pd.DataFrame(clean_text, columns=['comment'])
df_clean_comment.to_csv('df_clean_comment_no_stemming.csv', index=False, encoding='utf-8')

# Feature extraction

In [8]:
class_ = df_data_processed['Class'].tolist() # target variable
clean_comment = df_clean_comment['comment'] # clean text to do prediction
comment = df_data_processed['Comment'] # raw text to do prediction

In [9]:
'''
Function to extract TF (1-gram) features
'''
def tf_extraction(text, ngram_start, ngram_end):
    ngram = CountVectorizer(ngram_range=(ngram_start, ngram_end))
    ngram_matrix = ngram.fit_transform(np.array(text)).todense()
    feature_names = ngram.get_feature_names()
    return ngram_matrix, feature_names

# unigram features
ngram_feat, feature_names = tf_extraction(clean_comment, 1, 1)
print(ngram_feat[:3])
print(feature_names[:3])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['aa', 'aaaaaaa', 'aaacwk']


In [10]:
'''
Function to extract orthography and url occurence features
'''
def orthography_and_url_extraction(text):
    all_orto_feat = []
    for t in text:
        capital_count = sum(1 for c in t if c.isupper())
        exclamation_count = sum(1 for c in t if c == "!")
        word_len = len(nltk.word_tokenize(t))
        char_len = len(t)
        url = 1 if 'http' in t.lower() else 0
        orto_feat = [capital_count, exclamation_count, word_len, char_len, url]
        all_orto_feat.append(orto_feat)
    return all_orto_feat

orto_feat = orthography_and_url_extraction(comment)
orto_feat[:3]

[[0, 0, 4, 19, 0], [0, 0, 10, 55, 0], [44, 3, 14, 57, 0]]

In [11]:
'''
Function to extract TF-IDF (1-gram) features
'''
def tf_idf_extraction(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(np.array(text)).todense()
    return tfidf_matrix

# tf-idf features
tfidf_feat = tf_idf_extraction(clean_comment)
print(tfidf_feat[:3])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Modelling and Evaluation

In [12]:
# list of features combinations
feat_list = [ngram_feat, tfidf_feat, np.hstack((ngram_feat, orto_feat)), np.hstack((tfidf_feat, orto_feat))]
feat_name = ['tf', 'tf-idf', 'tf and orthography', 'tf-idf and orthography']

# list of model to do prediction
mnb = MultinomialNB()
rf = RandomForestClassifier(random_state=0)
gb = GradientBoostingClassifier(random_state=0)
ab = AdaBoostClassifier(random_state=0)
knn = KNeighborsClassifier()
lr = LogisticRegression(random_state=0)
mlp = MLPClassifier(random_state=0)
dt = DecisionTreeClassifier(random_state=0)
svm = SVC(random_state=0)
model_list = [mnb, rf, gb, ab, knn, lr, mlp, dt, svm]
model_name = ['Multinomial Naive Bayes', 'Random Forest', 'Gradient Boost', 'Ada Boost',
              'kNN', 'Logistic Regression', 'Multilayer Perceptron', 'Decision Tree', 'SVM']

# build the model and evaluate the performance of it for each feature combination
df_recap = pd.DataFrame()
for f, fn in zip(feat_list, feat_name):
    print("Features : ", fn)
    X = f
    y = class_
    for m, n in zip(model_list, model_name):
        scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
        scores = cross_validate(m, X, y, cv=10, scoring=scoring)
        acc = np.mean(scores['test_accuracy'])
        f1 = np.mean(scores['test_f1_macro'])
        precision = np.mean(scores['test_precision_macro'])
        recall = np.mean(scores['test_recall_macro'])
        print("Classifier : ", n)
        print("Accuracy:", acc)
        print("F1-Score:", f1)
        print("Precision:", precision)
        print("Recall:", recall)
        df_recap = df_recap.append({
            'features': fn,
            'classifier': n,
            'accuracy': acc,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        }, ignore_index=True)
        print('='*90)

Features :  tf
Classifier :  Multinomial Naive Bayes
Accuracy: 0.8623076923076922
F1-Score: 0.8621813000472567
Precision: 0.8647611623735537
Recall: 0.8632148880866233
Classifier :  Random Forest
Accuracy: 0.9084615384615384
F1-Score: 0.9081245627538997
Precision: 0.9164923796246713
Recall: 0.9102506945790528
Classifier :  Gradient Boost
Accuracy: 0.8876923076923078
F1-Score: 0.8865498560771108
Precision: 0.9043828932902622
Recall: 0.8904218087053908
Classifier :  Ada Boost
Accuracy: 0.8923076923076924
F1-Score: 0.892008549039881
Precision: 0.8984433866538515
Recall: 0.8937983580474252
Classifier :  kNN
Accuracy: 0.8246153846153847
F1-Score: 0.82161816177367
Precision: 0.8551804498782601
Recall: 0.8288169541900885
Classifier :  Logistic Regression
Accuracy: 0.8969230769230769
F1-Score: 0.8964222845677121
Precision: 0.9064962273418417
Recall: 0.8987136826689065
Classifier :  Multilayer Perceptron
Accuracy: 0.8892307692307693
F1-Score: 0.8887072253091768
Precision: 0.898146726469748
Reca

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Classifier :  Logistic Regression
Accuracy: 0.9330769230769231
F1-Score: 0.9330304757050614
Precision: 0.9346374452031634
Recall: 0.9337048523615687
Classifier :  Multilayer Perceptron
Accuracy: 0.9276923076923078
F1-Score: 0.9275514064465291
Precision: 0.9288282542552212
Recall: 0.9276270332590295
Classifier :  Decision Tree
Accuracy: 0.9346153846153846
F1-Score: 0.934540935217022
Precision: 0.9358080958772025
Recall: 0.9349141905461866
Classifier :  SVM
Accuracy: 0.6369230769230768
F1-Score: 0.6339319584415637
Precision: 0.6431254278548688
Recall: 0.638510992790377
Features :  tf-idf and orthography
Classifier :  Multinomial Naive Bayes
Accuracy: 0.7569230769230769
F1-Score: 0.73918501830004
Precision: 0.8188500215410486
Recall: 0.7501730558516939
Classifier :  Random Forest
Accuracy: 0.9461538461538461
F1-Score: 0.946109126398035
Precision: 0.9473720961290898
Recall: 0.9466509444121384
Classifier :  Gradient Boost
Accuracy: 0.9315384615384616
F1-Score: 0.9314809941666405
Precision: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Classifier :  Logistic Regression
Accuracy: 0.9130769230769232
F1-Score: 0.9130209117355401
Precision: 0.9154117546013627
Recall: 0.91406495660227




Classifier :  Multilayer Perceptron
Accuracy: 0.9200000000000002
F1-Score: 0.9198941826114568
Precision: 0.9214874805256645
Recall: 0.9203549689323942
Classifier :  Decision Tree
Accuracy: 0.9353846153846155
F1-Score: 0.93529622073322
Precision: 0.9371884999854736
Recall: 0.9356717663037625
Classifier :  SVM
Accuracy: 0.6338461538461537
F1-Score: 0.6306111924089018
Precision: 0.6402614636714334
Recall: 0.6354785360265771


In [13]:
# the recap of scenarios
df_recap.sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy,classifier,f1_score,features,precision,recall
28,0.946154,Random Forest,0.946109,tf-idf and orthography,0.947372,0.946651
19,0.941538,Random Forest,0.941486,tf and orthography,0.942944,0.942042
30,0.935385,Ada Boost,0.935335,tf-idf and orthography,0.936822,0.935931
34,0.935385,Decision Tree,0.935296,tf-idf and orthography,0.937188,0.935672
21,0.935385,Ada Boost,0.935308,tf and orthography,0.93756,0.935956
25,0.934615,Decision Tree,0.934541,tf and orthography,0.935808,0.934914
23,0.933077,Logistic Regression,0.93303,tf and orthography,0.934637,0.933705
29,0.931538,Gradient Boost,0.931481,tf-idf and orthography,0.934698,0.932686
20,0.930769,Gradient Boost,0.930695,tf and orthography,0.934466,0.931987
24,0.927692,Multilayer Perceptron,0.927551,tf and orthography,0.928828,0.927627


In [14]:
df_recap.to_csv('df_recap.csv', index=False, encoding='utf-8')