In [1]:
# import neccessary libraries that may be useful

import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# import language processing functions

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gordo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gordo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Data importing

all_data = []
csv_dir = './YouTube-Spam-Collection-v1/'
csv_files = ['Youtube01-Psy.csv','Youtube02-KatyPerry.csv','Youtube03-LMFAO.csv','Youtube04-Eminem.csv','Youtube05-Shakira.csv']

for file in csv_files:
    data = pd.read_csv(csv_dir + file)
    all_data.append(data)
all_data = pd.concat(all_data)

# Sanity checkpoint
all_data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [4]:
# Data imbalance check (no issue here)
all_data['CLASS'].value_counts()

1    1005
0     951
Name: CLASS, dtype: int64

In [5]:
# Data preprocessing / cleaning

# Only keep Comment content and Class label
all_data.drop(['COMMENT_ID','AUTHOR','DATE'], axis=1, inplace=True, errors='ignore')
all_data.head()

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [6]:
# Data preprocessing / cleaning
def process_content(comment):
    edited_comment = " ".join(re.findall("[A-Za-z]+", comment.lower()))
    edited_comment = edited_comment.replace('\ufeff', '')
    edited_comment = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)",'http', edited_comment)
    return edited_comment

In [7]:
all_data['PROCESSED CONTENT'] = all_data['CONTENT'].apply(process_content)
all_data.head()

Unnamed: 0,CONTENT,CLASS,PROCESSED CONTENT
0,"Huh, anyway check out this you[tube] channel: ...",1,huh anyway check out this you tube channel kob...
1,Hey guys check out my new channel and our firs...,1,hey guys check out my new channel and our firs...
2,just for test I have to say murdev.com,1,just for test i have to say murdev com
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch v vtarggvgtwq check this out


In [8]:
# Train test split
x_train, x_test, y_train, y_test = train_test_split(all_data['PROCESSED CONTENT'],all_data['CLASS'], test_size=0.2, random_state=69)

# Sanity checkpoint
print(x_train)
print(y_train)

# Print the shape train and test sets
print("x_train.shape = " + str(x_train.shape))
print("x_test.shape = " + str(x_test.shape))

413                        me and my big sister like you
187    who else would give katy perry a good old migh...
39     its a good song and i like her video clip beca...
294                  i ll subscribe to you you look nice
428                            watch this with sound off
                             ...                        
89     http www aaas org tech i vote view vote sheldo...
40                                           watching in
269    when i hear katy singing this i cry the song h...
89     check out the new hot video by dante b called ...
378                                            subscribe
Name: PROCESSED CONTENT, Length: 1564, dtype: object
413    0
187    0
39     0
294    1
428    0
      ..
89     1
40     0
269    0
89     1
378    1
Name: CLASS, Length: 1564, dtype: int64
x_train.shape = (1564,)
x_test.shape = (392,)


In [9]:
# Feature extraction using Counter Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

In [10]:
# Term frequency - inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer

tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)
x_test_tfidf = tranformer.transform(x_test_counts)

In [11]:
# Create and train Logistic Regression model
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(x_train_tfidf, y_train)

accuracy = model_LR.score(x_test_tfidf, y_test)
print(accuracy)

0.9540816326530612


In [12]:
# Create and train Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()
model_RFC.fit(x_train_tfidf,y_train)

accuracy = model_RFC.score(x_test_tfidf, y_test)
print(accuracy)

0.9591836734693877


In [13]:
# Create and train Multi-Layer Perceptron model
from sklearn.neural_network import MLPClassifier

model_NN = MLPClassifier(hidden_layer_sizes=(20,40,40,20), activation='relu', solver='adam', max_iter=10000)
model_NN.fit(x_train_tfidf, y_train)

accuracy = model_NN.score(x_test_tfidf, y_test)
print(accuracy)

0.9387755102040817


In [14]:
# Create and train XGBClassifier
from xgboost import XGBClassifier

model_XGB = XGBClassifier(objective = 'binary:logistic', max_depth = 4, alpha = 10, learning_rate = 1.0, n_estimators = 100)
model_XGB.fit(x_train_tfidf, y_train)

accuracy = model_XGB.score(x_test_tfidf, y_test)
print(accuracy)



0.923469387755102


In [15]:
# To improve, can use Grid Search to find best parameters

# Try Grid Search with Random Forest Classifier

from sklearn.model_selection import GridSearchCV

parameters = {
                'n_estimators': [80, 100, 120],
                'bootstrap': [True, False],
                'criterion' : ['gini', 'entropy']
             }

model_RFC_GSCV = GridSearchCV(RandomForestClassifier(), parameters)
model_RFC_GSCV.fit(x_train_tfidf, y_train)

print(model_RFC_GSCV.best_params_)

accuracy = model_RFC_GSCV.score(x_test_tfidf, y_test)
print(accuracy)

{'bootstrap': False, 'criterion': 'gini', 'n_estimators': 120}
0.9642857142857143


In [16]:
# Also, let's try Naive Bayes method.

stopwords_english = stopwords.words('english') 
stemmer = PorterStemmer() 

In [17]:
def count_tweets(result, comments, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of comments
        ys: a list corresponding to the class of each comment (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    ### START CODE HERE ###
    for y, comment in zip(ys, comments):
        comment_tokens = word_tokenize(process_content(comment))
        
        comment_stem = []

        for word in comment_tokens: # Go through every word in your tokens list
            if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
                
                stem_word = stemmer.stem(word)  # stemming word
                comment_stem.append(stem_word)  # append to the list
                
        for word in comment_stem:
            # define the key, which is the word and label tuple
            pair = (word, y)
            
            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    ### END CODE HERE ###

    return result

In [18]:
# Build the freqs dictionary for later uses

freqs = count_tweets({}, x_train, y_train)

In [19]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the comments (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    loglikelihood = {}
    logprior = 0

    ### START CODE HERE ###

    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:

            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs.get(pair, 1)

        # else, the label is negative
        else:

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs.get(pair, 1)
    
    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents
    D_pos = sum(train_y)

    # Calculate D_neg, the number of negative documents
    D_neg = D - D_pos

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)
    
    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word, 1.0), 0)
        freq_neg = freqs.get((word, 0.0), 0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    ### END CODE HERE ###

    return logprior, loglikelihood

In [20]:
logprior, loglikelihood = train_naive_bayes(freqs, x_train, y_train)

In [21]:
def naive_bayes_predict(comment, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the comment (if found in the dictionary) + logprior (a number)

    '''
    ### START CODE HERE ###
    # process the tweet to get a list of words
    word_l = word_tokenize(process_content(comment))

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    ### END CODE HERE ###

    return p

In [22]:
def test_naive_bayes(x_test, y_test, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of comments
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of comments classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    ### START CODE HERE ###
    y_hats = []
    for comment in x_test:
        # if the prediction is > 0
        if naive_bayes_predict(comment, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.sum(np.abs(y_hats - y_test)) / len(y_test)

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    ### END CODE HERE ###

    return accuracy

In [23]:
print(test_naive_bayes(x_test, y_test, logprior, loglikelihood))

0.875


In [24]:
# Error analysis of the above models

print('=== Logistic Regression Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_LR.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))
        
print('\n\n=== Random Forest Classifier Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_RFC.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))
        
print('\n\n=== Multi-Layer Perceptron Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_NN.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))

print('\n\n=== Naive Bayes Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, y in zip(x_test, y_test):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))

=== Logistic Regression Model Error Analysis ===

Truth Predicted Tweet
1	0.00	b'can i get views and subscribers for no reason'
0	1.00	b'since when has katy perry had her own youtube channel'
1	0.00	b'hi d we re twin melody year old twins we did some covers we did a cover of birthday by katy perry please just take second and watch it thanks merci gracias danke obrigado grazie lt xx have a nice day d'
0	1.00	b'this comment is wrong'
1	0.00	b'like this comment if you still jam out to this song after years'
1	0.00	b'o peoples of the earth i have seen how you perform every form of evil at your leisure you cease not from reveling in that which i hate behold you murder the innocent day and night and plot evil against your neighbor you stand up for the rights of those who commit abomination and clap your hands as wickedness is celebrated openly in the streets o most perverse and abominable generation shall i not repay hear the word of the lord trumpetcallofgodonline co m'
1	0.00	b'pleasssssss

0	1.00	b'millioon dislikesssssssssssssssssssssssssssssssss'
0	1.00	b'views'
0	1.00	b'why so many disliked'
0	1.00	b''
1	0.00	b'check out daneja good girl'
0	1.00	b'gooooood'
1	0.00	b'if u love rihanna subscribe me'
0	1.00	b'a href http www youtube com watch v kq zr kcpj amp t m s a best part'
1	0.00	b'top three shakira songs my choice br br waka waka it s time for africa br br can t remember to forget you br br empire br br like this comment if u like shakira'
0	1.00	b'th most viewed video i guess'
1	0.00	b'i like this comment and do not kill p'
0	1.00	b'views near'
0	1.00	b'i am now going to voyage to the first comment tell my family i loved them'
1	0.00	b'if i reach subscribers i will tazz my self and my friend'
0	1.00	b'see it all human folly right'
1	0.00	b'could spanish people understand this br br any way s i how you doing subscribe to me i brake things br br'
0	1.00	b'roaaaaarrrrrr'
0	1.00	b'this song is about rape and cheating br br br br br br br br br br basically'
0	1.00	b's