In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
data = pd.read_csv('Youtube01-Psy.csv')
print(data.sample(5))

                                COMMENT_ID                   AUTHOR  \
137    z122f10gdvathpo2222pshcxxpjliddjv04             BmwManDexter   
94   z132v5iajkjuhfkyn04cg3mxclnnzj0yxmo0k               richardex8   
4      z13fwbwp1oujthgqj04chlngpvzmtt3r3dw                   GsMega   
169      z13dtz1zzkagdromt230g5cqfsejstr3p  Digital Media Butterfly   
27     z13kszcinpnvc34v2234fnpxkpmlw3nhc04               Kyle Jaber   

                    DATE                                            CONTENT  \
137  2014-11-05T20:33:15  check out "starlitnightsky" channel to see epi...   
94   2014-11-03T23:03:03  Why does this video have so many views? Becaus...   
4    2013-11-10T16:05:38            watch?v=vtaRGgvGtWQ   Check this out .﻿   
169  2014-11-06T18:55:14  The most watched video on YouTube is Psy’s “Ga...   
27   2014-01-19T00:21:29            Check me out! I'm kyle. I rap so yeah ﻿   

     CLASS  
137      1  
94       0  
4        1  
169      0  
27       1  


In [2]:
#We only need the content and class column from the dataset for the rest of the task. So let’s select both the columns and move further:
data = data[['CONTENT', 'CLASS']]
print(data.sample(5))

                                               CONTENT  CLASS
301  http://hackfbaccountlive.com/?ref=4436607  psy...      1
57   Subscribe and like to me for more how to video...      1
322            I think he was drunk during this :) x)﻿      0
134                              ❤️ ❤️ ❤️ ❤️ ❤️❤️❤️❤️﻿      0
27             Check me out! I'm kyle. I rap so yeah ﻿      1


In [3]:
#The class column contains values 0 and 1. 0 indicates not spam, and 1 indicates spam. So to make it look better, I will use spam and not spam labels instead of 1 and 0:
data['CLASS'] = data['CLASS'].map({0: 'Spam',
                                    1: 'Spam Comment'})
print(data.sample(5))

                                               CONTENT         CLASS
86                  Suscribe My Channel Please XD lol﻿  Spam Comment
225  You think you're smart?        Headbutt your f...          Spam
271  For all of the little kidz out there there is ...          Spam
25   marketglory . com/strategygame/andrijamatf ear...  Spam Comment
142  pls http://www10.vakinha.com.br/VaquinhaE.aspx...  Spam Comment


In [4]:
#Now let’s move further by training a classification Machine Learning model to classify spam and not spam comments. As this problem is a problem of binary classification, I will use the Bernoulli Naive Bayes algorithm to train the model:
x = np.array(data['CONTENT'])
y = np.array(data['CLASS'])
cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
model = BernoulliNB()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

0.9857142857142858


In [5]:
#Now let’s test the model by giving spam and not spam comments as input:
sample = 'Check this out: https://thecleverprogrammer.com/'
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Spam Comment']


In [6]:
sample = 'Lack of information!'
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Spam']


In [None]:
#Summary
#Spam comments detection means classifying comments as spam or not spam. Spam comments on social media platforms are the type of comments posted to redirect the user to another social media account, website or any piece of content.