In [1]:
import os, glob

import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

Q2. Download a YouTube spam collection dataset available from this link (Links to an external site.).

This is a public set of comments collected for spam research. It has five datasets composed by 1,956 real messages extracted from five videos. These 5 videos are popular pop songs that were among the 10 most viewed on the collection period.
All the five dataset has the following attributes:

COMMENT_ID: Unique id representing the comment  
AUTHOR: Author id,  
DATE: Date the comment is posted,  
CONTENT: The comment,  
TAG: For spam 1, otherwise 0

For this exercise use any 4 of these 5 datasets to build a spam filter with Naive Bayes approach and use that filter to check the accuracy on the remaining dataset. Make sure to report the details of your training and the model. [6 points]

In [14]:
def textcleaner(row):

    row = row.lower()     
    row = re.sub("[^\w\s']", "", row) # remove punctuations
    row = row.strip(" ")
    return row

In [15]:
train_df = pd.DataFrame()

#Read data
files = glob.glob("./YouTube-Spam-Collection-v1/*")
files = sorted(files)[:-1]
for f in files:
    temp = pd.read_csv(f)
    print("(file, shape):", f, temp.shape)
    train_df = train_df.append(temp, ignore_index=True)

print("the shape of training files", train_df.shape)

#Picking necessary columns
train_df = train_df[['CONTENT', 'CLASS']]
#train_df['CONTENT'] = train_df['CONTENT'].apply(textcleaner)
train_df.head()

(file, shape): ./YouTube-Spam-Collection-v1/Youtube01-Psy.csv (350, 5)
(file, shape): ./YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv (350, 5)
(file, shape): ./YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv (438, 5)
(file, shape): ./YouTube-Spam-Collection-v1/Youtube04-Eminem.csv (448, 5)
the shape of training files (1586, 5)


Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [16]:
test_df = pd.read_csv('./YouTube-Spam-Collection-v1/Youtube05-Shakira.csv')
print("the shape of the test dataset", test_df.shape)
test_df = test_df[['CONTENT', 'CLASS']]
#test_df['CONTENT'] = test_df['CONTENT'].apply(textcleaner)
test_df.head()

the shape of the test dataset (370, 5)


Unnamed: 0,CONTENT,CLASS
0,Nice song﻿,0
1,I love song ﻿,0
2,I love song ﻿,0
3,"860,000,000 lets make it first female to reach...",0
4,shakira is best for worldcup﻿,0


In [35]:
#Created a data pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    #('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [36]:
#Fitting the training data
pipeline.fit(train_df['CONTENT'],train_df['CLASS'])

Pipeline(steps=[('bow', CountVectorizer()), ('classifier', MultinomialNB())])

In [37]:
#Predicting on the testing data
predictions = pipeline.predict(test_df['CONTENT'])

#Printing the classification report and accuracy score
print(classification_report(predictions,test_df['CLASS']))
print("accuracy:", round(accuracy_score(predictions,test_df['CLASS']), 4))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       185
           1       0.91      0.86      0.89       185

    accuracy                           0.89       370
   macro avg       0.89      0.89      0.89       370
weighted avg       0.89      0.89      0.89       370

accuracy: 0.8892
