#### Another application: Detecting spam comments on YouTube videos

In [68]:
import html
import math
import re

import pandas as pd

In [69]:
all_files = [
    "Youtube01-Psy.csv",
    "Youtube02-KatyPerry.csv",
    "Youtube03-LMFAO.csv",
    "Youtube04-Eminem.csv",
]

# Read each per-video CSV (first row is the header) and collect the frames.
list_ = [
    pd.read_csv(("datasets/youtube-spam/" + file), index_col=None, header=0)
    for file in all_files
]

# Stack the frames into a single table with a fresh 0..n-1 index.
df = pd.concat(list_, ignore_index=True)

#### 1. Pre Processing

Drop the first 3 columns

In [70]:
# Drop the metadata columns that are not used by the model.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns=["COMMENT_ID", "AUTHOR", "DATE"], errors="ignore")

# Peek at one raw comment that still contains HTML markup.
df["CONTENT"][700]

'<a href="http://www.youtube.com/watch?v=KQ6zr6kCPj8&amp;t=2m19s">2:19</a> best part\ufeff'

Clean up: tags, links, parsing errors,...

In [71]:
# Decode HTML entities (e.g. "&amp;" -> "&"), then remove the stray "\ufeff"
# characters that trail some comments.
df["CONTENT"] = (
    df["CONTENT"]
    .apply(html.unescape)
    .str.replace("\ufeff", "")
)

df["CONTENT"][700]

'<a href="http://www.youtube.com/watch?v=KQ6zr6kCPj8&t=2m19s">2:19</a> best part'

Replace all links with the placeholder word 'htmllink'

In [72]:
# The pattern is a regular expression, so request regex matching explicitly:
# pandas >= 2.0 treats the pattern as a literal string unless regex=True.
# ".+" is greedy, so everything from the first "<a" to the last ">" is replaced.
df["CONTENT"] = df["CONTENT"].str.replace(r"(<a.+>)", "htmllink", regex=True)

  df["CONTENT"]=df["CONTENT"].str.replace("(<a.+>)","htmllink")


Remove other tags

In [73]:
# Comments that still contain something that looks like an HTML tag.
df.loc[df["CONTENT"].str.contains("<.+>"), "CONTENT"]

381                      <script>document.write('htmllink
702     Hey guys, I'm a human.<br /><br /><br />But I ...
708                                          Awsome<br />
728                             Super awesome video<br />
730     This Will Always Be My Favorite Song<br />But ...
                              ...                        
1406                    Hello. İ am from Azerbaijan<br />
1409                EMINEM<3 <br />the best rapper ever<3
1499    If you are a person that loves real music you ...
1546               Love your songs<br />Supper cool<br />
1566     Really good song .<br />you know love song song.
Name: CONTENT, Length: 65, dtype: object

In [74]:
# Remove the remaining tags. regex=True is required on pandas >= 2.0, where the
# pattern would otherwise be matched as the literal text "<.+>".
df["CONTENT"] = df["CONTENT"].str.replace(r"<.+>", "", regex=True)

  df["CONTENT"]=df["CONTENT"].str.replace("<.+>","")


In [75]:
# Drop apostrophes so contractions stay a single token (e.g. "don't" -> "dont").
df["CONTENT"] = df["CONTENT"].str.replace("'", "")

In [76]:
# Lowercase every comment so that the word matching done later is case-insensitive.
df["CONTENT"]=df["CONTENT"].str.lower()

In [77]:
# Rows that still mention a ".com" address or a "watch?" video path.
# Raw string: "\." and "\?" are invalid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
df[df["CONTENT"].str.contains(r"\.com|watch\?")]

Unnamed: 0,CONTENT,CLASS
2,just for test i have to say murdev.com,1
4,watch?v=vtarggvgtwq check this out .,1
12,https://twitter.com/gbphotographygb,1
14,please like :d https://premium.easypromosapp.c...,1
17,http://www.ebay.com/itm/171183229277?sspagenam...,1
...,...,...
1448,everyone come and check out the new gta 5 game...,1
1476,check out these lyrics /watch?v=yuttx04oyqq,1
1521,hello to everyone! please check out my video: ...,1
1522,/watch?v=aimbwbfqbzg watch and subscrible,1


A few rows still contain links. Let's replace those with "htmllink" as well, and then replace every non-alphanumeric character with a space.

In [78]:
# Example comment with a bare link, before the replacement below.
df.loc[1573, "CONTENT"]

'subscribe to my channel  /watch?v=nxk32i0hkds'

In [79]:
# Replace any whitespace-delimited token containing ".com" or "watch?" with the
# placeholder. regex=True is required on pandas >= 2.0 (literal match otherwise).
df["CONTENT"] = df["CONTENT"].str.replace(
    r"\S*\.com\S*|\S*watch\?\S*", "htmllink", regex=True
)

  df["CONTENT"]=df["CONTENT"].str.replace(r"\S*\.com\S*|\S*watch\?\S*","htmllink")


In [80]:
# Turn every non-word character into a space. Raw string avoids the invalid
# "\W" escape, and regex=True keeps this a pattern match on pandas >= 2.0.
df["CONTENT"] = df["CONTENT"].str.replace(r"\W", " ", regex=True)

  df["CONTENT"]=df["CONTENT"].str.replace("\W"," ")


In [81]:
# Same example comment after cleaning (row 14 is another one to try).
df.loc[1573, "CONTENT"]

'subscribe to my channel  htmllink'

#### 2. Create model

In [82]:
# Relative frequency of each CLASS label (later cells treat 1 as spam and 0 as ham).
df["CLASS"].value_counts(normalize=True)

1    0.52396
0    0.47604
Name: CLASS, dtype: float64

In [83]:
# Bag_of_words
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and build the per-comment word-count matrix in one step.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed in a later cell, so the vocabulary also contains
# words that occur only in test rows — consider fitting on the training rows only.
cv = CountVectorizer()
texts_bow = cv.fit_transform(df['CONTENT'])

In [85]:
# Keep 80% of the rows for training; the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    texts_bow,
    df['CLASS'],
    train_size=0.8,
    random_state=0,
)

In [86]:
# Multinomial Naive Bayes fitted on the bag-of-words training counts.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X=X_train, y=y_train)

### Evaluate

In [87]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def evaluate(y_pred, y_test):
    """Print the classification report, confusion matrix and accuracy.

    Args:
        y_pred: predicted labels.
        y_test: true labels (note the order: predictions come first).

    Returns:
        None. The three metrics are printed to stdout.
    """
    report = classification_report(y_test, y_pred)
    print(report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix: \n", matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ', accuracy)

In [88]:
# Score the held-out rows and report the metrics.
y_pred = classifier.predict(X_test)

evaluate(y_pred=y_pred, y_test=y_test)

              precision    recall  f1-score   support

           0       0.88      0.91      0.89       140
           1       0.93      0.90      0.91       178

    accuracy                           0.91       318
   macro avg       0.90      0.91      0.90       318
weighted avg       0.91      0.91      0.91       318

Confusion matrix: 
 [[127  13]
 [ 17 161]]
Accuracy:  0.9056603773584906


##### Naive Bayes from scratch (additional part, without the sklearn library)

In [51]:
# Flat list of every whitespace-separated token across all cleaned comments
# (duplicates are kept on purpose; they are removed in the next step).
vocab = [word for comment in df["CONTENT"] for word in comment.split()]

In [None]:
# Unique words, sorted so the vocabulary (and the word columns created from it)
# has the same order on every kernel; a bare set() iterates in an order that
# changes between Python processes.
vocabulary = sorted(set(vocab))
len(vocabulary)

In [None]:
# Create one zero-initialised count column per vocabulary word.
# All columns are built in a single concat: inserting thousands of columns one
# at a time fragments the frame and triggers pandas' PerformanceWarning.
# Dropping any existing word columns first keeps the cell re-runnable.
df = pd.concat(
    [
        df.drop(columns=vocabulary, errors="ignore"),
        pd.DataFrame(0, index=df.index, columns=vocabulary),
    ],
    axis=1,
)

In [None]:
# Preview the frame after the per-word count columns were added.
df.head()

In [None]:
# Count how often each word occurs in each comment.
# df.at[row, column] writes directly into df. The previous chained form
# df[word][row] += 1 may write to a temporary copy and is a silent no-op
# under pandas copy-on-write.
for index, value in enumerate(df["CONTENT"]):
    for word in value.split():
        df.at[index, word] += 1

In [None]:
# Preview the frame after the word counts were filled in.
df.head()

In [None]:
# Total number of words in each class.
# The text column is dropped first: on pandas >= 2.0 groupby().sum() no longer
# skips string columns, so the row-wise sum would mix strings with counts.
df.drop(columns=["CONTENT"]).groupby("CLASS").sum().sum(axis=1)

In [None]:
# Quantities needed for the Naive Bayes estimates.
# The class priors are computed from the labels instead of being copied by hand
# from an earlier printed output, so they stay correct if the data changes.
p_ham = df["CLASS"].eq(0).mean()
p_spam = df["CLASS"].eq(1).mean()
# Total word occurrences per class (sum over every word-count column).
n_spam = df[df["CLASS"] == 1].drop(columns=["CONTENT", "CLASS"]).sum().sum()
n_ham = df[df["CLASS"] == 0].drop(columns=["CONTENT", "CLASS"]).sum().sum()
n_vocabulary = len(vocabulary)

In [None]:
# One sub-frame per label: CLASS 1 rows are spam, CLASS 0 rows are ham.
df_sspam = df.loc[df["CLASS"].eq(1)]
df_hham = df.loc[df["CLASS"].eq(0)]

In [None]:
# Per-class word likelihoods with add-one (Laplace) smoothing:
#   P(word | class) = (count(word, class) + 1) / (n_class + n_vocabulary)
# The column sums are computed once per class in a single vectorized call
# instead of one pandas reduction per vocabulary word.
spam_word_counts = df_sspam[vocabulary].sum()
ham_word_counts = df_hham[vocabulary].sum()

parameters_spam = ((spam_word_counts + 1) / (n_spam + 1 * n_vocabulary)).to_dict()
parameters_ham = ((ham_word_counts + 1) / (n_ham + 1 * n_vocabulary)).to_dict()


#### 3. Testing

In [None]:
# NOTE(review): this definition rebinds the name `classifier`, which an earlier
# cell assigns to the fitted sklearn model; re-running that earlier evaluation
# cell after this one will fail unless the model cell is executed again.
def classifier(string):
    """Classify a raw comment with the hand-written Naive Bayes model.

    The comment is cleaned with the same steps, in the same order, that were
    applied to the training comments, and then scored against the per-class
    word likelihoods in ``parameters_spam`` / ``parameters_ham`` and the class
    priors ``p_spam`` / ``p_ham``.

    Args:
        string: the raw comment text.

    Returns:
        1 if the spam score is higher, 0 if the ham score is higher,
        -1 if both scores are equal.
    """
    # Each step must build on the previous result (the earlier version restarted
    # from the raw input on every line, so only the last step took effect), and
    # pattern-based steps need re.sub because str.replace matches literal text.
    message = html.unescape(string)
    message = message.replace("\ufeff", "")
    message = re.sub(r"(<a.+>)", "htmllink", message)
    message = re.sub(r"<.+>", "", message)
    message = message.replace("'", "")
    # Lowercase before link detection, as in training: the link pattern is lowercase.
    message = message.lower()
    message = re.sub(r"\S*\.com\S*|\S*watch\?\S*", "htmllink", message)
    message = re.sub(r"\W", " ", message)

    # Sum log-probabilities instead of multiplying probabilities: a long comment
    # would otherwise underflow both products to 0.0 and be reported as a tie.
    log_score_spam = math.log(p_spam)
    log_score_ham = math.log(p_ham)

    for word in message.split():
        # Words never seen in training carry no information and are skipped.
        if word in parameters_spam:
            log_score_spam += math.log(parameters_spam[word])
            log_score_ham += math.log(parameters_ham[word])

    if log_score_spam > log_score_ham:
        return 1
    if log_score_spam < log_score_ham:
        return 0
    return -1

In [None]:
# Reading the dataframe for testing model
# NOTE(review): this path differs from the "datasets/youtube-spam/" directory
# used for the training CSVs — confirm the file actually lives here.
df_shakira=pd.read_csv("../input/images/Youtube05-Shakira.csv")

In [None]:
# Preview the first rows of the held-out test comments.
df_shakira.head()

In [None]:
# Label every test comment with the hand-written classifier (1, 0 or -1).
df_shakira["Pred_Class"] = df_shakira["CONTENT"].map(classifier)

In [None]:
# Share of test comments whose predicted label equals the true label.
# A tie (-1) never equals a true label, so it counts as a wrong prediction.
label_matches = df_shakira["CLASS"] == df_shakira["Pred_Class"]
correct_predictions = int(label_matches.sum())
total_rows = len(df_shakira)
accuracy = correct_predictions / total_rows
print("accuracy=", accuracy)


#### Manual test

In [None]:
# The return value is 1 for spam, 0 for ham and -1 for a tie.
classifier("This song gives me goosebumps!!")


In [None]:
# A self-promotion style comment (1 = spam, 0 = ham, -1 = tie).
classifier("Please subscribe to my channel as I'm approaching 1M subscribers")

In [None]:
# A comment that contains a ".com" link (1 = spam, 0 = ham, -1 = tie).
classifier("If you want to be a mastercoder, consider buying my course for 50% off at www.buymycourse.com")