# Data Science Wednesday 03/13/2019

In [1]:
#import necessary libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split

## About Data
As an example for text classification we work with 1956 comments from 5 different YouTube videos. The comments were collected via the YouTube API from five of the ten most viewed videos on YouTube in the first half of 2015. The comments were manually labeled as spam or legitimate. Spam was coded with a “1” and legitimate comments with a “0”. Data files: 'Youtube01-Psy.csv','Youtube02-KatyPerry.csv','Youtube03-LMFAO.csv','Youtube04-Eminem.csv','Youtube05-Shakira.csv'

In [2]:
#Read in the data
train_data = []
data_files = ['Youtube01-Psy.csv','Youtube02-KatyPerry.csv','Youtube03-LMFAO.csv','Youtube04-Eminem.csv','Youtube05-Shakira.csv']
for file in data_files:
    data = pd.read_csv(file)
    train_data.append(data)
train_data = pd.concat(train_data)
train_data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [3]:
#check the count of each class
train_data['CLASS'].value_counts()

1    1005
0     951
Name: CLASS, dtype: int64

## Cleaning the data
1. Drop insignificant columns
2. Process the contents of data
3. Extract features from the data


In [4]:
#drop columns
def drop_features(features,data):
    data.drop(features,axis=1,inplace=True)

In [5]:
drop_features(['COMMENT_ID','AUTHOR','DATE'],train_data)
train_data.info()
train_data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956 entries, 0 to 369
Data columns (total 2 columns):
CONTENT    1956 non-null object
CLASS      1956 non-null int64
dtypes: int64(1), object(1)
memory usage: 45.8+ KB


Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [6]:
#process content
def process_content(content):
    return " ".join(re.findall("[A-Za-z]+",content.lower()))

In [7]:
train_data['processed_content'] = train_data['CONTENT'].apply(process_content)
train_data.head()

Unnamed: 0,CONTENT,CLASS,processed_content
0,"Huh, anyway check out this you[tube] channel: ...",1,huh anyway check out this you tube channel kob...
1,Hey guys check out my new channel and our firs...,1,hey guys check out my new channel and our firs...
2,just for test I have to say murdev.com,1,just for test i have to say murdev com
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch v vtarggvgtwq check this out


In [8]:
#drop 'CONTENT', lets use 'PROCESSED CONTENT'
drop_features(['CONTENT'],train_data)

In [9]:
#Train test split
x_train, x_test, y_train, y_test = train_test_split(train_data['processed_content'],train_data['CLASS'],test_size=0.2,random_state=57)

In [10]:
#Feature extraction
#Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')
x_train_counts = count_vect.fit_transform(x_train)

In [22]:
print(count_vect.vocabulary_)
print(x_train_counts)

  (0, 918)	2
  (0, 525)	1
  (0, 1202)	1
  (0, 2229)	1
  (0, 1376)	1
  (0, 1372)	1
  (0, 1285)	1
  (0, 2105)	1
  (0, 498)	1
  (0, 2026)	1
  (0, 2416)	1
  (0, 467)	1
  (0, 155)	1
  (0, 446)	1
  (0, 451)	1
  (0, 3108)	1
  (0, 2618)	1
  (0, 463)	1
  (0, 1726)	1
  (0, 3107)	1
  (0, 579)	1
  (1, 447)	1
  (1, 2802)	1
  (2, 312)	1
  (2, 1190)	1
  :	:
  (1559, 2802)	1
  (1560, 878)	1
  (1560, 3234)	1
  (1560, 1197)	1
  (1560, 2480)	1
  (1560, 1303)	1
  (1560, 2695)	1
  (1560, 1756)	1
  (1561, 1293)	1
  (1561, 2218)	1
  (1561, 3320)	1
  (1561, 463)	1
  (1562, 353)	1
  (1562, 2925)	1
  (1562, 766)	1
  (1562, 2722)	1
  (1562, 3039)	1
  (1562, 351)	4
  (1562, 3186)	1
  (1562, 2168)	1
  (1562, 2802)	1
  (1563, 2187)	1
  (1563, 534)	1
  (1563, 3180)	1
  (1563, 1689)	1


In [11]:
#Term frequency - inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer
tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)

In [12]:
print(x_train_tfidf)

  (0, 579)	0.1817627430483752
  (0, 3107)	0.09954971942502731
  (0, 1726)	0.264551204493268
  (0, 463)	0.0836688831218715
  (0, 2618)	0.22662109881473988
  (0, 3108)	0.15012737481999192
  (0, 451)	0.11642107551984253
  (0, 446)	0.1722249653649856
  (0, 155)	0.21668874003588182
  (0, 467)	0.25055233104343294
  (0, 2416)	0.22129897413383598
  (0, 2026)	0.22129897413383598
  (0, 498)	0.2089846040361916
  (0, 2105)	0.22129897413383598
  (0, 1285)	0.22129897413383598
  (0, 1372)	0.1722249653649856
  (0, 1376)	0.14331344758515127
  (0, 2229)	0.19498573058635657
  (0, 1202)	0.1817627430483752
  (0, 525)	0.11157517176922856
  (0, 918)	0.529102408986536
  (1, 2802)	0.4169027916259167
  (1, 447)	0.9089510780754471
  (2, 463)	0.06352827648147828
  (2, 2802)	0.0837971152121541
  :	:
  (1559, 2794)	0.4133403828985252
  (1560, 1756)	0.22724886827505128
  (1560, 2695)	0.20333946164509845
  (1560, 1303)	0.3569460101522803
  (1560, 2480)	0.3705517204574569
  (1560, 1197)	0.4039422081563246
  (1560, 323

In [13]:
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = tranformer.transform(x_test_counts)

## ML Modeling
We have completed cleaning of our data and we have training samples to train the model and testing samples to verify the accuracy of our models. 

## Logistic Regression
Logistic regression is a simple statistical model which predicts a binary response (ex. 0/1). For this data, we are predicting whether a comment is a spam or not.

In [14]:
#Create and train Logistic Regression model
from sklearn.linear_model import LogisticRegression
model_reg = LogisticRegression()
model_reg.fit(x_train_tfidf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
#Run it on test data and calculate the accuracy
accuracy = model_reg.score(x_test_tfidf, y_test)
print(accuracy)

0.9260204081632653


## Random Forest Classifier
Random Forest is a supervised learning algorithm. Like you can already see from it’s name, it creates a forest and makes it somehow random. The forest it builds, is an ensemble of Decision Trees. To say it in simple words: Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction.

In [16]:
#Create and train Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier()
model_rfc.fit(x_train_tfidf,y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
#Run it on test data and calculate the accuracy
accuracy = model_rfc.score(x_test_tfidf, y_test)
print(accuracy)

0.923469387755102


## Support Vector Machine
The objective of the support vector machine algorithm is to find a hyperplane in an N-dimensional space(N — the number of features) that distinctly classifies the data points.

In [19]:
#Create and train Support Vector Machine model
from sklearn import svm
model_svm = svm.SVC(C=100, gamma = 1)
model_svm.fit(x_train_tfidf, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
#Run it on test data and calculate the accuracy
accuracy = model_svm.score(x_test_tfidf, y_test)
print(accuracy)

0.9336734693877551
