In [350]:
import numpy as np
import math
import seaborn as sns
from sklearn import svm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import *
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import *

# Loading the dataset 

In [229]:
dataset = pd.read_csv('scraped_yt.csv')

# Getting all the features separately 

In [None]:
df_link = dataset['links']
df_title = dataset['title']
df_description = dataset['description']
df_category = dataset['category']

# Importing liberaries for data cleaning 

In [232]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shubhamsingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Cleaning the data and storing it into a list 

In [233]:
# Cleaning the text data; in my experiemnted, i worked on only 1000 observations (reviews)
corpus = []
for i in range(0, 8375):
    review = re.sub('[^a-zA-Z]', ' ', df_title['title'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [234]:
corpus1 = []
for i in range(0, 8375):
    review = re.sub('[^a-zA-Z]', ' ', df_description['description'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus1.append(review)

# Creating dataframes from the lists 

In [237]:
dftitle = pd.DataFrame({'title':corpus})
dfdescription = pd.DataFrame({'description':corpus1})

# Performing label encoding on the category feature

In [355]:
from sklearn.preprocessing import LabelEncoder 

In [247]:
dfcategory1 = dfcategory.apply(LabelEncoder().fit_transform)

# Creating a new dataset after cleaning the data and label encoding the categories

In [294]:
df_new = pd.concat([dflink, dftitle, dfdescription, dfcategory1], axis=1, join_axes=[dflink.index])

In [296]:
df_new.to_csv("scraped_yt.csv", encoding='utf-8', index=False)

In [295]:
df_new

Unnamed: 0,links,title,description,category
0,6bBQ3pd0YU8,american tap danc orchestra strike train chore...,atdo perform strike train joyc theater nyc cho...,0
1,JLU0c0mmvxg,robonaut space station nasa space scienc hd video,visit websit http www junglejoel com robonaut ...,4
2,IojqhtUwz50,european spacecraft pass key reentri test esa ...,visit websit http www junglejoel com european ...,4
3,-zgGVyADnFE,jordan bouri frontrow world danc franc qualifi,first perform world danc,0
4,ZZXWS0n0MCA,scienc univers space satellit hindi,hello bodhaguru learn proudli present anim vid...,4
5,Hz029D4wn1I,hot young star creat bright red nebula eso spa...,space news info http www coconutsciencelab com...,4
6,0jnuiRot6d0,aaja ko bigyaan episod school astronomi,space scienc technolog,4
7,FgBhMVgLtg,danc african danc zehil rugaro nekutamba happi,etienn cakpo guest perform profession dancer c...,0
8,0o90mJe21H,tip travel india,india massiv countri overwhelm plan trip spend...,5
9,PB3E_C1608k,korean food buffet eat,travel korea choic food overwhelm want tri eve...,1


# Creating the bag of words model

In [250]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus, corpus1).toarray()
y = df_new.iloc[:, 3].values

# Splitting the dataset into the Training set and Test set 

In [251]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Random Forest 

In [462]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy')
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [463]:
y_pred = classifier.predict(X_test)

In [464]:
classifier.score(X_test, y_test)

0.9605970149253731

In [485]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

  Art & Dance       0.95      0.97      0.96       313
         Food       0.96      0.99      0.98       272
      History       0.96      0.98      0.97       287
Manufacturing       0.95      0.94      0.94       241
      Science       0.97      0.94      0.96       289
       Travel       0.98      0.93      0.96       273

    micro avg       0.96      0.96      0.96      1675
    macro avg       0.96      0.96      0.96      1675
 weighted avg       0.96      0.96      0.96      1675



In [465]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [466]:
cm

array([[305,   0,   2,   3,   0,   3],
       [  0, 269,   0,   1,   0,   2],
       [  2,   1, 281,   1,   2,   0],
       [  5,   1,   3, 226,   5,   1],
       [  6,   0,   6,   4, 273,   0],
       [  3,   8,   2,   3,   2, 255]])

# SVM 

In [467]:
from sklearn.svm import SVC
classifier1 = SVC(kernel = 'linear', random_state = 0)
classifier1.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [468]:
y_pred1 = classifier1.predict(X_test)

In [469]:
classifier1.score(X_test, y_test)

0.9564179104477611

In [470]:
# Making the Confusion Matrix
cm1 = confusion_matrix(y_test, y_pred1)

In [471]:
cm1

array([[301,   0,   4,   6,   0,   2],
       [  0, 266,   0,   1,   0,   5],
       [  2,   1, 278,   3,   2,   1],
       [  0,   1,   4, 229,   4,   3],
       [  1,   0,   9,   9, 270,   0],
       [  2,   4,   1,   6,   2, 258]])

# Naive Bayes 

In [472]:
from sklearn.naive_bayes import GaussianNB
classifier2 = GaussianNB()
classifier2.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [473]:
y_pred2 = classifier2.predict(X_test)

In [474]:
classifier2.score(X_test, y_test)

0.8107462686567164

In [475]:
# Making the Confusion Matrix
cm2 = confusion_matrix(y_test, y_pred2)

In [476]:
cm2

array([[289,   4,  10,   2,   0,   8],
       [  0, 253,   1,   0,   0,  18],
       [ 40,   4, 194,  16,  10,  23],
       [  1,   7,   4, 219,   4,   6],
       [  4,   6,  59,  15, 199,   6],
       [  3,  55,   4,   0,   7, 204]])

# XGboost 

In [477]:
from xgboost import XGBClassifier
classifier3 = XGBClassifier()
classifier3.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [293]:
y_pred3 = classifier3.predict(X_test)

In [271]:
classifier3.score(X_test, y_test)

0.937910447761194

In [458]:
# Making the Confusion Matrix
cm3 = confusion_matrix(y_test, y_pred3)

In [459]:
cm3

array([[287,   0,   3,  20,   0,   3],
       [  0, 264,   1,   4,   0,   3],
       [  0,   1, 275,   9,   2,   0],
       [  1,   1,   2, 235,   2,   0],
       [  0,   0,   3,  28, 258,   0],
       [  0,   7,   0,  14,   0, 252]])