In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
import json
import nltk
import emoji
import enchant

from sklearn.utils import shuffle

from textblob import TextBlob

from nltk.util import ngrams
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from collections import Counter

In [2]:
def _read_csv_skipping_bad_lines(path):
    '''
    Read a comma-separated file, skipping malformed rows instead of failing.

    Rows with an unexpected number of fields are dropped and reported, which is
    what the legacy ``error_bad_lines=False`` flag did. That flag was deprecated
    in pandas 1.3 and removed in pandas 2.0, so the modern spelling is tried first.
    '''
    try:
        # pandas >= 1.3: 'warn' skips each bad line and emits a warning for it.
        return pd.read_csv(path, delimiter=',', on_bad_lines='warn')
    except TypeError:
        # Older pandas rejects the unknown `on_bad_lines` keyword; use the legacy flag.
        return pd.read_csv(path, delimiter=',', error_bad_lines=False)


UScomments = _read_csv_skipping_bad_lines('UScomments.csv')
GBcomments = _read_csv_skipping_bad_lines('GBcomments.csv')

USvideos = _read_csv_skipping_bad_lines('USvideos.csv')
GBvideos = _read_csv_skipping_bad_lines('GBvideos.csv')

# Category id -> title lookups shipped alongside the video files.
with open('US_category_id.json') as file:
    US_cats = json.load(file)

with open('GB_category_id.json') as file:
    GB_cats = json.load(file)

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'
  interactivity=interactivity, compiler=compiler, result=result)
b'Skipping line 113225: expected 4 fields, saw 5\n'
b'Skipping line 158379: expected 4 fields, saw 7\nSkipping line 241590: expected 4 fields, saw 5\nSkipping line 245637: expected 4 fields, saw 7\n'
b'Skipping line 521402: expected 4 fields, saw 5\n'
b'Skipping line 2401: expected 11 fields, saw 21\nSkipping line 2800: expected 11 fields, saw 21\nSkipping line 5297: expected 11 fields, saw 12\nSkipping line 5299: expected 11 fields, saw 12\nSkipping line 5300: expected 11 fields, saw 12\nSkipping line 5301: expected 11 fields, saw 12\n'
b'Skipping line 2398: expected 11 fields, sa

In [3]:
# Inspect the raw structure of the US category lookup loaded from JSON
US_cats

{'kind': 'youtube#videoCategoryListResponse',
 'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJvJAAShlR6hM"',
 'items': [{'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
   'id': '1',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Film & Animation',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/UZ1oLIIz2dxIhO45ZTFR3a3NyTA"',
   'id': '2',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Autos & Vehicles',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/nqRIq97-xe5XRZTxbknKFVe5Lmg"',
   'id': '10',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
    'title': 'Music',
    'assignable': True}},
  {'kind': 'youtube#videoCategory',
   'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/HwXKamM1Q20q9BN-oBJavSGkfDI"',
   'id': '15',
   'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdnt

In [5]:
# Preview the first rows of the US comments table
UScomments.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [6]:
# (rows, columns) of the US and GB comment tables, side by side
UScomments.shape, GBcomments.shape

((691400, 4), (718452, 4))

## Merge and Concatenate all files appropriately

In [7]:
# Attach the video metadata to every US comment, then keep a single row per
# (video, comment text) pair.
US_df = (
    UScomments
    .merge(USvideos, on='video_id')
    .drop_duplicates(subset=['video_id', 'comment_text'])
)
US_df.shape

(450534, 14)

In [8]:
# Attach the video metadata to every GB comment, then keep a single row per
# (video, comment text) pair.
GB_df = (
    GBcomments
    .merge(GBvideos, on='video_id')
    .drop_duplicates(subset=['video_id', 'comment_text'])
)
GB_df.shape

(384293, 14)

In [9]:
# Stack the two regional frames and drop comments that appear in both
# (same video id and same comment text).
frames = [US_df, GB_df]
df = pd.concat(frames).drop_duplicates(subset=['video_id', 'comment_text'])
df.shape

(646863, 14)

In [10]:
# Preview the combined comments + video metadata frame
df.head()

Unnamed: 0,video_id,comment_text,likes_x,replies,title,channel_title,category_id,tags,views,likes_y,dislikes,comment_total,thumbnail_link,date
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09
7,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09
14,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09
21,XpVt6Z1Gjjo,MY FAN . attendance,3,0,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09
28,XpVt6Z1Gjjo,trending 😉,3,0,1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...,Logan Paul Vlogs,24,logan paul vlog|logan paul|logan|paul|olympics...,4394029,320053,5931,46245,https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg,13.09


In [11]:
# Discard rows whose comment text is missing (only the comment_text column is checked)
df = df.dropna(subset=['comment_text'])
df.shape

(646835, 14)

In [12]:
# List every video category id next to its title
for category in US_cats['items']:
    category_id, category_title = category['id'], category['snippet']['title']
    print(category_id, category_title)

1 Film & Animation
2 Autos & Vehicles
10 Music
15 Pets & Animals
17 Sports
18 Short Movies
19 Travel & Events
20 Gaming
21 Videoblogging
22 People & Blogs
23 Comedy
24 Entertainment
25 News & Politics
26 Howto & Style
27 Education
28 Science & Technology
29 Nonprofits & Activism
30 Movies
31 Anime/Animation
32 Action/Adventure
33 Classics
34 Comedy
35 Documentary
36 Drama
37 Family
38 Foreign
39 Horror
40 Sci-Fi/Fantasy
41 Thriller
42 Shorts
43 Shows
44 Trailers


In [13]:
# Draw a random 80% sample of the rows as the training set (fixed random_state for reproducibility).
# NOTE(review): only the training part is created here; no complementary test set is built in this cell.
train_df = df.sample(frac=0.8, random_state=5)

## Text Preprocessing

In [14]:
# Work on the comment text column of the training rows
x_train = train_df['comment_text']

In [15]:
# Lowercase every comment token by token; splitting and re-joining with a single
# space also collapses any run of whitespace.
x_train = train_df['comment_text'].apply(
    lambda comment: ' '.join(map(str.lower, comment.split()))
)

In [16]:
# Remove punctuation and emojis: delete every character that is neither a word
# character nor whitespace.
# `regex=True` is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
# The raw string avoids the invalid-escape warning for `\w` and `\s`.
x_train = x_train.str.replace(r'[^\w\s]', '', regex=True)

In [17]:
# Strip every non-ASCII character from each comment: encoding to ASCII with
# errors="ignore" silently drops anything that cannot be represented.
new_x_train = [
    sent.encode("ascii", errors="ignore").decode()
    for sent in x_train
]

# Rebuilt from a plain list, so the resulting Series gets a fresh default index.
x_train = pd.Series(new_x_train)

In [18]:
# Stopwords are deliberately NOT removed here: doing so would lose information about
# the context and structure of the comments.
## They will be removed later, when filtering for the final common positive, negative and neutral comments.
# The block below is disabled code kept as a string literal; it is never executed.
'''
words = set(nltk.corpus.words.words())
stop = stopwords.words('english') ###can append more to this list

x_train = x_train.apply(lambda x: ' '.join(w for w in x.split() if w not in stop))
'''

"\nwords = set(nltk.corpus.words.words())\nstop = stopwords.words('english') ###can append more to this list\n\nx_train = x_train.apply(lambda x: ' '.join(w for w in x.split() if w not in stop))\n"

In [19]:
## Will not spell-correct words, since e.g. 'ur' would be wrongly corrected to 'or' instead of 'your' or "you're"

#TextBlob('awsm').correct()

#d = enchant.Dict('en_US')
#d.suggest('awsm')

In [20]:
# The 100 most frequent tokens across all comments. They are kept on purpose,
# since they provide more information on context later on.
all_tokens = ' '.join(x_train).split()

pd.Series(all_tokens).value_counts().head(100)

the        271593
i          193204
a          161802
to         160012
and        151635
you        130916
is         127004
this       105883
of         101084
it          90323
in          82026
that        80103
for         65163
so          64348
my          53526
on          52804
like        50871
was         46488
love        46020
are         44833
your        43744
with        43341
but         42426
be          39881
have        39837
me          37588
just        37187
not         36331
im          32309
its         32294
            ...  
now         14543
time        14127
think       13951
there       13829
because     13752
them        12837
look        12727
by          12664
great       12496
did         12465
cant        12446
please      12294
should      12270
only        12258
go          11587
some        11540
lol         11514
want        11438
even        11384
been        11136
these       11117
their       10878
too         10815
looks       10813
got       

In [21]:
# The disabled snippet below would remove non-English words. It is intentionally not run,
# because it would also remove a lot of social media slang.
# It is kept as a string literal and is never executed.
'''
x_train.apply(lambda x: ' '.join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words or not w.isalpha()))
x_train
'''

"\nx_train.apply(lambda x: ' '.join(w for w in nltk.wordpunct_tokenize(x) if w.lower() in words or not w.isalpha()))\nx_train\n"

In [22]:
# Remove rare words (count < 50). This also removes most non-English words and reduces noise.
# The cut-off is expressed as a frequency threshold rather than a hard-coded tail
# length, so it stays correct when the vocabulary size changes and never cuts
# through a group of words that share the same count.
RARE_WORD_MAX_COUNT = 50

word_counts = pd.Series(' '.join(x_train).split()).value_counts()
rare = word_counts[word_counts < RARE_WORD_MAX_COUNT]

# A plain set makes the per-token membership test explicit (the words are the index of `rare`).
rare_words = set(rare.index)

x_train = x_train.apply(lambda x: ' '.join(w for w in x.split() if w not in rare_words))

## Categorizing comments by sentiment

In [23]:
def sort_comment_sentiment(text_list, pos_comments, neu_comments, neg_comments, threshold=0.1):
    '''
    Sort comments into positive, neutral and negative buckets by sentiment.

    Each text is scored with ``SentimentIntensityAnalyzer.polarity_scores`` and
    the ``'compound'`` entry of the result decides the bucket.
    The compound score is the sum of valence scores of each word in each comment,
    normalized to be between -1 and 1. More documentation can be found in
    vaderSentiment.py of the module.

    Parameters
    ----------
    text_list : iterable of str
        Comments to classify.
    pos_comments, neu_comments, neg_comments : list
        Output buckets. Matching texts are appended in place, in input order.
    threshold : float, optional
        Non-negative cut-off (default 0.1). A score above ``threshold`` is
        positive; a score above ``-threshold`` and at most ``threshold`` is
        neutral; anything else (including exactly ``-threshold``) is negative.

    Returns
    -------
    None
        Results are delivered through the three lists passed in.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    '''
    if threshold < 0:
        raise ValueError('threshold must be non-negative')

    analyzer = SentimentIntensityAnalyzer()

    for text in text_list:
        score = analyzer.polarity_scores(text)['compound']

        if score > threshold:
            pos_comments.append(text)
        elif score > -threshold:
            # Reaching this branch already implies score <= threshold.
            neu_comments.append(text)
        else:
            neg_comments.append(text)

In [24]:
# Empty buckets that the sorter fills in place
positive_comments, neutral_comments, negative_comments = [], [], []

sort_comment_sentiment(
    x_train,
    positive_comments,
    neutral_comments,
    negative_comments,
)