In [1]:
from textblob import TextBlob
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#plotly imports
from pprint import pprint
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import cufflinks as cf

In [2]:
# Load the cleaned per-video data and keep only rows with no missing values.
df = pd.read_csv('cleanedData.csv').dropna()

In [3]:
# Show the first row of the loaded frame as a quick look at the available columns.
df.head(1)

Unnamed: 0,vidId,videoText,date,title,views,likes,dislikes,commentCount,vidLength,description,channel,country,totChanViews,totSubscribers,totChanVideos,music,textCleaned
0,7PIMiDcwNvc,[Music] I am most of all happy and grateful t...,2019-08-24,Marzia & Felix - Wedding 19.08.2019,39403934.0,5509252.0,33359.0,526815.0,00:06:22,"Our footage from the wedding, best day of my l...",PewDiePie,US,27984030000.0,110000000.0,4443.0,1,happy grateful finally mighty hospital met tim...


### Sentiment Per Video

In [4]:
polarity = []  # TextBlob polarity, in [-1, 1]: very negative to very positive
subjectivity = []  # TextBlob subjectivity, in [0, 1]: very objective to very subjective

# Score every cleaned transcript. Each row must contribute exactly one value to
# BOTH lists; otherwise they can no longer be assigned back to `df` as columns.
for transcript in tqdm(df['textCleaned']):
    try:
        sentiment = TextBlob(transcript).sentiment
    except Exception as err:
        # Best effort: report the offending text, but record NaN so the lists
        # stay aligned with the rows of `df` (a bare `except:` would also have
        # swallowed KeyboardInterrupt and silently dropped the row).
        print(f'Could not score {transcript!r}: {err}')
        polarity.append(float('nan'))
        subjectivity.append(float('nan'))
    else:
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)

100%|██████████████████████████████████████████████████████████████████████████████| 1132/1132 [00:04<00:00, 227.72it/s]


In [5]:
# Attach the per-video TextBlob scores as two new columns.
df = df.assign(text_polarity=polarity, text_subjectivity=subjectivity)

In [6]:
# Spot-check a random handful of videos. The fixed seed keeps the displayed
# sample identical across "Restart Kernel -> Run All".
df[['title', 'text_polarity', 'text_subjectivity']].sample(10, random_state=42)

Unnamed: 0,title,text_polarity,text_subjectivity
243,Diana Pretend Play with Princess Carriage Infl...,0.2,0.5
1062,CELEBRATING MY BROTHER'S 18TH BIRTHDAY | Nicol...,0.247839,0.535461
699,How to Make Pixel Art [Tutorial for Beginners]...,0.108997,0.499644
1075,People Who Got What They DESERVE,-0.013319,0.608285
29,Rich Parents Adopted Me Then Changed Their Mind,0.118964,0.598867
200,"MY FIRST TRIMESTER WITH TWINS! Symptoms, Gende...",0.120001,0.584137
174,"Shaving my head BALD for $10,000! Live Stream ...",0.040561,0.562346
1078,10 Pranks With Edible School Supplies! Back To...,0.213357,0.560662
769,Witch Protection Program: The Dark Secret (A R...,0.004649,0.64067
572,Keeley & Scott’s Official WEDDING Teaser Video!,0.537121,0.742182


In [7]:
# Summary statistics for the two sentiment scores. `describe()` only reports
# numeric columns for a mixed frame, so the text `title` column is not selected.
score_columns = ['text_polarity', 'text_subjectivity']
df[score_columns].describe()

Unnamed: 0,text_polarity,text_subjectivity
count,1132.0,1132.0
mean,0.125748,0.551567
std,0.118046,0.110018
min,-0.8,0.0
25%,0.062038,0.516929
50%,0.122979,0.561859
75%,0.185774,0.601393
max,0.8,1.0


#### Create a 'count' column

In [8]:
# Number of whitespace-separated tokens in each cleaned transcript.
df['count'] = [len(transcript.split()) for transcript in df['textCleaned']]

In [9]:
# Channels ranked by the mean word count of their videos, smallest first.
words_per_channel = pd.pivot_table(df, index='channel', values='count', aggfunc='mean')
words_per_channel.sort_values('count').head(20)

Unnamed: 0_level_0,count
channel,Unnamed: 1_level_1
Megamanny,1.0
Beluga,1.0
Sophia Slime Mixing,1.0
THEBLACKLABEL,2.0
Lofi Girl,3.0
iRAP7,3.0
스브스케이팝 / SBS KPOP,4.0
Little Soul,5.0
TheThings Celebrity,7.0
Digital Dreams,8.0


In [10]:
# Keep only videos whose cleaned transcript has more than 20 words; shorter
# transcripts carry too little text to score.
has_enough_words = df['count'].gt(20)
df = df.loc[has_enough_words]

In [11]:
# (rows, columns) of the per-video frame after the word-count filter.
df.shape

(1094, 20)

In [12]:
# Write the filtered per-video frame, including the sentiment and word-count
# columns, to CSV without the index.
df.to_csv('dataSentiment.csv', index=False)

### Sentiment per channel

In [13]:
# Load the per-channel data and keep only rows with no missing values.
df_user = pd.read_csv('channelData.csv').dropna()

In [14]:
# Number of whitespace-separated tokens in each channel's combined document.
df_user['count'] = [len(document.split()) for document in df_user['documents']]

In [15]:
# Preview the first rows of the channel frame, now including the `count` column.
df_user.head()

Unnamed: 0,channel,documents,count
0,1theK (원더케이),wrong good expressing feeling warmhearted woma...,87
1,ABC News,ultimate key unlocking disruption contending a...,380
2,AWESOME WORLD,day army trade mres like cheesecloth tried kor...,322
3,AaronsAnimals,faster gaining catch sothis thing faster wooho...,235
4,Adam Norris,going push beasley video number 10 today slave...,394


In [16]:
channel_polarity = []  # TextBlob polarity, in [-1, 1]: very negative to very positive
channel_subjectivity = []  # TextBlob subjectivity, in [0, 1]: very objective to very subjective

# Score every channel document. Each row must contribute exactly one value to
# BOTH lists; otherwise they can no longer be assigned back to `df_user`.
for document in tqdm(df_user['documents']):
    try:
        sentiment = TextBlob(document).sentiment
    except Exception as err:
        # Best effort: report the offending text, but record NaN so the lists
        # stay aligned with the rows of `df_user` (a bare `except:` would also
        # have swallowed KeyboardInterrupt and silently dropped the row).
        print(f'Could not score {document!r}: {err}')
        channel_polarity.append(float('nan'))
        channel_subjectivity.append(float('nan'))
    else:
        channel_polarity.append(sentiment.polarity)
        channel_subjectivity.append(sentiment.subjectivity)

100%|█████████████████████████████████████████████████████████████████████████████████| 441/441 [00:04<00:00, 93.41it/s]


In [17]:
# Attach the per-channel TextBlob scores as two new columns.
df_user = df_user.assign(
    channel_polarity=channel_polarity,
    channel_subjectivity=channel_subjectivity,
)

In [18]:
# Preview the first rows of the channel frame with the new score columns.
df_user.head()

Unnamed: 0,channel,documents,count,channel_polarity,channel_subjectivity
0,1theK (원더케이),wrong good expressing feeling warmhearted woma...,87,0.240741,0.62037
1,ABC News,ultimate key unlocking disruption contending a...,380,0.077059,0.455999
2,AWESOME WORLD,day army trade mres like cheesecloth tried kor...,322,0.301846,0.602803
3,AaronsAnimals,faster gaining catch sothis thing faster wooho...,235,0.143966,0.482137
4,Adam Norris,going push beasley video number 10 today slave...,394,0.122564,0.330137


In [19]:
# Ten channels with the highest mean polarity.
channel_scores = pd.pivot_table(
    df_user,
    index=['channel'],
    values=['channel_polarity', 'channel_subjectivity'],
)
channel_scores.sort_values('channel_polarity', ascending=False).head(10)

Unnamed: 0_level_0,channel_polarity,channel_subjectivity
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
TheThings Celebrity,0.8,0.4
RoseAngel,0.75,0.75
YouTubers Sing,0.666667,0.666667
JeremyBeans,0.6,1.0
Keeley,0.537121,0.742182
Walt Disney Animation Studios,0.502381,0.867143
Pixar,0.5,0.5
Cristal Valdes,0.5,1.0
Disney ¡Fan!,0.425,0.75
KEJ Productions,0.401761,0.614383


In [20]:
# Ten channels with the lowest mean polarity.
channel_scores = pd.pivot_table(
    df_user,
    index=['channel'],
    values=['channel_polarity', 'channel_subjectivity'],
)
channel_scores.sort_values('channel_polarity').head(10)

Unnamed: 0_level_0,channel_polarity,channel_subjectivity
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
iRAP7,-0.8,0.9
KQ ENTERTAINMENT,-0.61875,0.770833
Bad History,-0.38287,0.691204
OTV & Friends Clips,-0.273214,0.698214
JoBlo Horror Trailers,-0.217641,0.636634
Prince Ea,-0.127851,0.671357
Lofi Girl,-0.125,0.125
Sophia Slime Mixing,-0.125,0.125
THEBLACKLABEL,-0.125,0.125
Beluga,-0.125,0.125


In [21]:
# Ten channels with the highest mean subjectivity.
channel_scores = pd.pivot_table(
    df_user,
    index=['channel'],
    values=['channel_polarity', 'channel_subjectivity'],
)
channel_scores.sort_values('channel_subjectivity', ascending=False).head(10)

Unnamed: 0_level_0,channel_polarity,channel_subjectivity
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristal Valdes,0.5,1.0
JeremyBeans,0.6,1.0
iRAP7,-0.8,0.9
Walt Disney Animation Studios,0.502381,0.867143
The Squad YouTube,0.251186,0.800737
KQ ENTERTAINMENT,-0.61875,0.770833
Matthew Weathers,-0.067045,0.758081
Rudy Mancuso,0.204687,0.750868
Disney ¡Fan!,0.425,0.75
RoseAngel,0.75,0.75


In [22]:
# Twenty channels with the lowest mean subjectivity.
channel_scores = pd.pivot_table(
    df_user,
    index=['channel'],
    values=['channel_polarity', 'channel_subjectivity'],
)
channel_scores.sort_values('channel_subjectivity').head(20)

Unnamed: 0_level_0,channel_polarity,channel_subjectivity
channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Megamanny,0.0,0.0
Digital Dreams,0.0,0.0
Little Soul,0.0,0.0
Toniaposts,0.0,0.0
big bRAIN,0.0,0.0
VALORANT,0.0,0.0
Minecraft Speedrun Highlights,0.0,0.1
THEBLACKLABEL,-0.125,0.125
Beluga,-0.125,0.125
스브스케이팝 / SBS KPOP,-0.125,0.125


Check to see what kind of channels are giving poor polarity and subjectivity scores, or identical polarity and subjectivity scores to one another.

In [23]:
# Channels ranked by document word count, smallest first.
channel_word_counts = pd.pivot_table(df_user, index='channel', values='count', aggfunc='mean')
channel_word_counts.sort_values('count').head(20)

Unnamed: 0_level_0,count
channel,Unnamed: 1_level_1
Sophia Slime Mixing,1
Megamanny,1
Beluga,1
THEBLACKLABEL,2
iRAP7,3
Lofi Girl,3
스브스케이팝 / SBS KPOP,4
Toniaposts,5
Little Soul,5
TheThings Celebrity,7


Looks like we can remove channels that have a low word count.

In [24]:
# Channels ranked by document word count, largest first.
channel_word_counts = pd.pivot_table(df_user, index='channel', values='count', aggfunc='mean')
channel_word_counts.sort_values('count', ascending=False).head(20)

Unnamed: 0_level_0,count
channel,Unnamed: 1_level_1
SSSniperWolf,123719
MrBeast,34544
Colleen Ballinger,15749
Mark Rober,15555
Joey Graceffa,12213
Brat TV,11767
MrBeast Gaming,11416
BadBoyHalo,10928
JacyandKacy,10818
ZHC Crafts,10301


### Remove channels with little data

In [25]:
# Keep only channels whose combined document has more than 20 words.
has_enough_words = df_user['count'].gt(20)
df_user = df_user.loc[has_enough_words]

In [26]:
# (rows, columns) of the channel frame after the word-count filter.
df_user.shape

(424, 5)

In [27]:
# Write the filtered per-channel frame, including the sentiment and word-count
# columns, to CSV without the index.
df_user.to_csv('dataChannelSentiment.csv', index=False)