In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import contractions
import warnings

from my_functions import *
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
# silence every Python warning for the rest of the session
warnings.simplefilter("ignore")
# turn off pandas' chained-assignment (SettingWithCopy) check
pd.set_option("mode.chained_assignment", None)

In [3]:
# load the transcript text and the per-video metadata into separate dataframes
text_df, meta_df = (
    pd.read_csv(csv_path)
    for csv_path in ('youtube_text_data.csv', 'youtube_metadata.csv')
)

In [4]:
# keep a single row per video id in each frame, then report both sizes
text_df, meta_df = (
    frame.drop_duplicates(subset=['vidId'])
    for frame in (text_df, meta_df)
)
print(text_df.shape, meta_df.shape)

(1238, 2) (1520, 14)


In [5]:
# left-join the metadata onto the transcripts so every transcript row is kept
df = text_df.merge(meta_df, on="vidId", how='left')

In [6]:
# sanity-check the merged frame: print its (rows, columns), then show the first rows
print(df.shape)
df.head()

(1238, 15)


Unnamed: 0,vidId,videoText,date,title,views,likes,dislikes,commentCount,vidLength,description,channel,country,totChanViews,totSubscribers,totChanVideos
0,7PIMiDcwNvc,[Music] I am most of all happy and grateful t...,2019-08-24T15:39:57Z,Marzia & Felix - Wedding 19.08.2019,39403934.0,5509252.0,33359.0,526815.0,PT6M22S,"Our footage from the wedding, best day of my l...",PewDiePie,US,27984030000.0,110000000.0,4443.0
1,s2ZTZrghxvg,"(upbeat mellow music) - Lilly, you're almost ...",2021-11-04T16:00:12Z,Indian Parents Give Me “The Talk.” It’s Not Wh...,125462.0,14308.0,109.0,639.0,PT3M40S,You’ve heard of the Kama Sutra. But allow me t...,Lilly Singh,-1,3489530000.0,14700000.0,836.0
2,ce4V_PYLhEo,(hoof beats clip-clopping) (spooky mellow mus...,2018-10-22T21:57:57Z,Halloween As An Adult vs A Kid,8157273.0,163837.0,3156.0,5633.0,PT2M31S,Remember when Halloween was an excuse to go bu...,Lilly Singh,-1,3489530000.0,14700000.0,836.0
3,f6LCVGhRqx4,what's up guys welcome back to my youtube cha...,2021-10-24T16:00:31Z,WE WENT HERE FOR OUR 1 YEAR ANNIVERSARY...,623236.0,37788.0,469.0,1139.0,PT14M31S,Dixie and I traveled to Paris for fashion week...,Noah Beck,US,74682300.0,1590000.0,49.0
4,dE-SHprUhYw,[Music] bunny checks from parents I must have...,2018-02-03T22:38:54Z,FUNNIEST TEXTS FROM PARENTS,4045355.0,94099.0,1198.0,6457.0,PT10M9S,FUNNIEST TEXTS FROM PARENTS! Funny text messag...,SSSniperWolf,US,16180860000.0,30000000.0,2828.0


#### Check if any channel is represented more than others

In [7]:
# count rows per channel (size of the 'title' values) and show the ten largest channels
(
    pd.pivot_table(df, values='title', index=['channel'], aggfunc=np.size)
    .sort_values(by="title", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title
channel,Unnamed: 1_level_1
SSSniperWolf,151
MrBeast,38
Nickelodeon,18
TED-Ed,16
Colleen Ballinger,16
Mark Rober,16
MrBeast Gaming,15
Lilly Singh,15
Dhar Mann,14
ZHC Crafts,14


#### Reduce the number of rows from SSSniperWolf and MrBeast

    NOTE: I later decided to keep all videos so that more data is available; the following cells are commented out and have no effect.

In [8]:
# # create SSSniperWolf mask
# sniper_df = df[df['channel'] == 'SSSniperWolf']
# # randomly sample 125 of SSSniperWolf's 151 rows to remove (roughly 83% of the channel's videos)
# remove_sniper_df = sniper_df.sample(125)
# # assign index values to a list for removal
# sniper_index_list = remove_sniper_df.index.values.tolist()
# df.drop(sniper_index_list, inplace=True)

In [9]:
# # create MrBeast mask
# beast_df = df[df['channel'] == 'MrBeast']
# # randomly sample 18 of MrBeast's 38 rows to remove (roughly half of the channel's videos)
# remove_beast_df = beast_df.sample(18)
# # assign index values to a list for removal
# beast_index_list = remove_beast_df.index.values.tolist()
# df.drop(beast_index_list, inplace=True)

In [10]:
# # drop rows with no title and all Fox News videos, then reset the df's index
# df = df[df['title'].notna()]
# df = df[df['channel'] != 'Fox News']
# df = df.reset_index(drop=True)
# df.shape

In [11]:
# # check the pivot table now
# pd.pivot_table(df, index=['channel'], values='title', aggfunc=np.size).sort_values("title", ascending=False)[:10]

### Clean and Create additional columns

In [12]:
# create column that identifies if a video has music in it or not (1 = yes, 0 = no):
# look for YouTube's literal "[Music]" caption marker in the transcript.
# regex=False matches the brackets literally; na=False scores a missing transcript
# as 0 instead of raising on a non-string value
df['music'] = df['videoText'].str.contains('[Music]', regex=False, na=False).astype('int64')

In [13]:
# keep only the calendar-date part of each timestamp (the text before 'T')
# and parse it into a pandas datetime
df['date'] = pd.to_datetime(
    df['date'].map(lambda stamp: str(stamp).partition('T')[0])
)

In [14]:
# remove videos before 2015: keep everything published on or after 2015-01-01
# (the previous strict '>' also discarded videos published on 2015-01-01 itself).
# Rows whose date is missing (NaT) never satisfy the comparison and are dropped too.
# .copy() makes the result an independent frame, so later column assignments do not
# write into a slice of the pre-filter dataframe
df = df[df['date'] >= '2015-01-01'].copy()

In [15]:
# convert the video length column to a readable format
# NOTE(review): vidLengthConverter is not defined in this notebook (presumably it comes
# from the wildcard `my_functions` import); it is assumed to turn duration strings such
# as 'PT6M22S' into 'H:M:S' text, which the strptime format below requires — confirm
df['vidLength'] = vidLengthConverter(df['vidLength'])  # custom function imported
# convert vidLength to dateTime from string
# (each value becomes a datetime.time; strptime raises ValueError for text that does
# not match '%H:%M:%S')
df['vidLength'] = df['vidLength'].map(lambda x: datetime.strptime(x, '%H:%M:%S').time())

### Text Preprocessing

In [16]:
def preprocessor(text):
    """Normalize one raw transcript into lowercase, space-separated words.

    Steps, in order: drop bracketed/parenthesized caption fillers, expand
    contractions, delete ASCII punctuation, lowercase, and collapse every run
    of non-word characters into a single space.

    Parameters
    ----------
    text : str
        Raw transcript. Any non-string value (e.g. the NaN pandas uses for a
        missing transcript) is treated as an empty transcript.

    Returns
    -------
    str
        The cleaned text followed by one trailing space, so cleaned documents
        can later be concatenated without fusing their boundary words.
    """
    # a missing transcript has nothing to clean; return the same shape of
    # result (text + ' ') that an empty string would produce
    if not isinstance(text, str):
        return ' '
    # remove all words in brackets and parenthesis
    # --> these are fillers that Youtube uses to identify general sounds like
    #     Music, Applause, etc., which are not helpful to our overall goal
    # (raw string: the backslash escapes belong to the regex, not to Python)
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    # fix all contractions
    text = contractions.fix(text)
    # remove all punctuation; characters are deleted, not replaced by a space,
    # so e.g. "well-known" becomes "wellknown"
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # strip, lowercase, and squeeze any remaining non-word runs into one space
    text = re.sub(r'[\W]+', ' ', text.strip().lower())
    return text + ' '

In [17]:
# run every raw transcript through preprocessor and store the result in a new column
df['textCleaned'] = df['videoText'].apply(preprocessor)

In [18]:
# preview the first cleaned transcripts
df['textCleaned'].head()

0    i am most of all happy and grateful to finally...
1    lilly you are almost old enough we think it is...
2    yum these are my favorite oh god that is so sw...
3    what is up guys welcome back to my youtube cha...
4    bunny checks from parents i must have gotten t...
Name: textCleaned, dtype: object

In [19]:
# reset the df's index
# (drop=True discards the old labels, so rows are numbered 0..n-1 again after the
# earlier row filtering)
df = df.reset_index(drop=True)
df.shape

(1153, 17)

### Explore Topic Modeling

Remove all stop words

In [20]:
# extra filler words to drop on top of scikit-learn's built-in English list;
# append to add_words to extend the set
add_words = ['hmm', 'um', 'uh', 'oh', 'okay', 'yeah', 'la', 'cause', 'wanna', 'right', 'baby', 'gosh', 'isnt', 'hey', 'wait', 'the', 'ah']

stop_words = ENGLISH_STOP_WORDS | frozenset(add_words)
# split on a single space (not general whitespace) so the empty token produced by a
# trailing space survives the join and each document keeps its trailing space
df['textCleaned'] = df['textCleaned'].map(
    lambda doc: ' '.join(token for token in doc.split(' ') if token not in stop_words)
)

In [21]:
# preview the text after stop-word removal
df['textCleaned'].head()

0    happy grateful finally mighty hospital met tim...
1    lilly old think time talk mom 32 years old chi...
2    yum favorite god sweet ew god teeth covered mo...
3    guys welcome youtube channel guy know just lef...
4    bunny checks parents gotten stupid virus phone...
Name: textCleaned, dtype: object

Lemmatize words

In [22]:
# snapshot of the text before lemmatization, for comparison with the lemmatized preview
df['textCleaned'].head()

0    happy grateful finally mighty hospital met tim...
1    lilly old think time talk mom 32 years old chi...
2    yum favorite god sweet ew god teeth covered mo...
3    guys welcome youtube channel guy know just lef...
4    bunny checks parents gotten stupid virus phone...
Name: textCleaned, dtype: object

In [23]:
# lemmatize every remaining token with WordNet; tokens are split on a single space so
# the spacing of each document (including a trailing space) is preserved by the join
lemmer = WordNetLemmatizer()
df['textCleaned'] = df['textCleaned'].apply(
    lambda doc: ' '.join(
        lemmer.lemmatize(token) for token in doc.split(' ') if token not in stop_words
    )
)

In [24]:
# preview the lemmatized text
df['textCleaned'].head()

0    happy grateful finally mighty hospital met tim...
1    lilly old think time talk mom 32 year old chir...
2    yum favorite god sweet ew god teeth covered mo...
3    guy welcome youtube channel guy know just left...
4    bunny check parent gotten stupid virus phone s...
Name: textCleaned, dtype: object

In [25]:
# export full cleaned dataframe to csv
# (index=False leaves the row index out of the file)
df.to_csv('cleanedData.csv', index=False)

Create a channel-based dataframe (one concatenated document per channel)

In [26]:
# concatenate every cleaned transcript that belongs to the same channel into one string
userText = df.groupby(by='channel')['textCleaned'].sum()

In [27]:
# wrap the per-channel strings in a one-column dataframe that keeps the channel index
userText_df = userText.to_frame(name=0)

In [28]:
# name the single column that holds each channel's concatenated text
userText_df.columns = ['documents']

In [29]:
# preview the documents of the first ten channels
userText_df.head(10)

Unnamed: 0_level_0,documents
channel,Unnamed: 1_level_1
1theK (원더케이),wrong good expressing feeling warmhearted woma...
ABC News,ultimate key unlocking disruption contending a...
AWESOME WORLD,day army trade mres like cheesecloth tried kor...
AaronsAnimals,faster gaining catch sothis thing faster wooho...
Adam Norris,going push beasley video number 10 today slave...
Adobe Asia Pacific,hi everybody jeremy lord freelance illustrator...
Adventure Athlete,guy clark hazlit better known adventure athlet...
Airrack,america largest maze let like maze best friend...
Aissata Amadou,hi guy welcome welcome channel today video goi...
Alan Becker,welcome window 7 computer generation people us...


In [30]:
# inspect a random grouping: character count of the document at row position 425.
# The index holds channel names, so the integer has to go through .iloc; plain [425]
# relies on the deprecated positional fallback of Series.__getitem__
len(userText_df['documents'].iloc[425])

1354

In [31]:
# export the per-channel documents; index=True writes the channel names as the first column
userText_df.to_csv('channelData.csv', index=True)