In [1]:
import json
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import pickleshare

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kovre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kovre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# loading and cleaning GBcomments dataframe
df_gb_comments = pd.read_csv('base_data/GBcomments.csv')
df_gb_comments = df_gb_comments.iloc[:, :4]
df_gb_comments['likes'] = pd.to_numeric(df_gb_comments['likes'], errors='coerce')
df_gb_comments['replies'] = pd.to_numeric(df_gb_comments['replies'], errors='coerce')
df_gb_comments = df_gb_comments.dropna()

# loading and cleaning UScomments dataframe
df_us_comments = pd.read_csv('base_data/UScomments.csv', encoding_errors='replace',
                             usecols=df_gb_comments.columns[0:4],
                             dtype={df_gb_comments.columns[1]: 'str', df_gb_comments.columns[2]: 'str',
                                    df_gb_comments.columns[3]: 'str'})
df_us_comments = df_us_comments.iloc[:, :4]
df_us_comments = df_us_comments[df_us_comments['video_id'].map(lambda x: len(x)) == 11]
df_us_comments['likes'] = pd.to_numeric(df_us_comments['likes'], errors='coerce')
df_us_comments['replies'] = pd.to_numeric(df_us_comments['replies'], errors='coerce')
df_us_comments = df_us_comments.dropna()

# Creating combined comments dataframe
df_gb_us_comments = pd.concat([df_gb_comments, df_us_comments], axis='index', ignore_index=True)
df_gb_us_comments = df_gb_us_comments[df_gb_us_comments['video_id'].str.match(r'[a-zA-Z0-9-_]{11}')]

# loading GBvideos dataframe
df_gb_videos = pd.read_csv('base_data/GBvideos.csv')
df_gb_videos = df_gb_videos.dropna()
df_gb_videos = df_gb_videos[df_gb_videos['video_id'].map(lambda x: len(x)) == 11]

# loading USvideos dataframe
df_us_videos = pd.read_csv('base_data/USvideos.csv')
df_us_videos = df_us_videos.iloc[:, :11]
df_us_videos = df_us_videos.dropna()
df_us_videos = df_us_videos[df_us_videos['video_id'].map(lambda x: len(x)) == 11]

# Creating combined videos dataframe
df_gb_us_videos = pd.concat([df_gb_videos, df_us_videos])
df_gb_us_videos = df_gb_us_videos.set_index(np.arange(len(df_gb_us_videos)))
df_gb_us_videos = df_gb_us_videos.loc[df_gb_us_videos.groupby('video_id')['views'].idxmax()]

# Loading and reformating categories
f = open('base_data/GB_category_id.json')
json_gb_category = json.load(f)
f.close()

values_gb = []
for i in json_gb_category['items']:
    values_gb.append([i['id'], i['snippet']['title']])
df_gb_category = pd.DataFrame(values_gb)

f = open('base_data/US_category_id.json')
json_us_category = json.load(f)
f.close()

values_us = []
for i in json_us_category['items']:
    values_us.append([i['id'], i['snippet']['title']])
df_us_category = pd.DataFrame(values_us)  # [[index1, value], [index1, value], [], []

# Merging videos, comments and categories into one dataframe
df_gb_us_categories = pd.merge(df_gb_category, df_us_category, how='outer', on=[0, 1])
df_gb_us_categories = df_gb_us_categories.set_axis(['category_id', 'category_name'], axis=1)
df_gb_us_categories = df_gb_us_categories.set_index(df_gb_us_categories['category_id']).drop(axis='columns',
                                                                                             labels='category_id')
df_gb_us_videos['category_id'] = df_gb_us_videos['category_id'].map(
    lambda x: df_gb_us_categories.loc[str(x)]['category_name'])
df_final = pd.merge(df_gb_us_videos, df_gb_us_comments, how='outer', on='video_id')
df_final = df_final.rename(columns={'likes_x': 'likes_video', 'likes_y': 'likes_comment'})
df_final = df_final.drop(['thumbnail_link', 'date'], axis='columns')
df_final = df_final.dropna()
filtered = df_final['comment_text'].str.contains(r'(https|http)://(\w*)?\.?(\w*)\.(\w*)(/[\w_=+-?!&\.]*)*')
df_final = df_final[~filtered]

  df_gb_comments = pd.read_csv('base_data/GBcomments.csv')
  filtered = df_final['comment_text'].str.contains(r'(https|http)://(\w*)?\.?(\w*)\.(\w*)(/[\w_=+-?!&\.]*)*')


In [3]:
#Creating dataframe with comments by category
df_comment_by_category = df_final.groupby(['category_id'])['comment_text'].apply(lambda x: " ".join(x))

In [4]:
#Cleaning comments
df_comment_by_category = df_comment_by_category.str.lower()
df_comment_by_category = df_comment_by_category.apply(lambda x: re.sub(f'[{chr(8205)}-{chr(129784)}]', ' ', x))
df_comment_by_category = df_comment_by_category.apply(lambda x: re.sub(r'\\n', '', x))
df_comment_by_category = df_comment_by_category.apply(lambda x: re.sub(r'[:!~?&)(*-<>=+]', '', x))
df_comment_by_category = df_comment_by_category.apply(lambda x: re.sub(r'[.,\d]*', '', x))

In [5]:
#This function gets the informative words from the text and counts their number of occurrences
def count_words(comment_text):
    word_count = {}
    for a in comment_text.split():
        if len(a) >= 3:
            if a in word_count.keys():
                word_count[a] += 1
            else:
                word_count[a] = 1
    for r in list(word_count.keys()):
        if r in stopwords.words('english'):
            word_count.pop(r)
    return list(sorted(word_count.items(), key= lambda item: item[1], reverse=True))[:15]

df_comment_by_category_stats = df_comment_by_category.to_frame()
df_comment_by_category_stats['nb_words'] = df_comment_by_category_stats['comment_text'].apply(lambda x: len([word for word in x.split()]))
df_comment_by_category_stats['top_words'] = df_comment_by_category_stats['comment_text'].apply(count_words)

In [6]:
#In this section I work with dataframes to gather useful information
df_categories_stats = df_final.iloc[:, :-3].groupby('category_id')[['views', 'likes_video', 'dislikes', 'comment_total']].sum().sort_values(
    by='views', ascending=False)
df_categories_stats.insert(3, 'total_reactions', df_categories_stats['likes_video'] + df_categories_stats['dislikes'])

In [7]:
%store df_categories_stats
%store df_comment_by_category
%store df_comment_by_category_stats

Stored 'df_categories_stats' (DataFrame)
Stored 'df_comment_by_category' (Series)
Stored 'df_comment_by_category_stats' (DataFrame)
