# YouTube Trending Video Statistics (2017-2018)
It's important to note that this **only** contains trending videos from a certain time period, so any statistics calculated only apply to this population.
Also, a single video can trend on several different days, so the dataset contains duplicate entries for the same video. I keep those duplicates in the data and only exclude them for certain calculations.

Here's the raw data, with category ids mapped to their string values for better readability.
<br>
**Note:** Loading and mapping the 40949 rows takes a while, but everything is a breeze after that.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json 
import datetime as dt

csv_file = 'USvideos.csv'
json_file = 'US_category_id.json'

# Load the raw trending records, indexed by video id. The same id can appear
# on several rows because a video may trend on more than one day.
trending_videos = pd.read_csv(csv_file, index_col='video_id')

# Read the category definitions inside a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open(json_file, 'r', encoding='utf-8') as category_file:
    categories_json = json.load(category_file)

# Two-way lookup between the (string) category id and its title.
categories_by_id = {
    category['id']: category['snippet']['title']
    for category in categories_json['items']
}
categories_by_name = {
    category['snippet']['title']: category['id']
    for category in categories_json['items']
}

# The JSON stores ids as strings, so compare against the stringified column.
category_keys = trending_videos['category_id'].astype(str)

# Fail loudly (KeyError, as a plain dict lookup would) when the CSV contains
# an id the JSON file does not describe, rather than producing missing values.
unknown_ids = sorted(set(category_keys) - set(categories_by_id))
if unknown_ids:
    raise KeyError(
        "category ids missing from {}: {}".format(json_file, unknown_ids)
    )

# Replace every numeric id with its readable title in one vectorized pass;
# each row is mapped from its own id.
trending_videos['category_id'] = category_keys.map(categories_by_id)

trending_videos

In [None]:
# Bare expression: evaluates to the trending_videos DataFrame so it is shown
# as this cell's output.
trending_videos

### Simple pie chart showing the distribution of views in trending videos


The videos are grouped into the categories 'Under 100k views', '100k to 500k views', '500k to 1m views', '1m to 3m views', and '3m and above views.'

In [None]:
# Pie chart of how many trending rows fall into each view-count bucket.
labels = 'Under 100k views', '100k to 500k views', '500k to 1m views', '1m to 3m views', '3m and above views'

# Bucket edges. Each bucket is (lower, upper]: a row with exactly 100,000
# views counts as "Under 100k views", and rows with 0 views are not counted.
bucket_edges = [0, 100000, 500000, 1000000, 3000000, float('inf')]

# Work on the whole column at once instead of indexing the Series by integer
# position (the index holds video-id labels, not positions).
view_counts = trending_videos['views']
views_list = view_counts.tolist()

# One count per label, in the same order as `labels`.
data = [
    int(((view_counts > lower) & (view_counts <= upper)).sum())
    for lower, upper in zip(bucket_edges[:-1], bucket_edges[1:])
]

fig1, ax1 = plt.subplots()
ax1.pie(data, labels=labels, autopct='%1.1f%%', startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### Distribution of trending videos by category

It's pretty easy to see that certain categories of videos are more popular, or more likely to trend, than others. The biggest ones are Entertainment, Music, Howto & Style, People & Blogs, and Comedy, which together make up about two-thirds of the dataset!

In [None]:
# Pie chart of the number of trending rows per category.

# groupby(sort=False) keeps categories in order of first appearance, which is
# the order a dict filled row by row would have; size() counts the rows.
category_counts = trending_videos.groupby('category_id', sort=False).size()
categories_list = category_counts.to_dict()

fig1, ax1 = plt.subplots()
# Pass plain lists so matplotlib receives ordinary sequences.
ax1.pie(
    list(categories_list.values()),
    labels=list(categories_list.keys()),
    autopct='%1.1f%%',
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### Distribution of trending videos by category (minority grouped as other)

With the tons of tiny categories lost in there, I thought it would be better to group them as an 'other' category, which makes the graph much more readable. Here it is below!

In [None]:
# Pie chart of trending rows per category, with small categories merged into
# a single 'other' slice so the chart stays readable.

# Categories with fewer rows than this are folded into 'other'.
min_category_rows = 900

# Row count per category, in order of first appearance.
category_counts = trending_videos.groupby('category_id', sort=False).size()
is_minor = category_counts < min_category_rows

categories_to_delete = list(category_counts.index[is_minor])
categories_list = category_counts[~is_minor].to_dict()
# 'other' is always present (possibly 0) and always the last slice.
categories_list['other'] = int(category_counts[is_minor].sum())

fig1, ax1 = plt.subplots()
ax1.pie(
    list(categories_list.values()),
    labels=list(categories_list.keys()),
    autopct='%1.1f%%',
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### Trending Tag Statistics

I thought it would be cool to track usage of tags in the many different trending videos, and see if there was any pattern in their usage over time and by categories.

In [None]:
# Interactive tag search: asks for a tag, then reports how often it was used
# per month, which categories used it, and which rows matched.
pd.set_option('display.max_rows', 10)
tag = input("Enter the tag to search:")
unique = input("Show duplicate results (Y/N)? (videos that went trending more than once):").lower()
match_condition = input("Match term (Y/N)? (Excludes tags that may contain the term in it but not by itself):").lower()
# Lower-cased like the other answers so an upper-case "Y" is honoured too.
results = input("Show all results (Y/N)?").lower()
if results == "y":
    pd.set_option('display.max_rows', 1000000)

# Answering "n" to "show duplicates" means each video id is counted once.
unique = unique == "n"
# Whole-tag matching is the default; only an explicit "n" switches to
# substring matching against the raw tags string.
match_condition = match_condition != "n"

months_from_2017 = {}       # month number (13+ for 2018) -> matching rows
unique_keys = {}            # video ids already counted (only filled when unique)
tag_count = 0
category_association = {}   # category -> counted matches
categories_to_delete = []
# One flag per row, by position, marking the rows that were counted. A mask
# avoids writing into a copied DataFrame through chained indexing.
matched_rows = np.zeros(len(trending_videos), dtype=bool)

rows = zip(
    trending_videos.index,
    trending_videos['tags'],
    trending_videos['trending_date'],
    trending_videos['category_id'],
)
for position, (video_id, tags, trending_date, category) in enumerate(rows):
    if match_condition:
        # Split the pipe-separated tags so only whole tags can match.
        tags = tags.replace('"', '').replace("[none]", "").split('|')
    if tag not in tags:
        continue

    # The code reads the first dot-separated field as the two-digit year and
    # the third as the month; months of year "18" are shifted by 12.
    trending_date_text = trending_date.split(".")
    months = int(trending_date_text[2])
    if trending_date_text[0] == "18":
        months += 12
    # Monthly usage counts every matching row, including repeat appearances.
    months_from_2017[months] = months_from_2017.get(months, 0) + 1

    if video_id in unique_keys:
        continue
    matched_rows[position] = True
    tag_count += 1
    category_association[category] = category_association.get(category, 0) + 1
    if unique:
        unique_keys[video_id] = True

# Fold categories holding under 4% of the matches into one 'other' slice.
category_association['other'] = 0
for category in category_association:
    if category_association[category] < 0.04 * tag_count and category != 'other':
        category_association['other'] += category_association[category]
        categories_to_delete.append(category)
for category in categories_to_delete:
    del category_association[category]
print("Found " + str(tag_count) + " results with tag '" + tag + "'.")

# Plot months in ascending order so the connecting line runs left to right.
mx = sorted(int(month) for month in months_from_2017)
mv = [int(months_from_2017[month]) for month in mx]

# With no matches there is nothing to plot, and a pie whose only slice is 0
# cannot be normalised, so the charts are skipped.
if tag_count > 0:
    scatterboi = plt.scatter(mx, mv)  # x: month number, y: matching rows that month
    plt.plot(mx, mv, linestyle='-', marker='o')
    plt.xlabel("Months since the start of 2017")
    plt.ylabel("Tag usage")
    ax = plt.gca()  # the axes the scatter/line were drawn on
    ax.set_facecolor('#FFFFFF')  # white plot background

    fig1, ax1 = plt.subplots()
    ax1.pie(
        list(category_association.values()),
        labels=list(category_association.keys()),
        autopct='%1.1f%%',
        startangle=90,
    )
    ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.show()

# Kept for backward compatibility: a copy whose 'tags' column is overwritten
# with the search term on the counted rows.
operation = trending_videos.copy()
operation.loc[matched_rows, 'tags'] = tag

# Show exactly the rows that were counted above.
trending_videos[matched_rows]

### Top 100 most common tags used

In [None]:
# Table of the 100 most frequently used tags across all trending rows.

# Strip the quotes and the "[none]" placeholder, then split the pipe-separated
# tags and give every tag its own row. regex=False keeps the replacements
# literal ("[none]" would otherwise be read as a character class).
individual_tags = (
    trending_videos['tags']
    .str.replace('"', '', regex=False)
    .str.replace('[none]', '', regex=False)
    .str.split('|')
    .explode()
)
# Empty strings are left over from "[none]" and from stray separators.
tag_counts = individual_tags[individual_tags != ""].value_counts()
tags_list = tag_counts.to_dict()

# One row per tag, indexed by the tag, with its number of uses.
tags_df = tag_counts.rename_axis('tag').to_frame('uses')
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['uses'], ascending=False).head(100)

### Top channels by most trending videos

In [None]:
# Table of the 100 channels with the most trending rows.

channel_titles = trending_videos['channel_title']
# Blank titles are skipped; value_counts also leaves out missing values.
channel_counts = channel_titles[channel_titles != ""].value_counts()
channels_list = channel_counts.to_dict()

# NOTE: the result is stored in tags_df (the name the other table cells use)
# even though it holds channels here.
tags_df = channel_counts.rename_axis('channel').to_frame('videos')
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['videos'], ascending=False).head(100)

### Top categories by most trending videos

In [None]:
# Table of categories ranked by their number of trending rows
# (repeat appearances of the same video are all counted).

# Row count per category, in order of first appearance.
category_counts = trending_videos.groupby('category_id', sort=False).size()
categories_list = category_counts.to_dict()

# NOTE: the result is stored in tags_df (the name the other table cells use)
# even though it holds categories here.
tags_df = category_counts.rename_axis('category').to_frame('uses')
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['uses'], ascending=False)

### Top categories by most trending videos (unique)

In [None]:
# Table of categories ranked by their number of distinct trending videos:
# each video id is counted once, using its first row in the dataset.

# The index holds the video ids; keep only the first row of every id.
first_appearances = trending_videos[~trending_videos.index.duplicated(keep='first')]
videos = dict.fromkeys(first_appearances.index, True)

# Distinct-video count per category, in order of first appearance.
category_counts = first_appearances.groupby('category_id', sort=False).size()
categories_list = category_counts.to_dict()

# NOTE: the result is stored in tags_df (the name the other table cells use)
# even though it holds categories here.
tags_df = category_counts.rename_axis('category').to_frame('uses')
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['uses'], ascending=False)

### Average Trending Frequency of Each Video by Category

In [None]:
# Table of, per category, the average number of rows each distinct video has
# in the dataset (total rows / distinct video ids).

# True on the first row of every video id (the index holds the ids).
is_first_appearance = ~trending_videos.index.duplicated(keep='first')
videos = dict.fromkeys(trending_videos.index[is_first_appearance], True)

# All rows per category, and distinct videos per category; both keep the
# order in which categories first appear.
appearances_per_category = trending_videos.groupby('category_id', sort=False).size()
videos_per_category = (
    trending_videos[is_first_appearance].groupby('category_id', sort=False).size()
)
categories_list = appearances_per_category.to_dict()
unique_categories_list = videos_per_category.to_dict()

# The division aligns on the category label; round to two decimals.
average_appearances = (appearances_per_category / videos_per_category).round(2)

# NOTE: the result is stored in tags_df (the name the other table cells use)
# even though it holds categories here.
tags_df = average_appearances.rename_axis('category').to_frame(
    'avg times a video from this category went trending'
)
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['avg times a video from this category went trending'], ascending=False)

### Distribution of views in all trending videos

In [None]:
# Histogram of the view counts of all trending rows, split into 500 bins.

# Take the whole column at once rather than indexing the Series by integer
# position (the index holds video-id labels, not positions).
views = trending_videos['views'].tolist()
plt.hist(views, bins=500, color='#ffff00', edgecolor='b')
plt.xlabel("Views")
ax = plt.gca()  # the axes the histogram was drawn on
ax.set_facecolor('#FFFFFF')  # white plot background

### Likes, Dislikes, Views, and Comments Statistics by Category

**Ordered by Like to Dislike ratio**

In [None]:
# Per-category totals of views, likes, dislikes and comments, plus the ratios
# between them. Each video id contributes once, using its first row.

# The index holds the video ids; keep only the first row of every id.
first_appearances = trending_videos[~trending_videos.index.duplicated(keep='first')]
videos = dict.fromkeys(first_appearances.index, True)

# Column sums per category, in order of first appearance.
totals = first_appearances.groupby('category_id', sort=False)[
    ['views', 'likes', 'dislikes', 'comment_count']
].sum()

categories_views = totals['views'].to_dict()
categories_likes = totals['likes'].to_dict()
categories_dislikes = totals['dislikes'].to_dict()
categories_comments = totals['comment_count'].to_dict()

# Ratios are rounded to two decimals. A zero denominator gives inf/NaN in the
# table instead of stopping the cell.
tags_df = pd.DataFrame({
    'total views': totals['views'],
    'total likes': totals['likes'],
    'total dislikes': totals['dislikes'],
    'comments': totals['comment_count'],
    'view to like ratio': (totals['views'] / totals['likes']).round(2),
    'view to comment ratio': (totals['views'] / totals['comment_count']).round(2),
    'like to dislike ratio': (totals['likes'] / totals['dislikes']).round(2),
    'like to comment ratio': (totals['likes'] / totals['comment_count']).round(2),
}).rename_axis('category')
pd.set_option('display.max_rows', 1000000)
tags_df.sort_values(['like to dislike ratio'], ascending=False)

**Ordered by Like to Comment ratio**

In [None]:
# Show tags_df ordered by its 'like to comment ratio' column, highest first.
# NOTE: tags_df is reassigned by several cells, so this needs the cell that
# builds the like/dislike/comment ratio columns to have run most recently.
tags_df.sort_values(['like to comment ratio'], ascending=False)

**Ordered by Views to Likes ratio**

In [None]:
# Show tags_df ordered by its 'view to like ratio' column, highest first.
# NOTE: tags_df is reassigned by several cells, so this needs the cell that
# builds the like/dislike/comment ratio columns to have run most recently.
tags_df.sort_values(['view to like ratio'], ascending=False)

**Ordered by Views to Comments ratio**

In [None]:
# Show tags_df ordered by its 'view to comment ratio' column, highest first.
# NOTE: tags_df is reassigned by several cells, so this needs the cell that
# builds the like/dislike/comment ratio columns to have run most recently.
tags_df.sort_values(['view to comment ratio'], ascending=False)

### Comparing two statistics of a Category with a Scatterplot

In [None]:
# Interactive scatterplot comparing two columns for the rows of one category,
# with a straight line of best fit drawn through the points.
category_response = input("What category would you like to view the graph of?").strip()
comparator1 = input("Comparator 1 (views/likes/dislikes/comment_count)?").strip()
comparator2 = input("Comparator 2 (views/likes/dislikes/comment_count)?").strip()

# Reject a mistyped column name with a message that names it.
for comparator in (comparator1, comparator2):
    if comparator not in trending_videos.columns:
        raise KeyError("Unknown comparator column: '" + comparator + "'")

sample = trending_videos[trending_videos['category_id'] == category_response]
# An unknown (or mis-cased) category selects no rows; stop here with a clear
# message instead of failing later inside the line fit.
if sample.empty:
    raise ValueError("No trending videos found for category '" + category_response + "'")

sample_x = list(sample[comparator1])
sample_y = list(sample[comparator2])
scatterboi = plt.scatter(sample_x, sample_y)  # x: comparator 1, y: comparator 2
plt.xlabel(category_response + " " + comparator1)
plt.ylabel(category_response + " " + comparator2)

# A degree-1 fit needs at least two distinct x values to define a line.
distinct_x = np.unique(sample_x)
if len(distinct_x) >= 2:
    best_fit = np.poly1d(np.polyfit(sample_x, sample_y, 1))
    plt.plot(distinct_x, best_fit(distinct_x))  # line of best fit

ax = plt.gca()  # the axes the scatter was drawn on
ax.set_facecolor('#000000')  # black plot background