# YouTube subscribers and views relation

In [None]:
# read API key from the first line of the local '.g_api_key' file
with open('.g_api_key') as f:
    # strip surrounding whitespace instead of blindly dropping the last
    # character: slicing with [:-1] truncated the key whenever the file
    # did not end with a newline
    api_key = f.readline().strip()

In [None]:
import requests
# HTTP headers passed with every API request made by the helper functions below
# NOTE(review): presumably needed because the API key is restricted by HTTP referrer — confirm
headers = {'Referer': "http://localhost:8888"}

import json

## Collect data

### Channels info

- channel title
- stats for channel:
    - subscribers
    - total views
    - videos count
- uploads playlist ID

In [None]:
# load the channels file; its 'ids' entry is picked out in the next cell
with open('channels.json', mode='r') as channels_file:
    channel_ids = json.load(channels_file)

In [None]:
# select channel IDs: the sequence stored under the 'ids' key of the loaded JSON
ids_array = channel_ids['ids']
# cell output: how many channel IDs there are to process
len(ids_array)

In [None]:
def get_channels_data(ids):
    """Fetch channel resources for the given channel IDs from the YouTube Data API.

    Requests the ``snippet``, ``contentDetails`` and ``statistics`` parts in a
    single ``channels`` call, authenticated with the module-level ``api_key``
    and sent with the module-level ``headers``.

    Args:
        ids: iterable of channel IDs; each one is converted with ``str`` and
            all of them are joined with commas into one ``id`` parameter.

    Returns:
        The list stored under the response's ``items`` key, or an empty list
        when ``ids`` is empty or the response carries no ``items`` key.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    joined_ids = ','.join(str(channel_id) for channel_id in ids)
    if not joined_ids:
        # nothing to ask for; avoid sending a request without a filter
        return []

    response = requests.get(
        "https://www.googleapis.com/youtube/v3/channels",
        # let requests build and URL-encode the query string
        params={
            'part': "snippet,contentDetails,statistics",
            'id': joined_ids,
            'key': api_key,
        },
        headers=headers,
        # do not let a stalled connection hang the notebook forever
        timeout=30,
    )
    # surface API errors explicitly instead of a confusing KeyError on 'items'
    response.raise_for_status()
    return response.json().get('items', [])

In [None]:
channels_data = []

# the IDs are requested in batches of 50 to pass the request length limit
chunk_size = 50

# one request per batch of IDs; the size of every answer is printed
for start in range(0, len(ids_array), chunk_size):
    batch = get_channels_data(ids_array[start:start + chunk_size])
    print(len(batch))
    channels_data.extend(batch)

# cell output: total number of channels collected
len(channels_data)

In [None]:
# sample channel
# channels_data[0]

### Videos info

Get the list of a channel's videos by passing a playlist ID — in this case, the ID of the channel's uploads playlist

In [None]:
# sample ID
# channels_data[0]['contentDetails']['relatedPlaylists']['uploads']

In [None]:
def get_channel_uploads(list_id, max_results=5):
    """Fetch items of a playlist from the YouTube Data API.

    Requests the ``contentDetails`` part of ``playlistItems`` for one
    playlist, authenticated with the module-level ``api_key`` and sent with
    the module-level ``headers``. Only a single page of results is read.

    Args:
        list_id: ID of the playlist to list.
        max_results: number of items to request. Defaults to 5, which is the
            API's documented default page size, so callers that do not pass
            it get the same result as before the parameter existed.

    Returns:
        The list stored under the response's ``items`` key, or an empty list
        when ``list_id`` is empty or the response carries no ``items`` key.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    if not list_id:
        # no playlist to query; avoid sending a request that can only fail
        return []

    response = requests.get(
        "https://www.googleapis.com/youtube/v3/playlistItems",
        # let requests build and URL-encode the query string
        params={
            'part': "contentDetails",
            'playlistId': list_id,
            'maxResults': max_results,
            'key': api_key,
        },
        headers=headers,
        # do not let a stalled connection hang the notebook forever
        timeout=30,
    )
    # surface API errors explicitly instead of a confusing KeyError on 'items'
    response.raise_for_status()
    return response.json().get('items', [])

In [None]:
# the uploads are fetched channel by channel: only one list ID per request is allowed
total = len(channels_data)
for i, channel in enumerate(channels_data, start=1):
    # every channel object carries the ID of its own uploads playlist
    playlist_id = channel['contentDetails']['relatedPlaylists']['uploads']
    channel['uploads'] = get_channel_uploads(playlist_id)
    # progress counter, rewritten in place on one line
    print(i, "/", total, end="\r")

In [None]:
# sample list of uploads
# channels_data[0]['uploads']

In [None]:
# keys of channel object
# channels_data[0].keys()

#### Stats

Request statistics for every video by its ID; they are used to calculate the per-channel stats

In [None]:
def get_videos_info(ids):
    """Fetch statistics of the given videos from the YouTube Data API.

    Requests the ``statistics`` part in a single ``videos`` call,
    authenticated with the module-level ``api_key`` and sent with the
    module-level ``headers``.

    Args:
        ids: iterable of video IDs; each one is converted with ``str`` and
            all of them are joined with commas into one ``id`` parameter.

    Returns:
        The list stored under the response's ``items`` key, or an empty list
        when ``ids`` is empty or the response carries no ``items`` key.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    joined_ids = ','.join(str(video_id) for video_id in ids)
    if not joined_ids:
        # a channel may have no usable uploads; skip the request entirely
        return []

    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        # let requests build and URL-encode the query string
        params={
            'part': "statistics",
            'id': joined_ids,
            'key': api_key,
        },
        headers=headers,
        # do not let a stalled connection hang the notebook forever
        timeout=30,
    )
    # surface API errors explicitly instead of a confusing KeyError on 'items'
    response.raise_for_status()
    return response.json().get('items', [])

In [None]:
# attach video statistics to every channel under the 'uploads_info' key
for i, channel in enumerate(channels_data, start=1):
    # skip the first playlist item: it can be very fresh and not have
    # collected its views yet
    # NOTE(review): assumes the uploads playlist lists the newest video first — confirm
    ids = [
        list_item['contentDetails']['videoId']
        for list_item in channel['uploads'][1:]
    ]
    # no cell-level variable named `id` here: that would shadow the builtin
    # `id` for the rest of the kernel session
    channel['uploads_info'] = get_videos_info(ids)
    # progress counter, rewritten in place on one line
    print(i, "/", len(channels_data), end="\r")

In [None]:
# sample video stats
# channels_data[0]['uploads_info']

## Prepare data

- channel title
- stats for channel:
    - subscribers
    - total views
    - videos count
    - average views for last 4 videos (excluding the most recent one)

In [None]:
# Build one flat record per channel; channels with missing stats are left out.
data = []
for i, channel in enumerate(channels_data, start=1):
    statistics = channel['statistics']

    # the subscriber count may be absent from the statistics
    # NOTE(review): presumably when the channel hides it — confirm against the API docs
    # a missing count is read as 0, so the channel is filtered out below
    # instead of aborting the whole cell with a KeyError
    subscribers = int(statistics.get('subscriberCount', 0))

    # view counts of the fetched uploads; None marks a video without the stat
    view_counts = [
        upload['statistics'].get('viewCount')
        for upload in channel['uploads_info']
    ]
    # a single video without a view count disqualifies the whole channel
    complete = bool(view_counts) and None not in view_counts
    last_views = sum(int(count) for count in view_counts) if complete else 0

    if last_views > 0 and subscribers > 0:
        # filter out channels with missing stats
        data.append({
            'id': channel['id'],
            'title': channel['snippet']['title'],
            'subscribers': subscribers,
            'total_views': int(statistics['viewCount']),
            'total_videos': int(statistics['videoCount']),
            # average over the videos actually returned, not a fixed 4:
            # a channel may have fewer uploads than expected
            'last_views': last_views / len(view_counts),
        })
    # progress counter, rewritten in place on one line
    print("\r", i, "/", len(channels_data), end="")
print(" =>", len(data))

### Save data to file

In [None]:
# persist the prepared channel records as JSON
with open('data.json', mode='w+') as out_file:
    json.dump(data, out_file)

### Load data from file

In [None]:
# read the prepared channel records back from the JSON file
with open('data.json', mode='r') as in_file:
    data = json.load(in_file)

## Solution

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# tabular form of the loaded channel records; the columns used below are
# 'subscribers', 'last_views', 'total_views' and 'total_videos'
data_df = pd.DataFrame(data)

Calculate additional values

In [None]:
subscribers = data_df['subscribers']

# recent average views per subscriber
data_df['views_to_subs'] = data_df['last_views'].div(subscribers)

# lifetime average views per video, and the same value per subscriber
data_df['avrg_views_video'] = data_df['total_views'].div(data_df['total_videos'])
data_df['virality'] = data_df['avrg_views_video'].div(subscribers)

In [None]:
# channels ordered by subscribers, renumbered from 0; the position is also
# stored as a regular column so it can serve as the x coordinate
to_plot = data_df.sort_values(by='subscribers').reset_index(drop=True)
to_plot['index'] = to_plot.index.to_numpy()

In [None]:
# base style applied through matplotlib's global rcParams
base_style = {
    'font.size': 15,
    'axes.labelpad': 20,
    'axes.titlepad': 40.0,
    'axes.titleweight': 'bold',
}
plt.rcParams.update(base_style)
views_color = "b"

# SET PLOT
# create an 11x7 inch figure with the primary axes
fig, ax1 = plt.subplots(figsize=(11, 7))

# two more Y axes sharing the X axis of the first one
ax2, ax3 = ax1.twinx(), ax1.twinx()

# move the right spine of the 3rd chart outwards so it does not sit on
# top of the 2nd chart's spine
ax3.spines['right'].set_position(('axes', 1.2))


# DRAW CHARTS
# Last views: marker size is 60 plus the channel's virality relative to
# the largest virality, scaled by 100000
relative_virality = to_plot['virality'] / to_plot['virality'].max()
p1 = ax1.scatter(
    x=to_plot['index'],
    y=to_plot['last_views'],
    s=60 + relative_virality * 100000,
    color=views_color,
    alpha=0.7,
    edgecolors=None,
)

# Subscribers, as green dots
p2, = ax2.plot(to_plot['subscribers'], "go")

# Views to subscribers relation, only for channels where it is below 1
ratio_below_one = to_plot.loc[to_plot['views_to_subs'] < 1, 'views_to_subs']
p3, = ax3.plot(ratio_below_one, 'r.')


# ADD LINES
# horizontal line: median views to subscribers relation over all channels
ax3.hlines(y=to_plot['views_to_subs'].median(), xmin=0, xmax=to_plot['index'].max(), linewidth=1, color='r')

# vertical lines: first channel above each subscriber threshold (1K–500K);
# key_ammounts collects the subscriber count of each marked channel
key_ammounts = []
max_subscribers = to_plot['subscribers'].max()
for subs in [1000, 5000, 10000, 50000, 100000, 200000, 500000]:
    channels = to_plot.loc[to_plot['subscribers'] > subs, 'subscribers']
    if channels.empty:
        # no channel exceeds this threshold; the thresholds ascend, so none
        # of the remaining ones can match either (indexing [0] would raise)
        break
    # first row of the selection; the X position of the line is its index label
    lowest_id = channels.index[0]
    ax2.vlines(x=lowest_id, ymin=0, ymax=max_subscribers, linewidth=1, color='g')
    key_ammounts.append(channels.loc[lowest_id])


# DESCRIBE CHARTS
# cap the views axis at views_lim and re-add the default margin by hand
# (setting ylim drops it)
views_lim = 1000
ymargin = plt.rcParams['axes.ymargin']
ymin, ymax = -views_lim * ymargin, views_lim * (1 + ymargin)
ylim = (ymin, ymax)

# title and axis labels; the X label lists the subscriber counts marked
# by the vertical lines
marked_counts = ', '.join(map(str, key_ammounts))
ax1.set_title("Relation of YouTube subscribers and views")
ax1.set_xlabel('Channels, sorted by subscribtions (vertical lines: \n' + marked_counts + ' subscribers)')
ax1.set_ylabel('Last views (<1000, size: channel virality)')
ax1.set_ylim(ylim)
ax2.set_ylabel('Subscribers')
ax3.set_ylabel('Last views/subscribers \n(<1, views less than subscribers; \nhorizontal line: overal median)')


# STYLE
# every Y axis gets its label and ticks painted in the color of its chart
tkw = {'size': 5, 'width': 1.5}
for axis, axis_color in (
    (ax1, views_color),
    (ax2, p2.get_color()),
    (ax3, p3.get_color()),
):
    axis.yaxis.label.set_color(axis_color)
    axis.tick_params(axis='y', colors=axis_color, **tkw)
ax1.tick_params(axis='x', **tkw)


# EXPORT FILE
fig.savefig("yt-views-chart.png", bbox_inches='tight', pad_inches=True)

In [None]:
# second figure: running total of the views/subscribers relation next to
# the subscriber counts, both over the channels sorted by subscribers
fig, ax1 = plt.subplots(figsize=(11, 7))
ax2 = ax1.twinx()

cumulative_ratio = to_plot['views_to_subs'].cumsum()
p1, = ax1.plot(cumulative_ratio, 'bo--')
p2, = ax2.plot(to_plot['subscribers'], 'ro--')

ax1.set_title("Growth of views/subscribers ratio")
ax1.set_ylabel('Views/subscribers cumulative sum', color="b")
ax2.set_ylabel('Subscribers', color="r")
for axis, axis_color in ((ax1, 'b'), (ax2, 'r')):
    axis.tick_params(axis='y', colors=axis_color)

# cap the subscribers axis at subs_lim, keeping the default margin around it
subs_lim = 10000
margin = plt.rcParams['axes.ymargin']
ax2.set_ylim(0 - subs_lim * margin, subs_lim * (1 + margin))
ax1.grid(True, axis='y')

fig.savefig("yt-views-to-subs-cumsum-chart.png", bbox_inches='tight', pad_inches=True)