# YouTube Channel Statistics Analysis

Here we will analyze all the data that we collected from the YouTube Data API.

## Necessary Imports

Here we will import necessary libraries.

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd

## Import Data from Data Files

Here we will load the data from the pickle files in the data directory of our project and save it to dataframes.

In [None]:
# Load the channel-level and video-level tables from the project's data
# directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
channelStatisticsPath = '../data/channelStatistics.pkl'
videoStatisticsPath = '../data/videoStatistics.pkl'

channelStatistics = pd.read_pickle(channelStatisticsPath)
videoStatistics = pd.read_pickle(videoStatisticsPath)

In [None]:
# Print a summary of the channel-level table (columns, dtypes, non-null counts).
channelStatistics.info()

In [None]:
# Print a summary of the video-level table (columns, dtypes, non-null counts).
videoStatistics.info()

As mentioned when cleaning the data, pandas aggregations such as sum() skip numpy NaN values, so we will not be dropping these rows (videos), as they will not hinder those aggregations. Functions that do not skip NaN values (for example np.polyfit) must be given NaN-free data.

# Average Views Per Video

Here we will analyze the average number of views per video, which will give us a broad idea of how each channel's videos perform.

In [None]:
# Average views per video = total channel views / number of uploaded videos.
# Dividing by a video count of zero would yield inf (or NaN for 0 / 0), and inf
# would rank first in the descending sort below. Zero counts are therefore
# masked to NaN before dividing; sort_values places NaN rows last by default.
videoCounts = channelStatistics['videoCount']
channelStatistics['averageViewsPerVideo'] = (
    channelStatistics['viewCount'] / videoCounts.where(videoCounts != 0)
)

# Keep only the columns relevant to this metric, best-performing channels first.
averageViewsPerVideoDf = channelStatistics[
    ['channelName', 'viewCount', 'videoCount', 'averageViewsPerVideo']
].sort_values(by='averageViewsPerVideo', ascending=False)

averageViewsPerVideoDf


Now we will plot each channel on a graph where the x axis is the video count and the y axis is the total view count. A channel's average views per video is the slope of the line from the origin to its point.

In [None]:
# Scatter every channel by its number of videos (x) against its total view
# count (y).
plt.figure(figsize=(10,6))
plt.scatter(averageViewsPerVideoDf['videoCount'], averageViewsPerVideoDf['viewCount'], alpha=0.7)
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.title('Total Channel View Count vs Video Count')
plt.xlabel('Video Count')
plt.ylabel('View Count in Millions')

# Express the y tick values in millions through a formatter instead of
# overwriting the tick labels: the automatic tick locations stay untouched, and
# '%g'-style formatting keeps fractional steps distinct (0.5M, 1M, 1.5M)
# rather than rounding them into duplicate whole-number labels.
plt.gca().yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda value, _pos: f'{value / 1e6:g}M')
)

plt.show()

# Subscriber Engagement

Here we will analyze subscriber engagement by comparing the relationship between subscriber count and view count for each channel.

Therefore we will create a new metric called subscriber engagement that will be given by the ratio: view count / subscriber count.

That way, if the channel has:

- many subscribers and many views --> the channel will have average subscriber engagement
- few subscribers and many views --> the channel will have high subscriber engagement
- many subscribers and few views --> the channel will have low subscriber engagement
- few subscribers and few views --> the channel will have average subscriber engagement

We will create a new dataframe that contains the name of the channel and its corresponding subscriber engagement. It will be ordered by decreasing subscriber engagement.

In [None]:
# Subscriber engagement = total channel views / subscriber count.
# Dividing by a subscriber count of zero would yield inf (or NaN for 0 / 0), and
# inf would rank first in the descending sort below. Zero counts are therefore
# masked to NaN before dividing; sort_values places NaN rows last by default.
subscriberCounts = channelStatistics['subscriberCount']
channelStatistics['subscriberEngagement'] = (
    channelStatistics['viewCount'] / subscriberCounts.where(subscriberCounts != 0)
)

# Keep only the columns relevant to this metric, most engaged channels first.
subscriberEngagementDf = channelStatistics[
    ['channelName', 'viewCount', 'subscriberCount', 'subscriberEngagement']
].sort_values(by='subscriberEngagement', ascending=False)

subscriberEngagementDf

After that, we will plot the different subscriber engagements in a graph that has the subscriber count on the x axis and the view count on the y axis.

In [None]:
# Scatter every channel by its subscriber count (x) against its total view
# count (y).
plt.figure(figsize=(10,6))
plt.scatter(subscriberEngagementDf['subscriberCount'], subscriberEngagementDf['viewCount'], alpha=0.7)
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.title('Total Channel View Count vs Subscriber Count')
plt.xlabel('Subscriber Count')
plt.ylabel('View Count in Millions')


def formatMillions(value, _pos):
    """Render a raw axis value as millions, e.g. 1500000 -> '1.5M'."""
    return f'{value / 1e6:g}M'


# Express both axes in millions through formatters instead of overwriting the
# tick labels: the automatic tick locations stay untouched, and '%g'-style
# formatting keeps fractional steps distinct (0.2M, 0.4M, ...) rather than
# rounding them into duplicate whole-number labels. Each axis gets its own
# formatter instance because a formatter is bound to the axis it is set on.
axes = plt.gca()
axes.yaxis.set_major_formatter(mticker.FuncFormatter(formatMillions))
axes.xaxis.set_major_formatter(mticker.FuncFormatter(formatMillions))

plt.show()

# Video View Count Over Time

Here we will analyze the view count of a particular channel over time.

To do so, first, we will filter to only the videos of the channel we desire to visualize the data for.

After that, we will plot each video of a given channel on a scatter plot, where the x axis will be time and the y axis will be the view count.

But before any of that, we will create a dataframe that contains all of the unique channels of our data set so we can choose which channel we want to filter for.

In [None]:
# Build a lookup of the channels present in the video table: one row per
# distinct (channelTitle, channelID) combination.
channelIdentityColumns = ['channelTitle', 'channelID']
uniqueChannels = videoStatistics.loc[:, channelIdentityColumns].drop_duplicates()

uniqueChannels

Now we will filter the videos for our desired channel and plot the data.

In [None]:
# Select the videos of the channel to visualize (copy so the new columns added
# below do not touch videoStatistics).
channelName = 'GORGONOID'
channelViewCountOverTimeDf = videoStatistics[videoStatistics['channelTitle'] == channelName].copy()

# Parse the publish timestamps and drop any timezone information so the values
# are plain naive datetimes.
channelViewCountOverTimeDf['videoPublishDatetime'] = pd.to_datetime(channelViewCountOverTimeDf['videoPublishDatetime'])
channelViewCountOverTimeDf['videoPublishDatetime'] = channelViewCountOverTimeDf['videoPublishDatetime'].dt.tz_localize(None)

# np.polyfit does not skip NaN values, so a single video without a view count
# (such rows are kept in the data set) would break the fit. Fit only on rows
# that have both a date and a view count, sorted by date so the trend line is
# drawn from left to right.
regressionDf = channelViewCountOverTimeDf.dropna(
    subset=['videoPublishDatetime', 'videoViewCount']
).sort_values('videoPublishDatetime')

if regressionDf.empty:
    raise ValueError(f'No videos with a view count found for channel {channelName!r}')

# date2num converts datetimes to floating-point day numbers, so the fitted
# slope is expressed in views per day.
numericDates = mdates.date2num(regressionDf['videoPublishDatetime'])

slope, intercept = np.polyfit(numericDates, regressionDf['videoViewCount'].astype(float), 1)

regressionLine = slope * numericDates + intercept

plt.figure(figsize=(10,6))
plt.scatter(channelViewCountOverTimeDf['videoPublishDatetime'], channelViewCountOverTimeDf['videoViewCount'])

plt.plot(regressionDf['videoPublishDatetime'], regressionLine, color='red', label=f'Linear Regression | Slope = {slope:.0f}')

# One major x tick per calendar year, labelled with the four-digit year.
plt.gca().xaxis.set_major_locator(mdates.YearLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# y ticks every million views, from 0 up to the first whole million at or above
# the most-viewed video (max() skips NaN view counts).
maxViewCount = channelViewCountOverTimeDf['videoViewCount'].max()
nextMillion = np.ceil(maxViewCount / 1e6) * 1e6

stepSize = 1e6
yticks = np.arange(0, nextMillion+1, step=stepSize)

plt.yticks(ticks=yticks, labels=[f"{int(tick/1e6)}M" for tick in yticks])

plt.title('Video View Count Over Time')
plt.xlabel('Time')
plt.ylabel('View Count in Millions')

plt.legend()
plt.show()

In [None]:
# Semester 1 covers January-June (months 1-6), semester 2 covers July-December.
channelViewCountOverTimeDf['semester'] = (channelViewCountOverTimeDf['videoPublishDatetime'].dt.month - 1) // 6 + 1

# Create a year-semester column for grouping
channelViewCountOverTimeDf['yearSemester'] = channelViewCountOverTimeDf['videoPublishDatetime'].dt.year.astype(str) + '-' + channelViewCountOverTimeDf['semester'].astype(str)


# Group by year-semester and calculate the average view count
semesterAverage = channelViewCountOverTimeDf.groupby('yearSemester')['videoViewCount'].mean().reset_index()

minYear = channelViewCountOverTimeDf['videoPublishDatetime'].dt.year.min()
maxYear = channelViewCountOverTimeDf['videoPublishDatetime'].dt.year.max()

allSemesters = [f"{year}-{semester}" for year in range(minYear, maxYear + 1) for semester in range(1, 3)]

# Give every semester without uploads an explicit zero-average row so the bar
# chart has no gaps. The span runs from the first semester of the first year up
# to the last semester that actually contains a video (groupby returns the
# labels sorted, so that is the final row). Reindexing over that span inserts
# the missing labels in a single step.
lastObservedSemester = semesterAverage['yearSemester'].iloc[-1]
semesterSpan = allSemesters[:allSemesters.index(lastObservedSemester) + 1]
semesterAverage = (
    semesterAverage.set_index('yearSemester')
    .reindex(semesterSpan, fill_value=0)
    .rename_axis('yearSemester')
    .reset_index()
)

# Map 'YYYY-1' -> YYYY.0 and 'YYYY-2' -> YYYY.5 so the semesters are evenly
# spaced numbers (in years) for the regression.
semesterParts = semesterAverage['yearSemester'].str.split('-', expand=True).astype(int)
semesterAverage['numericSemester'] = semesterParts[0] + (semesterParts[1] - 1) * 0.5

# Perform linear regression using numpy's polyfit. polyfit does not skip NaN
# values, so semesters whose average is NaN (no video with a view count) are
# left out of the fit.
hasAverage = semesterAverage['videoViewCount'].notna()
slope, intercept = np.polyfit(
    semesterAverage.loc[hasAverage, 'numericSemester'],
    semesterAverage.loc[hasAverage, 'videoViewCount'],
    1,
)

# Create the regression line
regressionLine = slope * semesterAverage['numericSemester'] + intercept

plt.figure(figsize=(10, 6))
plt.bar(semesterAverage['yearSemester'], semesterAverage['videoViewCount'])

plt.plot(semesterAverage['yearSemester'], regressionLine, color='red', label=f'Linear Regression | Slope = {slope:.0f}')

plt.title('Average Video View Count Per Semester')
plt.xlabel('Semester')
plt.ylabel('Average View Count')
plt.xticks(rotation=45)

# y ticks every 100K views, from 0 up to the first multiple of 100K at or above
# the highest semester average.
maxViewCount = semesterAverage['videoViewCount'].max()
nextHundredK = np.ceil(maxViewCount / 1e5) * 1e5
yticks = np.arange(0, nextHundredK + 1, step=1e5)

ytickLabels = [f"{tick/1e3:.0f}K" for tick in yticks]

plt.yticks(ticks=yticks, labels=ytickLabels)

plt.legend()

plt.show()

In [None]:
# Print a summary of the video-level table (columns, dtypes, non-null counts).
videoStatistics.info()

In [None]:
# Show the video-level table as the cell's output.
videoStatistics