In [None]:
# Report the Python version of the interpreter running this kernel.
# A shell call such as `!python --version` does not raise when the command is
# missing, so wrapping it in try/except can never reach the fallback message,
# and it inspects whatever `python` is on PATH rather than this kernel.
import platform

python_version = platform.python_version()
print("Python", python_version)

Importing Data

In [None]:
pip install cloudmesh-common -U

In [None]:
#set up environment
from cloudmesh.common.StopWatch import StopWatch
from collections import Counter
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
import seaborn as sns

In [None]:
# Load the CSV into `df`. Two timers are started here: "Code Execution" spans
# the rest of the notebook (it is stopped in the last cell), while
# "Loading Dataset" covers only the download/parse below.
DATASET_URL = "https://raw.githubusercontent.com/cybertraining-dsc/fa20-523-327/main/project/dataset/USvideos.csv"

StopWatch.start("Code Execution")
StopWatch.start("Loading Dataset")
df = pd.read_csv(DATASET_URL)
StopWatch.stop("Loading Dataset")
StopWatch.status("Loading Dataset", True)

Familiarizing Myself with the Data

In [None]:
# Start the "Data Preparation" timer, then print the frame's column/dtype
# summary followed by a blank line and its (rows, columns) shape.
StopWatch.start("Data Preparation")
df.info()
print()
print("Shape: ", df.shape)

In [None]:
# Report only the columns that contain at least one missing value,
# printed as "<column> : <number of nulls>".
null = df.isna().sum()
for key, missing in null[null > 0].items():
    print(key, ":", missing)

Data Preprocessing

In [None]:
# Display three randomly chosen rows to eyeball whether the tags look relevant
df.sample(3)

In [None]:
# Recode the three flag columns as integers (True -> 1, False -> 0)
for flag_column in ("comments_disabled", "ratings_disabled", "video_error_or_removed"):
    df[flag_column] = df[flag_column].astype(int)

Separate dates into respective columns

In [None]:
# Convert the publish and trending columns to pandas datetimes.
#
# publish_time: no explicit format is passed. A date-only format such as
# '%Y-%m-%d' is rejected by the strict parser in pandas >= 2.0 as soon as a
# value carries anything beyond the date (e.g. a time-of-day part), so pandas
# is left to infer the layout instead.
# NOTE(review): assumes the column holds ISO-8601 style strings — confirm against the CSV.
df['publish_time'] = pd.to_datetime(df["publish_time"])

# trending_date: two-digit year, then day, then month, separated by dots.
df['trending_date'] = pd.to_datetime(df["trending_date"], format = '%y.%d.%m')

In [None]:
# Break both datetime columns into numeric month/day/year columns
# (td_* from trending_date, p_* from publish_time), then close the
# "Data Preparation" timer.
trending_parts = df['trending_date'].dt
publish_parts = df['publish_time'].dt

df['td_month'] = trending_parts.month
df['td_day'] = trending_parts.day
df['td_year'] = trending_parts.year

df['p_month'] = publish_parts.month
df['p_day'] = publish_parts.day
df['p_year'] = publish_parts.year

StopWatch.stop("Data Preparation")
StopWatch.status("Data Preparation", True)

Pearson's Correlation Graph

In [None]:
# Heatmap of pairwise correlations (DataFrame.corr default method) between the
# numeric columns.
# The frame still holds text columns (title, tags, ...). Older pandas silently
# skipped them inside corr(), but pandas >= 2.0 raises instead, so the
# numeric/boolean columns are selected explicitly. select_dtypes works on every
# pandas version, unlike the newer numeric_only= argument of corr().
plt.figure(figsize=(12,10))
cor = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(cor, annot = True, cmap = plt.cm.Reds)
plt.show()

In [None]:
# Features whose absolute correlation with `views` is above 0.1
cor_target = cor["views"].abs()
relevant_features = cor_target.loc[cor_target > 0.1]
relevant_features

Model 1

In [None]:
# Start timing the "Model 1" section.
# The feature matrix for this model is built in the next cell by selecting
# columns explicitly. The df.drop(...) that used to live here was overwritten
# there before ever being used, and the frame it produced still contained the
# target column `views`, so it has been removed.
StopWatch.start("Model 1")

In [None]:
# Model 1 predicts `views` from four numeric columns; 20% of the rows are
# held out for testing, with a fixed seed so the split is repeatable.
model1_features = ['category_id', 'likes', 'dislikes', 'comment_count']
x, y = df[model1_features], df['views']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 33
)

In [None]:
# decision tree regressor
# A fixed random_state makes the fitted tree (and therefore the reported score)
# identical on every run; without it the result can change between runs.
DecisionTree_Class_Model = DecisionTreeRegressor(random_state = 33)

In [None]:
# Train the decision tree on the Model 1 training split
DecisionTree_Class_Model.fit(X = x_train, y = y_train)

In [None]:
# Predict views for the held-out Model 1 rows and display the array
y_pred = DecisionTree_Class_Model.predict(X = x_test)
y_pred

In [None]:
# Explained-variance score of the decision tree's Model 1 predictions
explained_variance_score(y_true = y_test, y_pred = y_pred)

In [None]:
# random forest regressor
# The forest draws bootstrap samples at random, so a fixed random_state is
# needed for the fitted model and its score to be the same on every run.
rf_Model = RandomForestRegressor(random_state = 33)

In [None]:
# Train the random forest on the Model 1 training split
rf_Model.fit(X = x_train, y = y_train)

In [None]:
# Score the random forest on its own predictions for the held-out rows.
# Previously this cell re-scored `y_pred`, which still held the decision
# tree's output, so the number shown for the forest was really the tree's.
y_pred_rf = rf_Model.predict(x_test)
explained_variance_score(y_test, y_pred_rf)

In [None]:
# Stop the "Model 1" timer started at the top of this section and record a
# status of True for it (presumably meaning "succeeded" — confirm against the
# cloudmesh StopWatch documentation).
StopWatch.stop("Model 1")
StopWatch.status("Model 1", True)

Final Model

In [None]:
# Start timing the "Final Model" section.
# The feature matrix is built in the next cell by selecting columns explicitly.
# The df.drop(...) that used to live here was overwritten there before ever
# being used, and the frame it produced still contained the target column
# `views`, so it has been removed.
StopWatch.start("Final Model")

In [None]:
# Final model: the four Model 1 columns plus the three 0/1 flag columns and
# the month/day/year parts of both dates. 20% of rows are held out for testing.
final_features = [
    'category_id', 'likes', 'dislikes', 'comment_count',
    'comments_disabled', 'ratings_disabled', 'video_error_or_removed',
    'td_month', 'td_day', 'td_year',
    'p_month', 'p_day', 'p_year',
]
x1, y1 = df[final_features], df["views"]
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size = 0.2, random_state = 68
)

In [None]:
# Re-train the same decision tree estimator on the final feature set
DecisionTree_Class_Model.fit(X = x1_train, y = y1_train)

In [None]:
# Decision tree predictions for the held-out rows of the final feature set
y_pred1 = DecisionTree_Class_Model.predict(X = x1_test)
y_pred1

In [None]:
# Evaluate the model: explained-variance score on the held-out rows
explained_variance_score(y_true = y1_test, y_pred = y_pred1)

In [None]:
# Predicted vs actual values: overlay the density of the true view counts
# (green) and of the current predictions in y_pred1 (red) on one set of axes.
fig, av = plt.subplots(figsize = (12,10))
sns.kdeplot(y1_test, color = "g", label = "Actual Values", ax = av)
sns.kdeplot(y_pred1, color = "r", label = "Predicted Values", ax = av)
av.set_title("Prediction of Views")
av.legend()
plt.show()
plt.close()

In [None]:
# Re-train the random forest on the final feature set
rf_Model.fit(X = x1_train, y = y1_train)

In [None]:
# Random forest predictions for the held-out rows; this replaces the
# decision tree predictions previously stored in y_pred1
y_pred1 = rf_Model.predict(X = x1_test)
y_pred1

In [None]:
# Explained-variance score of the predictions currently held in y_pred1
explained_variance_score(y_true = y1_test, y_pred = y_pred1)

In [None]:
# Stop the "Final Model" timer started at the top of this section and record a
# status of True for it (presumably meaning "succeeded" — confirm against the
# cloudmesh StopWatch documentation).
StopWatch.stop("Final Model")
StopWatch.status("Final Model", True)

In [None]:
# Overlay the density of the true view counts (green) and of the current
# predictions in y_pred1 (red) on one set of axes.
fig, av = plt.subplots(figsize = (12,10))
sns.kdeplot(y1_test, color = "g", label = "Actual Values", ax = av)
sns.kdeplot(y_pred1, color = "r", label = "Predicted Values", ax = av)
av.set_title("Prediction of Views")
av.legend()
plt.show()
plt.close()

Insights

In [None]:
# Plot the number of rows per category.
# category_id is an integer code, so discrete=True draws one bar per id;
# the default automatic binning can merge neighbouring ids into a single bar.
# The title's spelling is also corrected.
sns.histplot(df["category_id"], discrete = True)
plt.title("Videos By Category")

In [None]:
# Count rows per category id and show the ten most frequent
category_lst = df["category_id"].tolist()
c = Counter(category_lst)
c.most_common(10)

In [None]:
# Which videos fall under the top 3 categories: first three rows of category 24
twofour = df.loc[df["category_id"] == 24]
twofour.head(3)

In [None]:
# First three rows of category 10
ten = df.loc[df["category_id"] == 10]
ten.head(3)

In [None]:
# First three rows of category 26
twentysix = df.loc[df["category_id"] == 26]
twentysix.head(3)

In [None]:
# Count rows per channel and show the ten channels that appear most often
channel_lst = df["channel_title"].tolist()
c = Counter(channel_lst)
c.most_common(10)

In [None]:
# Number of distinct channel titles in channel_lst
distinct_channels = set(channel_lst)
uniquechannels = len(distinct_channels)
print("There are", uniquechannels, "unique channels that trended on Youtube")

In [None]:
# Mean view count and mean comment count across all rows
mean_views = df['views'].mean()
mean_comments = df['comment_count'].mean()
print("Average Number of Views: ", mean_views)
print("Average Number of Comments: ", mean_comments)

In [None]:
# Total likes and total dislikes across all rows, and the ratio between them.
# The column sums replace the hand-written index loops, and the "N:1" part of
# the message is now derived from the data instead of being hard-coded as 20:1.

# likes
likes_total = df["likes"].sum()
print(likes_total)

# dislikes
dislikes_total = df["dislikes"].sum()
print(dislikes_total)

# ratio (guarded: with zero dislikes there is no meaningful ratio to report)
if dislikes_total:
    likes_per_dislike = likes_total / dislikes_total
    print(f"Total likes to dislikes ratio is {likes_per_dislike:.0f}:1 or", likes_per_dislike)
else:
    print("Total likes to dislikes ratio is undefined (no dislikes recorded)")

In [None]:
# Show one randomly chosen row for comparison
df.sample(n = 1)

In [None]:
# Stop the notebook-wide "Code Execution" timer (started in the data-loading
# cell), record a status of True for it, and call StopWatch.benchmark()
# (presumably prints a summary of all recorded timers — confirm against the
# cloudmesh StopWatch documentation).
StopWatch.stop("Code Execution")
StopWatch.status("Code Execution", True)
StopWatch.benchmark()