In [1]:
%matplotlib inline

In [29]:
# dependencies

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import datetime

db_password = "postgres"
db_user = "postgres"
db_name = "Youtube P3"
endpoint = 'youtube.cb1bticre0py.us-east-1.rds.amazonaws.com'

connection_string = f"postgresql://{db_user}:{db_password}@{endpoint}:5432/{db_name}"
engine = create_engine(connection_string)

og_data = pd.read_sql('SELECT * FROM final_unique;', con = engine)

In [30]:
# start here if you need to re-run instead of pulling from postgres in above cell

final_unique = og_data.copy()

In [83]:
## chose target_days for classification model

target_df = pd.DataFrame(final_unique[['video_id', 'trend_days']])

target_days = 4   #     <---- pick your target days here

# prints statement below
print(target_df[target_df.trend_days > target_days].count())
print(target_df[target_df.trend_days > 0].count())              
print(round((target_df[target_df.trend_days > target_days].count()) / (target_df[target_df.trend_days > 0].count()),4))     

#             ↓↓↓↓↓  videos above days threshold / total videos / % of dataset above days threshold

video_id      111492
trend_days    111492
dtype: int64
video_id      241112
trend_days    241112
dtype: int64
video_id      0.4624
trend_days    0.4624
dtype: float64


In [84]:
# create dataset with 50/50 > and < outcomes

over_target = final_unique[final_unique.trend_days > target_days]
under_target = final_unique[final_unique.trend_days <= target_days].sample(n=111492)  # <--- copy & paste top output of last cell

df = pd.concat([over_target, under_target]).reset_index(drop=True)

# add target to column to new 50/50 dataset

df['target'] = df.trend_days > target_days
df['target'] = df['target'].astype(int) # makes 1 or 0 for T or F

df.reset_index(drop=True, inplace=True)

print(over_target.shape)
print(under_target.shape)
print(df.shape)
print(" ")
df.columns

(111492, 34)
(111492, 34)
(222984, 35)
 


Index(['index', 'video_id', 'category', 'category_e', 'country',
       'publish_date', 'trending_date', 'publish_to_trend', 'publish_day',
       'publish_day_num', 'combined_trend_days', 'trend_days', 'views',
       'pt_views', 'wt_views', 'pt_views_rate', 'wt_views_rate', 'likes',
       'pt_likes', 'wt_likes', 'pt_likes_rate', 'wt_likes_rate', 'likes_ratio',
       'dislikes', 'pt_dislikes', 'wt_dislikes', 'pt_dislikes_rate',
       'wt_dislikes_rate', 'comments', 'pt_comments', 'wt_comments',
       'pt_comments_rate', 'wt_comments_rate', 'comments_ratio', 'target'],
      dtype='object')

In [85]:
# chose your features

X = df[[
    'category_e',
    'publish_to_trend',
    'publish_day_num',
    'pt_views',
    'pt_likes',
    'pt_dislikes',
    'pt_comments',
    'likes_ratio',
    'comments_ratio'
    ]].to_numpy()

y = df['trend_days'].to_numpy()

In [86]:
# if not scaling

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X_train.shape)
print(y_train.shape)

(167238, 9)
(167238,)


In [87]:
# if scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sclr  = scaler.fit(X)
X_scaled = sclr.transform(X)

from sklearn.model_selection import train_test_split

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y,random_state=1)

print(X_train_scaled.shape)
print(y_train.shape)

(167238, 9)
(167238,)


In [88]:
###########################################################################################

In [89]:
# multi-model function unscaled

def test_model(model, data):
    X_train, X_test, y_train, y_test = data
    reg = model.fit(X_train, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train, y_train)}')
    print(f'Test Score: {reg.score(X_test, y_test)}\n')
    plt.show()   

# bring in models

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
#from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [90]:
# unscaled run

data = [X_train, X_test, y_train, y_test]

test_model(LinearRegression(), data)
test_model(Lasso(max_iter=2000),data)
test_model(KNeighborsRegressor(), data)
test_model(RandomForestRegressor(), data)
test_model(ExtraTreesRegressor(), data)
test_model(AdaBoostRegressor(), data)
#test_model(SVR(C=1.0, epsilon=0.2), data)

Model: LinearRegression
Train score: 5.059437280074963e-05
Test Score: -5.56641482107878e-05

Model: Lasso
Train score: 2.7429440533466298e-05
Test Score: -1.5695525985348624e-05

Model: KNeighborsRegressor
Train score: 0.2011082051076778
Test Score: -0.19949517853188614

Model: RandomForestRegressor
Train score: 0.8551751092271346
Test Score: -0.0318990141687272

Model: ExtraTreesRegressor
Train score: 1.0
Test Score: -0.061304058667420724

Model: AdaBoostRegressor
Train score: -0.06700268423789812
Test Score: -0.06837486979079666



In [91]:
# multi-model function scaled

def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data_scaled
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()   

# bring in models

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
#from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [92]:
# scaled run

data_scaled = [X_train_scaled, X_test_scaled, y_train, y_test]

test_model(LinearRegression(), data_scaled)
test_model(Lasso(max_iter=2000),data_scaled)
test_model(KNeighborsRegressor(), data_scaled)
test_model(RandomForestRegressor(), data_scaled)
test_model(ExtraTreesRegressor(), data_scaled)
test_model(AdaBoostRegressor(), data_scaled)
#test_model(SVR(C=1.0, epsilon=0.2), data_scaled)

Model: LinearRegression
Train score: 0.8568744710265116
Test Score: 0.8604595082670738

Model: Lasso
Train score: 0.7235502051012375
Test Score: 0.7242328687840727

Model: KNeighborsRegressor
Train score: 0.9402469985181723
Test Score: 0.9078926652327605

Model: RandomForestRegressor
Train score: 0.9909634756642327
Test Score: 0.9352095191760438

Model: ExtraTreesRegressor
Train score: 1.0
Test Score: 0.9355564579948066

Model: AdaBoostRegressor
Train score: 0.8640601695168255
Test Score: 0.8608282322321918

