# This File Contains:

- Preliminary modeling code
- "Comments Per Hour" method code (failed to produce better results)
- Feature importance code
- Some other things that were useful for the main file

In [61]:
import numpy as np
import pandas as pd
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split

In [62]:
test = pd.read_csv("./C2T2_Test.csv")
train = pd.read_csv("./C2T2_train.csv")

test.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday,CommentsNumber
0,130000,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
1,130001,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,0,1,0,
2,130002,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
3,130003,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
4,130004,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,


In [63]:
train_x, train_verification_x, train_y, train_verification_y = train_test_split(
    train.drop(columns = "CommentsNumber"),
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

train_verification_x.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [64]:
# Purpose: efficiently testing different models without repeating too much code

class ModelMaker:
    def __init__(self, model_type, train_x, train_y, test_x, test_y):
        self.model = model_type
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y

        # Model outputs when used on testing set
        self.preds_df = pd.DataFrame()

        # Model outputs when used on training set
        self.train_df = pd.DataFrame()


    def run_model(self):
        m = self.model.fit(self.train_x, self.train_y)

        # Testing set predictions
        preds = m.predict(self.test_x)
        preds_rounded = [round(p, 0) for p in preds]

        # Training set predictions
        train_preds = m.predict(self.train_x)
        train_preds_rounded = [round(p, 0) for p in train_preds]

        # Create dataframes with predictions
        self.preds_df = pd.DataFrame(
            {
                "Actual": self.test_y,
                "Predicted": preds_rounded
            }
        )

        self.train_df = pd.DataFrame(
            {
                "Actual": self.train_y,
                "Predicted": train_preds_rounded
            }
        )

    def rmse(self, df):
        rmse = 0

        for index, row in df.iterrows():
            rmse += (int(row["Predicted"]) - int(row["Actual"]))**2

        rmse /= len(df)
        rmse = math.sqrt(rmse)

        return rmse
    
    def get_rmse(self):
        return (self.rmse(self.preds_df), self.rmse(self.train_df))


In [65]:
mod1 = ModelMaker(
    RandomForestRegressor(max_depth=6, random_state=0),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod1.run_model()
results = mod1.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.790517012244187
Training RMSE: 16.006142810174133


In [66]:
# Reduce features based on intuition

train_x_reduced, train_verification_x_reduced, train_y_reduced, train_verification_y_reduced = train_test_split(
    train.drop(columns = ["CommentsNumber"])
    [
        [
            "PageLikes", 
            "PageCheckIns", 
            "DailyInterest",
            "PageCategory",
            "TotalComments_CC1", 
            "CommentsLast24H_CC2",
            "CommentsLast48to24H_CC3",
            "CommentsFirst24H_CC4",
            "TimeSincePublishedinHrs",
            "PostLength",
            "PostShareCount",
            "PostPromoted",
            "PredictAfterHrs"
        ]
    ],
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

train_verification_x.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [67]:
# Reduced features does not affect accuracy greatly

mod2 = ModelMaker(
    RandomForestRegressor(max_depth=10, random_state=0),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod2.run_model()
results = mod2.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.269121294412543
Training RMSE: 10.677241704587242


In [68]:
#Convert target feature to comments per hour

train_cpr = train.copy()
train_cpr["CommentsPerHour"] = train["CommentsNumber"] / train["PredictAfterHrs"]*100

train_x_cpr, train_verification_x_cpr, train_y_cpr, train_verification_y_cpr = train_test_split(
    train_cpr.drop(columns = ["CommentsNumber", "CommentsPerHour"]),
    train_cpr[["CommentsNumber", "CommentsPerHour"]],
    test_size = 0.2,
    random_state = 0
)

train_verification_x_cpr.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [69]:
mod3 = ModelMaker(
    RandomForestRegressor(max_depth=5, random_state=0),
    train_x_cpr,
    train_y_cpr["CommentsPerHour"],
    train_verification_x_cpr,
    train_verification_y_cpr["CommentsPerHour"]
)

mod3.run_model()
results = mod3.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 72.44030194687103
Training RMSE: 74.16039093926915


In [70]:
# Using comments per hour as the target feature reduces rmse

mod3_preds = mod3.model.predict(train_verification_x_cpr)
mod3_preds_rounded = [round(p, 0) / 100  for p in mod3_preds]

test_df = pd.DataFrame(
    {
        "RawPredictions": mod3_preds_rounded,
        "PredictAfterHrs": train_verification_x_cpr["PredictAfterHrs"],
        "Actual": train_verification_y_cpr["CommentsNumber"]
    }
)

test_df["Predicted"] = round(test_df["RawPredictions"] * test_df["PredictAfterHrs"], 0)


print(mod3.rmse(test_df.drop(columns = ["RawPredictions", "PredictAfterHrs"])))


17.34765778914349


In [71]:
# Using gradient boost improves rmse

mod4 = ModelMaker(
    GradientBoostingRegressor(n_estimators=400, random_state=0),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod4.run_model()
results = mod4.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 15.215671324202354
Training RMSE: 13.565577911965867


In [72]:
mod5 = ModelMaker(
    LinearRegression(),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod5.run_model()
results = mod5.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 25.259177311653254
Training RMSE: 27.254481601614863


In [73]:
# A voting regressor makes things worse

mod6 = ModelMaker(
    VotingRegressor(
        estimators=[
            ("gb", mod4.model),
            ("rf", mod1.model),
            ("lr", mod5.model)
        ]
    ),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod6.run_model()
results = mod6.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 17.375398169206413
Training RMSE: 17.15058732528894


In [74]:
# Feature importance based on decrease in impurity

feature_names = train.drop(columns = ["CommentsNumber"]).columns

importances = pd.Series(mod1.model.feature_importances_)

importance_df = pd.DataFrame(
    {
        "Importance": importances,
        "Feature": feature_names
    }
)

In [75]:
sorted_importance = importance_df.sort_values(by = "Importance", ascending = False)

sorted_importance

Unnamed: 0,Importance,Feature
31,0.313714,CommentsLast24H_CC2
35,0.265109,TimeSincePublishedinHrs
34,0.134987,CC2MinusCC3_CC5
37,0.076227,PostShareCount
29,0.019436,PageCC5Std
33,0.017066,CommentsFirst24H_CC4
6,0.016097,PageCC1Max
30,0.015754,TotalComments_CC1
12,0.015585,PageCC2Avg
8,0.013589,PageCC1Median


In [76]:
# Only selecting features with above 0.01 importance

train_x_reduced, train_verification_x_reduced, train_y_reduced, train_verification_y_reduced = train_test_split(
    train.drop(columns = ["CommentsNumber"])
    [
        [
            "CommentsLast24H_CC2", 
            "TimeSincePublishedinHrs", 
            "CC2MinusCC3_CC5",
            "PostShareCount",
            "PageCC5Std", 
            "CommentsFirst24H_CC4",
            "PageCC1Max",
            "TotalComments_CC1",
            "PageCC2Avg",
            "PageCC1Median",
            "PageCC4Avg"
        ]
    ],
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

train_verification_x.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [77]:
# Results are essentially the same

mod7 = ModelMaker(
    RandomForestRegressor(max_depth=8, random_state=0),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod7.run_model()
results = mod7.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.40653715865909
Training RMSE: 13.048520496698508


In [78]:
mod8 = ModelMaker(
    GradientBoostingRegressor(n_estimators=600, random_state=0),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod8.run_model()
results = mod8.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 15.897194963021233
Training RMSE: 13.070464295295157


In [79]:
# Rank features based on feature permutation

perm_imp = permutation_importance(
    mod1.model,
    train_verification_x,
    train_verification_y,
    n_repeats = 10,
    n_jobs = 2,
    random_state = 0
)

perm_imp = pd.Series(perm_imp.importances_mean, index = feature_names)

In [80]:
perm_importance_df = pd.DataFrame(
    {
        "Importance": perm_imp,
        "Feature": feature_names
    }
)
sorted_perm_importance = perm_importance_df.sort_values(by = "Importance", ascending = False)

sorted_perm_importance


Unnamed: 0,Importance,Feature
CommentsLast24H_CC2,0.923901,CommentsLast24H_CC2
TimeSincePublishedinHrs,0.66736,TimeSincePublishedinHrs
CommentsFirst24H_CC4,0.098946,CommentsFirst24H_CC4
CC2MinusCC3_CC5,0.077355,CC2MinusCC3_CC5
PostShareCount,0.058699,PostShareCount
PageCC5Std,0.006425,PageCC5Std
TotalComments_CC1,0.006377,TotalComments_CC1
PageCC4Avg,0.005982,PageCC4Avg
PageCC1Median,0.005591,PageCC1Median
PageCC2Avg,0.005434,PageCC2Avg


In [81]:
train_x_reduced, train_verification_x_reduced, train_y_reduced, train_verification_y_reduced = train_test_split(
    train.drop(columns = ["CommentsNumber"])
    [
        [
            "CommentsLast24H_CC2", 
            "TimeSincePublishedinHrs", 
            "CommentsFirst24H_CC4",
            "CC2MinusCC3_CC5",
            "PostShareCount", 
            "PageCC5Std",
            "TotalComments_CC1",
            "PageCC4Avg",
            "PageCC1Median",
            "PageCC2Avg",
            "PageCC1Max",
            "PageCC4Median"
        ]
    ],
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

In [82]:
# results do not change

mod9 = ModelMaker(
    RandomForestRegressor(max_depth=8, random_state=0),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod9.run_model()
results = mod9.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.349894142212005
Training RMSE: 13.062036748062335


In [83]:
mod10 = ModelMaker(
    GradientBoostingRegressor(n_estimators=800, random_state=0),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod10.run_model()
results = mod10.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 15.692144041378942
Training RMSE: 12.288195163738207


In [84]:
mod11 = ModelMaker(
    VotingRegressor(
        estimators=[
            ("gb", mod10.model),
            ("rf", mod9.model)
        ]
    ),
    train_x_reduced,
    train_y_reduced,
    train_verification_x_reduced,
    train_verification_y_reduced
)

mod11.run_model()
results = mod11.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 15.520045350845715
Training RMSE: 12.277203988935764
