In [56]:
import numpy as np
import pandas as pd
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split

In [32]:
test = pd.read_csv("./C2T2_Test.csv")
train = pd.read_csv("./C2T2_train.csv")

test.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday,CommentsNumber
0,130000,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
1,130001,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,0,1,0,
2,130002,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
3,130003,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,
4,130004,879585,11,49540,9,0,1063,102.595311,43.0,173.796629,...,0,0,0,0,0,0,1,0,0,


In [33]:
train_x, train_verification_x, train_y, train_verification_y = train_test_split(
    train.drop(columns = "CommentsNumber"),
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

train_verification_x.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# Purpose: efficiently testing different models without repeating too much code

class ModelMaker:
    def __init__(self, model_type, train_x, train_y, test_x, test_y):
        self.model = model_type
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y

        # Model outputs when used on testing set
        self.preds_df = pd.DataFrame()

        # Model outputs when used on training set
        self.train_df = pd.DataFrame()


    def run_model(self):
        m = self.model.fit(self.train_x, self.train_y)

        # Testing set predictions
        preds = m.predict(self.test_x)
        preds_rounded = [round(p, 0) for p in preds]

        # Training set predictions
        train_preds = m.predict(self.train_x)
        train_preds_rounded = [round(p, 0) for p in train_preds]

        # Create dataframes with predictions
        self.preds_df = pd.DataFrame(
            {
                "Actual": self.test_y,
                "Predicted": preds_rounded
            }
        )

        self.train_df = pd.DataFrame(
            {
                "Actual": self.train_y,
                "Predicted": train_preds_rounded
            }
        )

    def rmse(self, df):
        rmse = 0

        for index, row in df.iterrows():
            rmse += (int(row["Predicted"]) - int(row["Actual"]))**2

        rmse /= len(df)
        rmse = math.sqrt(rmse)

        return rmse
    
    def get_rmse(self):
        return (self.rmse(self.preds_df), self.rmse(self.train_df))


In [35]:
mod1 = ModelMaker(
    RandomForestRegressor(max_depth=6, random_state=0),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod1.run_model()
results = mod1.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.790517012244187
Training RMSE: 16.006142810174133


In [36]:
train_x, train_verification_x, train_y, train_verification_y = train_test_split(
    train.drop(columns = ["CommentsNumber"])
    [
        [
            "PageLikes", 
            "PageCheckIns", 
            "DailyInterest",
            "PageCategory",
            "TotalComments_CC1", 
            "CommentsLast24H_CC2",
            "CommentsLast48to24H_CC3",
            "CommentsFirst24H_CC4",
            "TimeSincePublishedinHrs",
            "PostLength",
            "PostShareCount",
            "PostPromoted",
            "PredictAfterHrs"
        ]
    ],
    train["CommentsNumber"],
    test_size = 0.2,
    random_state = 0
)

train_verification_x.head()

Unnamed: 0,PageLikes,PageCheckIns,DailyInterest,PageCategory,TotalComments_CC1,CommentsLast24H_CC2,CommentsLast48to24H_CC3,CommentsFirst24H_CC4,TimeSincePublishedinHrs,PostLength,PostShareCount,PostPromoted,PredictAfterHrs
3652,88359,3,42,17,0,0,0,0,63,153,1,0,24
124158,29392,0,597,91,1,0,1,1,43,188,1,0,24
87089,14691,0,4072,16,9,3,3,6,59,150,1,0,24
39220,4072194,880,161174,9,82,3,35,78,51,205,115,0,24
15377,5970921,53096,234501,9,46,9,12,37,51,163,74,0,24


In [37]:
mod2 = ModelMaker(
    RandomForestRegressor(max_depth=10, random_state=0),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod2.run_model()
results = mod2.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.269121294412543
Training RMSE: 10.677241704587242


In [40]:
train_cpr = train.copy()
train_cpr["CommentsPerHour"] = train["CommentsNumber"] / train["PredictAfterHrs"]*100

train_x_cpr, train_verification_x_cpr, train_y_cpr, train_verification_y_cpr = train_test_split(
    train_cpr.drop(columns = ["CommentsNumber", "CommentsPerHour"]),
    train_cpr[["CommentsNumber", "CommentsPerHour"]],
    test_size = 0.2,
    random_state = 0
)

train_verification_x_cpr.head()

Unnamed: 0,ID,PageLikes,PageCheckIns,DailyInterest,PageCategory,PageCC1Min,PageCC1Max,PageCC1Avg,PageCC1Median,PageCC1Std,...,Published_Thursday,Published_Friday,Published_Saturday,PredictOn_Sunday,PredictOn_Monday,PredictOn_Tuesday,PredictOn_Wednesday,PredictOn_Thursday,PredictOn_Friday,PredictOn_Saturday
3652,3653,88359,3,42,17,0,4,0.782456,0.0,1.130925,...,1,0,0,1,0,0,0,0,0,0
124158,124159,29392,0,597,91,0,23,1.803519,1.0,2.628981,...,1,0,0,0,0,0,0,0,0,1
87089,87090,14691,0,4072,16,0,704,31.367206,9.0,77.067329,...,0,0,0,0,0,0,0,0,1,0
39220,39221,4072194,880,161174,9,0,2219,147.955342,78.0,217.664556,...,1,0,0,0,0,0,0,0,0,1
15377,15378,5970921,53096,234501,9,0,1294,136.465394,49.5,228.628584,...,0,0,0,0,0,0,0,1,0,0


In [41]:
mod3 = ModelMaker(
    RandomForestRegressor(max_depth=5, random_state=0),
    train_x_cpr,
    train_y_cpr["CommentsPerHour"],
    train_verification_x_cpr,
    train_verification_y_cpr["CommentsPerHour"]
)

mod3.run_model()
results = mod3.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 72.44030194687103
Training RMSE: 74.16039093926915


In [49]:
mod3_preds = mod3.model.predict(train_verification_x_cpr)
mod3_preds_rounded = [round(p, 0) / 100  for p in mod3_preds]

test_df = pd.DataFrame(
    {
        "RawPredictions": mod3_preds_rounded,
        "PredictAfterHrs": train_verification_x_cpr["PredictAfterHrs"],
        "Actual": train_verification_y_cpr["CommentsNumber"]
    }
)

test_df["Predicted"] = round(test_df["RawPredictions"] * test_df["PredictAfterHrs"], 0)


print(mod3.rmse(test_df.drop(columns = ["RawPredictions", "PredictAfterHrs"])))


17.34765778914349


In [52]:
mod4 = ModelMaker(
    GradientBoostingRegressor(n_estimators=400, random_state=0),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod4.run_model()
results = mod4.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 16.427960452088527
Training RMSE: 14.885764232589416


In [53]:
mod5 = ModelMaker(
    LinearRegression(),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod5.run_model()
results = mod5.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 25.696507544800713
Training RMSE: 27.75132431628633


In [57]:
mod6 = ModelMaker(
    VotingRegressor(
        estimators=[
            ("gb", mod4.model),
            ("rf", mod1.model),
            ("lr", mod5.model)
        ]
    ),
    train_x,
    train_y,
    train_verification_x,
    train_verification_y
)

mod6.run_model()
results = mod6.get_rmse()

print("Testing RMSE: " + str(results[0]))
print("Training RMSE: " + str(results[1]))

Testing RMSE: 18.260246859903493
Training RMSE: 18.194813570831556
