In [1]:
import pandas as pd
import numpy as np
import os
import sys
# Put the notebook's parent directory on the import path (once) so the
# `src.*` imports below can be resolved from this working directory.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.data import cross_val, standard, minmax, train_test
from src.models import LinRegSGD
from src.metrics import rmse, r2

In [2]:
# Read the raw CSV. header=None tells pandas the file has no header row,
# so the columns are labelled with integer positions.
RAW_DATA_PATH = "../data/raw/facebook/Features_Variant_1.csv"
raw_df = pd.read_csv(RAW_DATA_PATH, header=None)

# Feature annotation

0 Page Popularity/likes
Decimal Encoding Page feature Defines the popularity or support for the source of the document.

1 Page Checkins
Decimal Encoding Page feature Describes how many individuals have visited this place so far. This feature is only associated with places, e.g. an institution, place, theater, etc.

2 Page talking about
Decimal Encoding Page feature Defines the daily interest of individuals towards the source of the document/post: the people who actually come back to the page after liking it. This includes activities such as comments, likes on a post, shares, etc. by visitors to the page.

3 Page Category
Value Encoding Page feature Defines the category of the source of the document eg: place, institution, brand etc.

(DROP) 4 - 28 Derived 
Decimal Encoding Derived feature These features are aggregated by page, by calculating min, max, average, median and standard deviation of essential features.

29 CC1
Decimal Encoding Essential feature The total number of comments before selected base date/time.

30 CC2
Decimal Encoding Essential feature The number of comments in last 24 hours, relative to base date/time.

31 CC3
Decimal Encoding Essential feature The number of comments in last 48 to last 24 hours relative to base date/time.

32 CC4
Decimal Encoding Essential feature The number of comments in the first 24 hours after the publication of post but before base date/time.

33 CC5
Decimal Encoding Essential feature The difference between CC2 and CC3.

(DROP) 34 Base time
Decimal(0-71) Encoding Other feature Selected time in order to simulate the scenario.

35 Post length
Decimal Encoding Other feature Character count in the post.

36 Post Share Count
Decimal Encoding Other feature This feature counts the number of shares of the post, i.e. how many people have shared this post on their timeline.

37 Post Promotion Status
Binary Encoding Other feature To reach more people with posts in News Feed, individuals can promote their posts; this feature indicates whether the post is promoted (1) or not (0).

38 H Local
Decimal(0-23) Encoding Other feature This describes the H hrs, for which we have the target variable/ comments received.

39-45 Post published weekday
Binary Encoding Weekdays feature This represents the day(Sunday...Saturday) on which the post was published.

46-52 Base DateTime weekday
Binary Encoding Weekdays feature This represents the day(Sunday...Saturday) on selected base Date/Time.

53 Target Variable
Decimal Target The number of comments in the next H hours (H is given in feature 38).

In [3]:
# Column positions used as model inputs: 0-3, 29-33 and 35-52.
# Columns 4-28 and 34 are left out, and 53 is the target column.
feature_idx = [*range(0, 4), *range(29, 34), *range(35, 53)]

In [4]:
# Split the raw table into NumPy arrays: the selected feature columns become X,
# and the last column becomes y.
selected_columns = raw_df.iloc[:, feature_idx]
last_column = raw_df.iloc[:, -1]
X = selected_columns.to_numpy()
y = last_column.to_numpy()

In [5]:
# Third positional argument of cross_val. The recorded report has five rows,
# so this appears to be the number of folds (confirm against src.data.cross_val).
N_SPLITS = 5
cross_val_split = cross_val(X, y, N_SPLITS)

In [6]:
# Instantiate the model with both the L1 and L2 regularisation arguments set to zero.
regularisation = {"l1_reg": 0., "l2_reg": 0.}
linreg = LinRegSGD(**regularisation)

In [7]:
# Fit and evaluate the model on every cross-validation split and collect one
# report row per split: target mean/std, train/test R2 and RMSE, and the
# fitted weights labelled f<column index>.
feature_list = [f"f{feature}" for feature in feature_idx]
report_columns = [
    "Mean_train",
    "STD_train",
    "Mean_test",
    "STD_test",
    "R2_train",
    "RMSE_train",
    "R2_test",
    "RMSE_test",
    *feature_list,
]

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# it copied the whole frame on every iteration.
fold_reports = []
for X_train, y_train, X_test, y_test in cross_val_split:
    X_train_scaled, X_test_scaled = standard(X_train, X_test)
    # NOTE(review): the same `linreg` object is refitted on every split;
    # whether fit() restarts from fresh weights is not visible here - confirm.
    linreg.fit(X_train_scaled, y_train, epochs=10, learning_rate=1e-5)
    y_train_pred = linreg.predict(X_train_scaled)
    y_test_pred = linreg.predict(X_test_scaled)
    fold_reports.append({
        "Mean_train": np.mean(y_train),
        "STD_train": np.std(y_train),
        "Mean_test": np.mean(y_test),
        "STD_test": np.std(y_test),
        "R2_train": r2(y_train, y_train_pred),
        "RMSE_train": rmse(y_train, y_train_pred),
        "R2_test": r2(y_test, y_test_pred),
        "RMSE_test": rmse(y_test, y_test_pred),
        # NOTE(review): in the recorded run the value stored under f0 closely
        # tracks Mean_train in every split, which is what an intercept would
        # look like. If linreg.weights starts with a bias term, every label
        # here is shifted by one and zip() silently drops the last weight -
        # confirm against LinRegSGD before interpreting the coefficients.
        **dict(zip(feature_list, linreg.weights)),
    })

# Passing `columns` keeps the original column order and still yields an empty,
# correctly labelled frame if there were no splits.
report_table = pd.DataFrame(fold_reports, columns=report_columns)

In [8]:
# Use the fully qualified option name. The abbreviated "max_columns" pattern
# also matches "styler.render.max_columns" on pandas >= 1.4, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 100)
# Show the per-split report as this cell's output.
report_table

Unnamed: 0,Mean_train,STD_train,Mean_test,STD_test,R2_train,RMSE_train,R2_test,RMSE_test,f0,f1,f2,f3,f29,f30,f31,f32,f33,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52
0,7.511508,36.89942,6.568323,29.191288,0.279596,31.318976,0.305904,24.319978,7.511292,-0.636122,-0.540764,1.906295,-0.705617,0.704122,11.702484,2.773536,1.399014,7.851732,0.103129,1.273193,0.136089,0.802247,-0.16031,-0.133653,-0.116215,0.322573,0.15849,0.29211,-0.055488,-0.1122,0.192288,0.122476,0.518301,0.069071,0.100778
1,7.246467,34.60701,7.628571,38.838852,0.277589,29.41414,0.305484,32.367388,7.241445,-0.826797,-0.564349,1.722104,-0.593991,0.574545,10.691627,2.645983,1.273444,7.173087,0.105426,1.058775,0.136089,0.732607,-0.229229,-0.079248,-0.319199,0.296911,0.260483,0.393845,-0.013444,-0.185834,0.164157,0.041834,0.514056,0.164877,0.283774
2,7.30819,35.32366,7.381685,36.16783,0.286742,29.832457,0.273655,30.824357,7.310188,-0.76319,-0.635152,1.786447,-0.620054,0.801113,10.949775,2.705721,1.357302,7.441277,0.017394,0.934069,0.136089,0.819269,-0.391954,-0.217472,-0.177063,0.441028,0.334416,0.402845,-0.117773,-0.306975,0.295369,0.245597,0.560352,0.109026,0.084708
3,7.141793,35.072705,8.047253,37.123085,0.296963,29.407526,0.073874,35.725568,7.139402,-0.804164,-0.7549,2.340437,-0.526445,-0.004228,10.596094,2.402397,0.659815,7.315657,0.185658,4.020961,0.136089,0.818649,-0.070419,-0.263089,-0.093997,0.256426,0.215913,0.388025,-0.12043,-0.096438,0.05335,0.202821,0.503158,0.129906,0.188359
4,7.406484,35.524766,6.988523,35.36928,0.29107,29.911142,0.254039,30.548101,7.380334,-0.929899,-0.617482,1.920377,-0.549084,0.657882,10.944742,2.663988,1.311458,7.485884,0.087471,0.815277,0.136089,0.785147,-0.267227,-0.458315,-0.200186,0.335832,0.379749,0.416813,0.090672,-0.309505,0.000431,0.329329,0.716608,0.116663,0.083517


In [9]:
# Average the held-out metrics over all rows of the report and print them.
mean_test_rmse = report_table['RMSE_test'].mean()
mean_test_r2 = report_table['R2_test'].mean()
print(f"Mean test RMSE: {mean_test_rmse}")
print(f"Mean test R2: {mean_test_r2}")

Mean test RMSE: 30.75707862050203
Mean test R2: 0.2425909361071666
