In [98]:
from pathlib import Path

In [99]:
import pandas as pd

In [100]:
%matplotlib inline

In [101]:
path = Path("./data/train/005.csv")
df = pd.read_csv(path, header=None)

In [102]:
def rename(df):
    columns = {
        0: "popularity",  # Popularity or support for the source of the document,
        1: "checkins",  # How many individuals so far visited this place,
        2: "interest",  # Daily interest of individuals towards source of the post,
        3: "category",  # Category of the source of the document, e.g. place, institution, brand,
        29: "comments_before",  # Total number of comments before selected base datetime,
        30: "comments_last_24h",  # Number of comments in last 24 hours, relative to base datetime,
        31: "comments_last_48h",  # Number of comments in last 48 to last 24 hours relative to base datetime,
        32: "comments_first_24h",
        # Number of comments in the first 24 hours after the publication of post but before base datetime,
        34: "time",  # Selected time in order to simulate the scenario,
        35: "length",  # Character length of the post,
        36: "shares",  # No. of shares of the post,
        37: "is_promoted",  # Whether the post is promoted,
        38: "hours",  # `H` hours, for which we have comments received,
        39: "published_monday",
        40: "published_tuesday",
        41: "published_wednesday",
        42: "published_thursday",
        43: "published_friday",
        44: "published_saturday",
        45: "published_sunday",
        46: "current_monday",
        47: "current_tuesday",
        48: "current_wednesday",
        49: "current_thursday",
        50: "current_friday",
        51: "current_saturday",
        52: "current_sunday",
        53: "comments",  # Target: no. of comments in next `H` hrs.
    }
    df = df.rename(columns=columns)
    df["category"] = df["category"].astype("category")
    
    return df

In [103]:
df = rename(df)

In [104]:
X_train, y_train = df.loc[:, df.columns != "comments"], df["comments"]

In [105]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [106]:
pipeline = make_pipeline(
    StandardScaler(),
    # VarianceThreshold(),
    SGDRegressor(),
)

In [107]:
pipeline.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [108]:
path = Path("./data/test/001.csv")
df = pd.read_csv(path, header=None)

In [109]:
df = rename(df)

In [110]:
X_test, y_test = df.loc[:, df.columns != "comments"], df["comments"]

In [111]:
pipeline.score(X_test, y_test)



-3.4062643914401024e+16