In [36]:
import pandas as pd
from settings import RAW_PATH
from catboost import CatBoostRegressor, Pool
import numpy as np
from settings import DATA_PATH
from src.utils import loads, dump
from sklearn.preprocessing import MultiLabelBinarizer
from src.funs import encode_list_by_rate, str_to_list
from sklearn.metrics import r2_score

In [37]:
def prepare(df, is_train):
    df.sort_values('publish_date', inplace=True)
    df['Time'] = np.arange(len(df.index))

    if is_train:
        df.loc[df['full_reads_percent'] > 100, 'full_reads_percent'] = np.nan
        df['full_reads_percent'].fillna((df['full_reads_percent'].mean()), inplace=True)


    df = encode_list_by_rate(df, 'authors', 0.03)

    df = df.apply(lambda row: str_to_list(row, 'tags'), axis=1)

    tags_encoder = MultiLabelBinarizer() if is_train else loads(DATA_PATH / "tags_encoder.pickle")
    tags = tags_encoder.fit_transform(df['tags']) if is_train else tags_encoder.transform(
        df['tags'])
    tags_feat_names = ['tags_' + str(cls) for cls in list(tags_encoder.classes_)]
    tags_df = pd.DataFrame(tags, columns=tags_feat_names)
    df = df.merge(tags_df, left_index=True, right_index=True)
    df = df.drop('tags', axis=1)
    if is_train:
        dump(DATA_PATH / "tags_encoder.pickle", tags_encoder)

    df['day'] = pd.to_datetime(df['publish_date']).dt.strftime("%d").astype(int)
    df['month'] = pd.to_datetime(df['publish_date']).dt.strftime("%m").astype(int)
    df['hour'] = pd.to_datetime(df['publish_date']).dt.strftime("%H").astype(int)
    return df


In [38]:
df_train = pd.read_csv(RAW_PATH / "train.csv", parse_dates=['publish_date'])
df_test = pd.read_csv(RAW_PATH / "test_v.csv", parse_dates=['publish_date'])


df_train=prepare(df_train, True)
df_test=prepare(df_test, False)

cat_features= ["category"]

x_cols_drop = ["views", "depth", "full_reads_percent", "publish_date", "session", "document_id", 'title', 'Time']
y_cols = ["views"]

X_train = df_train.drop(x_cols_drop, axis=1)
y_train = df_train[y_cols]

X_test = df_test.drop(["publish_date", "session", "document_id", 'title', 'Time', 'real_views'], axis=1)
y_test = df_test['real_views']




In [39]:
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

In [40]:
model = CatBoostRegressor(score_function="Cosine")

In [41]:
model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.069116
0:	learn: 94972.8834046	test: 5084503.2306717	best: 5084503.2306717 (0)	total: 27.3ms	remaining: 27.2s
1:	learn: 94363.9888735	test: 5084507.5742506	best: 5084503.2306717 (0)	total: 52.8ms	remaining: 26.4s
2:	learn: 93737.4772785	test: 5084510.6818229	best: 5084503.2306717 (0)	total: 83.2ms	remaining: 27.7s
3:	learn: 93284.8868768	test: 5084514.9277309	best: 5084503.2306717 (0)	total: 112ms	remaining: 27.9s
4:	learn: 92688.9130642	test: 5084519.5539959	best: 5084503.2306717 (0)	total: 141ms	remaining: 28s
5:	learn: 92143.2606833	test: 5084523.9238670	best: 5084503.2306717 (0)	total: 170ms	remaining: 28.1s
6:	learn: 91728.7855854	test: 5084526.8591872	best: 5084503.2306717 (0)	total: 198ms	remaining: 28.1s
7:	learn: 91212.5977463	test: 5084531.0926927	best: 5084503.2306717 (0)	total: 224ms	remaining: 27.8s
8:	learn: 90675.5958980	test: 5084535.0792023	best: 5084503.2306717 (0)	total: 251ms	remaining: 27.7s
9:	learn: 90153.7703498	test: 5084538.0395435	best:

<catboost.core.CatBoostRegressor at 0x20a2adfa4f0>

In [42]:
train_y_pred = model.predict(train_pool)
test_y_pred = model.predict(test_pool)

In [43]:
r2_score(y_train,train_y_pred)

0.9325952869974405

In [44]:
r2_score(y_test,test_y_pred)

0.0034122289362918945