In [16]:
import pandas as pd
from settings import RAW_PATH
from catboost import CatBoostRegressor, Pool
import numpy as np
from settings import DATA_PATH
from src.utils import loads, dump
from sklearn.preprocessing import MultiLabelBinarizer
from src.funs import encode_list_by_rate, str_to_list
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [17]:
def prepare(df, is_train):
    df.sort_values('publish_date', inplace=True)
    df['Time'] = np.arange(len(df.index))

    if is_train:
        df.loc[df['full_reads_percent'] > 100, 'full_reads_percent'] = np.nan
        df['full_reads_percent'].fillna((df['full_reads_percent'].mean()), inplace=True)


    df = encode_list_by_rate(df, 'authors', 0.03)

    df = df.apply(lambda row: str_to_list(row, 'tags'), axis=1)

    tags_encoder = MultiLabelBinarizer() if is_train else loads(DATA_PATH / "tags_encoder.pickle")
    tags = tags_encoder.fit_transform(df['tags']) if is_train else tags_encoder.transform(
        df['tags'])
    tags_feat_names = ['tags_' + str(cls) for cls in list(tags_encoder.classes_)]
    tags_df = pd.DataFrame(tags, columns=tags_feat_names)
    df = df.merge(tags_df, left_index=True, right_index=True)
    df = df.drop('tags', axis=1)
    if is_train:
        dump(DATA_PATH / "tags_encoder.pickle", tags_encoder)

    df['day'] = pd.to_datetime(df['publish_date']).dt.strftime("%d").astype(int)
    df['month'] = pd.to_datetime(df['publish_date']).dt.strftime("%m").astype(int)
    df['hour'] = pd.to_datetime(df['publish_date']).dt.strftime("%H").astype(int)
    return df


In [32]:

df= pd.read_csv(RAW_PATH / "train.csv", parse_dates=['publish_date'])
df=prepare(df, True)

cat_features= ["category"]

x_cols_drop = ["views", "depth", "full_reads_percent", "publish_date", "session", "document_id", 'title', 'Time']
y_col = ["views"]

X =df.drop(x_cols_drop, axis=1)
y=df[y_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [33]:
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

In [34]:
model = CatBoostRegressor(score_function="Cosine")

In [35]:
model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.065352
0:	learn: 88076.4620724	test: 109637.2136057	best: 109637.2136057 (0)	total: 22.2ms	remaining: 22.1s
1:	learn: 87396.7506318	test: 109624.1795893	best: 109624.1795893 (1)	total: 70.4ms	remaining: 35.1s
2:	learn: 86740.9870778	test: 109647.3361107	best: 109624.1795893 (1)	total: 103ms	remaining: 34.3s
3:	learn: 86089.0767266	test: 109689.4356764	best: 109624.1795893 (1)	total: 136ms	remaining: 33.9s
4:	learn: 85510.2366091	test: 109729.0392919	best: 109624.1795893 (1)	total: 166ms	remaining: 33s
5:	learn: 84946.8371653	test: 109767.7152828	best: 109624.1795893 (1)	total: 198ms	remaining: 32.7s
6:	learn: 84350.2092509	test: 109756.7571117	best: 109624.1795893 (1)	total: 224ms	remaining: 31.8s
7:	learn: 83768.6380309	test: 109830.3560624	best: 109624.1795893 (1)	total: 258ms	remaining: 32s
8:	learn: 83204.2418327	test: 109894.4511213	best: 109624.1795893 (1)	total: 284ms	remaining: 31.3s
9:	learn: 82825.2875207	test: 109895.0530685	best: 109624.1795893 (1)	to

<catboost.core.CatBoostRegressor at 0x2a34798b790>

In [36]:
train_y_pred = model.predict(train_pool)
test_y_pred = model.predict(test_pool)

In [37]:
r2_score(y_train,train_y_pred)

0.7475792123099998

In [38]:
r2_score(y_test,test_y_pred)

0.07732128666041205

In [42]:
df_test = pd.read_csv(RAW_PATH / "test_v.csv", parse_dates=['publish_date'])
df_test=prepare(df_test, False)
# df_test=df_test[df_test['real_views']<1_000_000]
X_test = df_test.drop(["publish_date", "session", "document_id", 'title', 'Time', 'real_views'], axis=1)
y_test = df_test['real_views']
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)
test_y_pred = model.predict(test_pool)
r2_score(y_test,test_y_pred)



0.0009483805709832316