In [16]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from settings import RAW_PATH
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import category_encoders as ce
from pandas import Series
from pandas import DataFrame
import ast
from typing import Dict, List
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read both raw splits; the first CSV column is used as the row index.
df_train, df_test = (
    pd.read_csv(RAW_PATH / csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

## Preprocess data

In [3]:
def encode_dummies(df:DataFrame, col_name:str)->DataFrame:
    """Swap ``col_name`` for one indicator column per distinct value.

    Args:
        df: Frame that contains ``col_name``; it is not modified.
        col_name: Column to expand. The new columns are named
            ``{col_name}_{value}``.

    Returns:
        A new frame without ``col_name`` and with the indicator columns
        joined on the index after the remaining columns.
    """
    # dummy_na stays at its default (False); the author's original note says
    # no NaN values were found in the column.
    indicator_cols = pd.get_dummies(df[col_name], prefix=col_name)
    return df.drop(columns=col_name).join(indicator_cols)

In [4]:
def encode_list_by_rate(df: DataFrame, col_name: str, rate_limit: float) -> DataFrame:
    """Encode a column of (stringified) lists as one integer code per row.

    Every cell of ``col_name`` is parsed with ``ast.literal_eval``; cells that
    already are a list, tuple or set are used as they are. The distinct list
    elements are ranked by their share among all elements of the column
    (most frequent first). An element keeps the code ``rank * 10`` as long as
    its share is at least ``rate_limit``. A row's code is the sum of the codes
    of the kept elements it contains, or ``-1`` when its list is empty.

    Only ``col_name`` is processed (column-wise), so the input frame is never
    mutated and the dtypes of all other columns are preserved.

    Args:
        df: Frame that contains ``col_name``.
        col_name: Column holding list literals such as ``"['a', 'b']"``.
        rate_limit: Minimum share (0..1) an element needs to get its own code.

    Returns:
        A copy of ``df`` in which ``col_name`` holds the integer codes.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            cell is not a valid Python literal (this includes NaN cells).
    """
    empty_code = -1

    def str_to_list(cell):
        # Containers that are already parsed pass through unchanged, so the
        # function also works on pre-parsed data.
        if isinstance(cell, (list, tuple, set)):
            return cell
        return ast.literal_eval(cell)

    def get_col_encode_dict(lists: Series, rate_limit: float) -> Dict[str, int]:
        # value_counts sorts by descending frequency, so the first element
        # below the limit means every later one is below it as well.
        col_value_rates = lists.explode().value_counts(normalize=True)
        col_encode_dict = {}
        for index, (col_value, rate) in enumerate(col_value_rates.items()):
            if rate < rate_limit:
                break
            # NOTE(review): rank 0 maps to code 0, so the most frequent
            # element adds nothing to a row's code, and sums can coincide
            # (10 + 20 == 30). Kept as is so existing feature values do not
            # change; a collision-free scheme would need distinct bits.
            col_encode_dict[col_value] = index * 10
        return col_encode_dict

    def encode(values, encode_dict: Dict[str, int], empty_code: int) -> int:
        if len(values) == 0:
            return empty_code
        return sum(code for col_value, code in encode_dict.items() if col_value in values)

    parsed = df[col_name].map(str_to_list)
    col_encode_dict = get_col_encode_dict(parsed, rate_limit)

    encoded = df.copy()
    encoded[col_name] = parsed.map(lambda values: encode(values, col_encode_dict, empty_code))
    return encoded

In [5]:
# One-hot encode the article category into category_* indicator columns.
df_train = encode_dummies(df=df_train, col_name='category')

In [6]:
# Collapse each authors list into one integer code; 0.03 is the minimum share
# an author needs to receive a code of their own.
df_train = encode_list_by_rate(df=df_train, col_name='authors', rate_limit=0.03)

In [7]:
# Same encoding for tags, with a lower minimum share (0.01) per tag.
df_train = encode_list_by_rate(df=df_train, col_name='tags', rate_limit=0.01)

In [17]:
# Fit a TF-IDF vocabulary on the article titles and show the resulting sparse
# matrix. The matrix is only displayed here; it is not stored in a variable.
vectorizer = TfidfVectorizer()
title_texts = df_train['title']
vectorizer.fit_transform(title_texts)

<7000x14407 sparse matrix of type '<class 'numpy.float64'>'
	with 66534 stored elements in Compressed Sparse Row format>

In [8]:
# Parse the publication timestamp once and read the calendar fields through
# the .dt accessor instead of formatting to text and casting back to int.
publish_dt = pd.to_datetime(df_train['publish_date'])
df_train['day'] = publish_dt.dt.day.astype(int)
df_train['month'] = publish_dt.dt.month.astype(int)

In [9]:
# Inspect the training frame after the encoding and date-feature cells.
df_train

Unnamed: 0_level_0,title,publish_date,session,authors,ctr,tags,views,depth,full_reads_percent,category_5409f11ce063da9c8b588a12,category_5409f11ce063da9c8b588a13,category_5409f11ce063da9c8b588a18,category_540d5eafcbb20f2524fc0509,category_540d5ecacbb20f2524fc050a,category_5433e5decbb20f277b20eca9,category_552e430f9a79475dd957f8b3,category_5e54e2089a7947f63a801742,category_5e54e22a9a7947f560081ea2,day,month
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,-1,1.580,50,20460,1.134,35.850,0,0,1,0,0,0,0,0,0,4,4
620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,2022-02-18 10:00:39,KtVJsteHStO5oditt3Uvzw,0,1.853,30,19038,1.142,38.355,1,0,0,0,0,0,0,0,0,18,2
620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,2022-02-12 04:24:02,hk7puWJwSziw0m3sfTkKWA,-1,0.000,0,51151,1.185,36.424,1,0,0,0,0,0,0,0,0,12,2
6262a5889a79470b78c9ca307UKY2SSZTjCcjhwBzxw37w,Песков назвал прагматичной выдачу лицензии Газ...,2022-04-22 13:24:55,7UKY2SSZTjCcjhwBzxw37w,-1,0.000,0,3782,1.053,30.169,1,0,0,0,0,0,0,0,0,22,4
626678929a79477ca0101568wuMYES90REuV5YhrN75IXg,В Хабаровске задержали главу филиала РАНХиГС п...,2022-04-25 10:42:23,wuMYES90REuV5YhrN75IXg,-1,0.000,0,3065,1.063,34.617,0,0,0,0,0,1,0,0,0,25,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61fd479c9a7947848183694751RC_1i5QlGuv7B2FzwTjw,"Открытие Олимпиады, новые китайские контракты ...",2022-02-04 16:19:09,51RC_1i5QlGuv7B2FzwTjw,0,2.354,30,21738,1.188,28.448,0,0,0,0,0,1,0,0,0,4,2
628a8f1c9a79478c64a7dae450tIkSGVSf2O8XQgk_uaHQ,Саудовская Аравия выразила надежду на соглашен...,2022-05-22 20:07:46,50tIkSGVSf2O8XQgk_uaHQ,0,4.112,50,8377,1.056,38.045,0,0,0,1,0,0,0,0,0,22,5
6244cf5e9a79479099346b26tKlqoxQYSmij0ZBrGzebug,Брюс Уиллис завершит актерскую карьеру из-за б...,2022-03-30 21:59:02,tKlqoxQYSmij0ZBrGzebug,-1,5.125,0,37004,1.176,50.757,0,0,0,0,0,1,0,0,0,30,3
627303699a794749b8116abc12JX_3ZDTDu2XHF3k2eJVg,Экс-посол Франции назвал болезненным для Европ...,2022-05-04 23:15:48,12JX_3ZDTDu2XHF3k2eJVg,-1,0.000,0,3450,1.068,38.928,0,1,0,0,0,0,0,0,0,4,5


In [10]:
# Weight of each target's R^2 in the combined (multi-target) score.
score_dict = {"views":0.4, "depth":0.3,"full_reads_percent":0.3}

def calculate_score(y_true: DataFrame, y_pred, y_cols: List[str]) -> float:
    """Return the R^2-based score of ``y_pred`` against ``y_true``.

    With a single target the plain R^2 of that column is returned (weight 1).
    With several targets each column's R^2 is multiplied by its weight in
    ``score_dict`` and the products are summed.

    Args:
        y_true: Frame with one column per name in ``y_cols``.
        y_pred: Predictions as any array-like. For several targets it must be
            2-D with columns in the same order as ``y_cols``; for a single
            target it is flattened to 1-D.
        y_cols: Target column names that were predicted.

    Returns:
        The (weighted) R^2 score; ``0`` when ``y_cols`` is empty.

    Raises:
        KeyError: If several targets are given and one is not in ``score_dict``.
    """
    # Coerce once so positional slicing also works for lists / DataFrames.
    predictions = np.asarray(y_pred)

    if len(y_cols) == 1:
        col_name = y_cols[0]
        # Series.ravel() is deprecated in pandas 2.2; to_numpy() is the
        # supported way to obtain the 1-D values.
        return r2_score(y_true[col_name].to_numpy(), predictions.ravel())

    return sum(
        score_dict[col_name] * r2_score(y_true[col_name], predictions[:, i])
        for i, col_name in enumerate(y_cols)
    )

In [11]:
# Targets are the three engagement metrics; the features are every remaining
# column except the raw text, timestamp and session columns.
y = df_train[["views","depth","full_reads_percent"]]
X = df_train.drop(
    columns=["views","depth","full_reads_percent","title","publish_date", "session"]
)

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [13]:
def train_score(y_cols:List[str], X_train, X_test, y_train, y_test, random_state: int = 42, param_grid=None):
    """Grid-search a random forest for ``y_cols`` and report how it performs.

    Prints, in this order: the target names, the best cross-validation score,
    the best hyper-parameters and the hold-out score from ``calculate_score``.
    Finally displays the feature importances of the best model, sorted in
    descending order.

    Args:
        y_cols: Target column(s) to model.
        X_train: Training features.
        X_test: Hold-out features.
        y_train: Training targets; must contain ``y_cols``.
        y_test: Hold-out targets; must contain ``y_cols``.
        random_state: Seed for the forest so repeated runs give the same
            numbers.
        param_grid: Optional grid for ``GridSearchCV``; when omitted the
            notebook's default grid is used.

    Returns:
        None. Results are printed / displayed.
    """
    y_train = y_train[y_cols]
    y_test = y_test[y_cols]

    # Flatten a single target to 1-D so the forest is fit as a single-output
    # regressor and predict() returns a 1-D array.
    if y_train.shape[1] == 1:
        y_train = y_train.values.ravel()

    if param_grid is None:
        param_grid = {
            "n_estimators"      : [100, 500, 1000],
            # 1.0 means "use all features", which is what "auto" stood for in
            # RandomForestRegressor; "auto" itself was deprecated in
            # scikit-learn 1.1 and removed in 1.3.
            "max_features"      : [1.0, "sqrt", "log2"],
            "min_samples_split" : [2, 4, 5],
            "bootstrap": [True, False],
        }

    estimator = RandomForestRegressor(random_state=random_state)
    grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5)
    grid.fit(X_train, y_train)
    print(y_cols)
    print(grid.best_score_)
    print(grid.best_params_)

    # GridSearchCV refits the best configuration on all of X_train by default
    # (refit=True), so a second manual fit is not needed.
    regr = grid.best_estimator_

    pred = regr.predict(X_test)
    score = calculate_score(y_test, pred, y_cols)
    print(score)

    col_name = 'importance'
    importance_df = pd.DataFrame(
        regr.feature_importances_,
        columns=[col_name],
        index=regr.feature_names_in_,
    ).sort_values(by=col_name, ascending=False)
    display(importance_df)
    

In [14]:
# Model each target on its own, then all three jointly. Every run receives
# copies of the shared splits.
for y_cols in (
    ["views"],
    ["depth"],
    ["full_reads_percent"],
    ["views", "depth", "full_reads_percent"],
):
    train_score(
        y_cols,
        X_train=X_train.copy(),
        X_test=X_test.copy(),
        y_train=y_train.copy(),
        y_test=y_test.copy(),
    )

['views']
0.6278949174722206
{'bootstrap': False, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 100}
0.7386030044517422


Unnamed: 0,importance
tags,0.373762
ctr,0.337627
day,0.105631
month,0.103098
authors,0.062591
category_5409f11ce063da9c8b588a12,0.004959
category_540d5eafcbb20f2524fc0509,0.004943
category_5433e5decbb20f277b20eca9,0.00315
category_5409f11ce063da9c8b588a13,0.002513
category_5409f11ce063da9c8b588a18,0.001018


['depth']
0.7562070992426696
{'bootstrap': True, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 500}
0.7729286244573383


Unnamed: 0,importance
month,0.447816
day,0.187491
authors,0.118325
tags,0.11742
ctr,0.108675
category_5433e5decbb20f277b20eca9,0.007942
category_5409f11ce063da9c8b588a12,0.00462
category_540d5eafcbb20f2524fc0509,0.002646
category_5409f11ce063da9c8b588a13,0.002017
category_5409f11ce063da9c8b588a18,0.001707


['full_reads_percent']
0.2921372722622639
{'bootstrap': True, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 500}
0.2807263906215557


Unnamed: 0,importance
ctr,0.318855
day,0.220355
authors,0.219087
tags,0.091024
month,0.074082
category_5409f11ce063da9c8b588a18,0.018616
category_5433e5decbb20f277b20eca9,0.017592
category_5409f11ce063da9c8b588a12,0.016234
category_540d5ecacbb20f2524fc050a,0.010374
category_540d5eafcbb20f2524fc0509,0.007502


['views', 'depth', 'full_reads_percent']
0.5444827848807738
{'bootstrap': False, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100}
0.6111573666067004


Unnamed: 0,importance
tags,0.40456
ctr,0.287978
day,0.131275
month,0.087858
authors,0.070978
category_540d5eafcbb20f2524fc0509,0.006961
category_5409f11ce063da9c8b588a12,0.003812
category_5433e5decbb20f277b20eca9,0.002746
category_5409f11ce063da9c8b588a13,0.002304
category_5409f11ce063da9c8b588a18,0.000837
