In [1]:
import sys
import os
import warnings
# Cap OpenBLAS at one thread; set here so the variable is already in the environment
# when numpy is imported in the next cell.
# NOTE(review): presumably done to avoid thread oversubscription during the ALS fits below — confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import f1_score

In [3]:
def age_bucket(x):
    """Return the ordinal age-bucket index for age ``x``.

    The result is the number of boundaries in ``[0, 25, 35, 45, 55, 65]`` that
    are strictly smaller than ``x``:

    * ``x <= 0``        -> 0
    * ``0 < x <= 25``   -> 1
    * ``25 < x <= 35``  -> 2
    * ``35 < x <= 45``  -> 3
    * ``45 < x <= 55``  -> 4
    * ``55 < x <= 65``  -> 5
    * ``x > 65``        -> 6
    """
    upper_bounds = [0, 25, 35, 45, 55, 65]
    # bisect_left puts a value equal to a boundary into the lower bucket
    bucket_index = bisect.bisect_left(upper_bounds, x)
    return bucket_index

In [4]:
# Load the raw competition dataset as a pyarrow Table; it is kept in Arrow form
# because the cells below call .select() / .group_by() on it directly.
data = pq.read_table('competition_data_final_pqt')

In [5]:
# Load the training labels into pandas; the preview below shows the columns
# age, is_male and user_id.
targets = pq.read_table('public_train.pqt').to_pandas()
targets.head()

Unnamed: 0,age,is_male,user_id
350459,31.0,1,350459
188276,35.0,1,188276
99002,41.0,0,99002
155506,33.0,0,155506
213873,54.0,0,213873


# userid_urlhost_requestcnt_sum

In [6]:
# ALS user embeddings built from the summed request_cnt per (user_id, url_host) pair.
# The aggregate column produced by pyarrow is named 'request_cnt_sum'.
data_agg = data.select(['user_id', 'url_host', 'request_cnt']).group_by(['user_id', 'url_host']).aggregate([('request_cnt', "sum")])
# Convert to pandas once instead of re-materialising a column for every lookup below.
agg_df = data_agg.to_pandas()

# Dense 0..n-1 indices for hosts and users. Keys are enumerated in sorted order so the
# assignment does not depend on set iteration order (string hashes are randomised per
# interpreter process); otherwise the ALS result changes between kernel restarts even
# with random_state fixed.
# NOTE(review): sorted() assumes the keys are mutually comparable (no nulls) — confirm.
item_set = set(agg_df['url_host'])
item_dict = {url: idurl for idurl, url in enumerate(sorted(item_set))}

usr_set = set(agg_df['user_id'])
usr_dict = {usr: user_idx for user_idx, usr in enumerate(sorted(usr_set))}
inv_usr_map = {v: k for k, v in usr_dict.items()}

values = np.array(agg_df['request_cnt_sum'])
rows = np.array(agg_df['user_id'].map(usr_dict))
cols = np.array(agg_df['url_host'].map(item_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
# Convert to CSR once up front so the ten fit() calls below do not each repeat the conversion.
mat_csr = mat.tocsr()

# ALS settings shared by the grid search and the final fit.
als_kwargs = dict(regularization=0.01, random_state=42, alpha=40, use_gpu=False)

# Grid-search (factors, iterations): each embedding is scored by the weighted F1 of a
# CatBoost classifier predicting the age bucket on a held-out 33% of the labelled users.
# Start from -inf / None so the first candidate is always recorded and a best_params
# left over from a previously executed cell can never be reused by accident.
max_f1 = float('-inf')
best_params = None
for n_factors in [64, 128, 256]:
    for n_iter in [30, 40, 50]:
        model = implicit.als.AlternatingLeastSquares(factors=n_factors, iterations=n_iter, **als_kwargs)
        model.fit(mat_csr, show_progress=False)
        u_factors = model.user_factors

        # Row k of the factor matrix belongs to the user with dense index k.
        usr_emb = pd.DataFrame(u_factors)
        usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)

        # Keep only labelled users with a usable age, then turn age into a bucket label.
        df = targets.merge(usr_emb, how='inner', on=['user_id'])
        df = df[df['age'] != 'NA']
        df = df.dropna()
        df = df.assign(age=df['age'].map(age_bucket))

        x_train, x_test, y_train, y_test = train_test_split(df.drop(['user_id', 'age', 'is_male'], axis=1), df['age'], test_size=0.33, random_state=42)
        clf = CatBoostClassifier(task_type="GPU", devices='0:1')
        clf.fit(x_train, y_train, verbose=False)
        y_pred = clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Strict comparison: on a tie the earlier (smaller) configuration wins.
        if max_f1 < f1:
            max_f1 = f1
            best_params = {'factors': n_factors, 'iterations': n_iter}

print(best_params)
print(max_f1)

# Refit with the winning configuration and export one named column per latent factor.
model = implicit.als.AlternatingLeastSquares(factors=best_params['factors'], iterations=best_params['iterations'], **als_kwargs)
model.fit(mat_csr, show_progress=False)
u_factors = model.user_factors
usr_emb = pd.DataFrame(u_factors, columns=[f"userid_urlhost_requestcnt_sum_{k}" for k in range(u_factors.shape[1])])
usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('als', exist_ok=True)
usr_emb.to_csv('als/userid_urlhost_requestcnt_sum.csv', index=False)

{'factors': 256, 'iterations': 40}
0.4030627652108856


# userid_urlhost_date_count

In [7]:
# ALS user embeddings built from the count of date entries per (user_id, url_host) pair.
# The aggregate column produced by pyarrow is named 'date_count'.
data_agg = data.select(['user_id', 'url_host', 'date']).group_by(['user_id', 'url_host']).aggregate([('date', "count")])
# Convert to pandas once instead of re-materialising a column for every lookup below.
agg_df = data_agg.to_pandas()

# Dense 0..n-1 indices for hosts and users. Keys are enumerated in sorted order so the
# assignment does not depend on set iteration order (string hashes are randomised per
# interpreter process); otherwise the ALS result changes between kernel restarts even
# with random_state fixed.
# NOTE(review): sorted() assumes the keys are mutually comparable (no nulls) — confirm.
item_set = set(agg_df['url_host'])
item_dict = {url: idurl for idurl, url in enumerate(sorted(item_set))}

usr_set = set(agg_df['user_id'])
usr_dict = {usr: user_idx for user_idx, usr in enumerate(sorted(usr_set))}
inv_usr_map = {v: k for k, v in usr_dict.items()}

values = np.array(agg_df['date_count'])
rows = np.array(agg_df['user_id'].map(usr_dict))
cols = np.array(agg_df['url_host'].map(item_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
# Convert to CSR once up front so the ten fit() calls below do not each repeat the conversion.
mat_csr = mat.tocsr()

# ALS settings shared by the grid search and the final fit.
als_kwargs = dict(regularization=0.01, random_state=42, alpha=40, use_gpu=False)

# Grid-search (factors, iterations): each embedding is scored by the weighted F1 of a
# CatBoost classifier predicting the age bucket on a held-out 33% of the labelled users.
# Start from -inf / None so the first candidate is always recorded and a best_params
# left over from a previously executed cell can never be reused by accident.
max_f1 = float('-inf')
best_params = None
for n_factors in [64, 128, 256]:
    for n_iter in [30, 40, 50]:
        model = implicit.als.AlternatingLeastSquares(factors=n_factors, iterations=n_iter, **als_kwargs)
        model.fit(mat_csr, show_progress=False)
        u_factors = model.user_factors

        # Row k of the factor matrix belongs to the user with dense index k.
        usr_emb = pd.DataFrame(u_factors)
        usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)

        # Keep only labelled users with a usable age, then turn age into a bucket label.
        df = targets.merge(usr_emb, how='inner', on=['user_id'])
        df = df[df['age'] != 'NA']
        df = df.dropna()
        df = df.assign(age=df['age'].map(age_bucket))

        x_train, x_test, y_train, y_test = train_test_split(df.drop(['user_id', 'age', 'is_male'], axis=1), df['age'], test_size=0.33, random_state=42)
        clf = CatBoostClassifier(task_type="GPU", devices='0:1')
        clf.fit(x_train, y_train, verbose=False)
        y_pred = clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Strict comparison: on a tie the earlier (smaller) configuration wins.
        if max_f1 < f1:
            max_f1 = f1
            best_params = {'factors': n_factors, 'iterations': n_iter}

print(best_params)
print(max_f1)

# Refit with the winning configuration and export one named column per latent factor.
model = implicit.als.AlternatingLeastSquares(factors=best_params['factors'], iterations=best_params['iterations'], **als_kwargs)
model.fit(mat_csr, show_progress=False)
u_factors = model.user_factors
usr_emb = pd.DataFrame(u_factors, columns=[f"userid_urlhost_date_count_{k}" for k in range(u_factors.shape[1])])
usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('als', exist_ok=True)
usr_emb.to_csv('als/userid_urlhost_date_count.csv', index=False)

{'factors': 256, 'iterations': 40}
0.409860708444737


# userid_regionname_requestcnt_count

In [None]:
# ALS user embeddings built from the count of request_cnt entries per (user_id, region_name) pair.
# The aggregate column produced by pyarrow is named 'request_cnt_count'.
data_agg = data.select(['user_id', 'region_name', 'request_cnt']).group_by(['user_id', 'region_name']).aggregate([('request_cnt', "count")])
# Convert to pandas once instead of re-materialising a column for every lookup below.
agg_df = data_agg.to_pandas()

# Dense 0..n-1 indices for regions and users. Keys are enumerated in sorted order so the
# assignment does not depend on set iteration order (string hashes are randomised per
# interpreter process); otherwise the ALS result changes between kernel restarts even
# with random_state fixed.
# NOTE(review): sorted() assumes the keys are mutually comparable (no nulls) — confirm.
item_set = set(agg_df['region_name'])
item_dict = {region: idregion for idregion, region in enumerate(sorted(item_set))}

usr_set = set(agg_df['user_id'])
usr_dict = {usr: user_idx for user_idx, usr in enumerate(sorted(usr_set))}
inv_usr_map = {v: k for k, v in usr_dict.items()}

values = np.array(agg_df['request_cnt_count'])
rows = np.array(agg_df['user_id'].map(usr_dict))
cols = np.array(agg_df['region_name'].map(item_dict))
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
# Convert to CSR once up front so the ten fit() calls below do not each repeat the conversion.
mat_csr = mat.tocsr()

# ALS settings shared by the grid search and the final fit.
als_kwargs = dict(regularization=0.01, random_state=42, alpha=40, use_gpu=False)

# Grid-search (factors, iterations): each embedding is scored by the weighted F1 of a
# CatBoost classifier predicting the age bucket on a held-out 33% of the labelled users.
# Start from -inf / None so the first candidate is always recorded and a best_params
# left over from a previously executed cell can never be reused by accident.
max_f1 = float('-inf')
best_params = None
for n_factors in [64, 128, 256]:
    for n_iter in [30, 40, 50]:
        model = implicit.als.AlternatingLeastSquares(factors=n_factors, iterations=n_iter, **als_kwargs)
        model.fit(mat_csr, show_progress=False)
        u_factors = model.user_factors

        # Row k of the factor matrix belongs to the user with dense index k.
        usr_emb = pd.DataFrame(u_factors)
        usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)

        # Keep only labelled users with a usable age, then turn age into a bucket label.
        df = targets.merge(usr_emb, how='inner', on=['user_id'])
        df = df[df['age'] != 'NA']
        df = df.dropna()
        df = df.assign(age=df['age'].map(age_bucket))

        x_train, x_test, y_train, y_test = train_test_split(df.drop(['user_id', 'age', 'is_male'], axis=1), df['age'], test_size=0.33, random_state=42)
        clf = CatBoostClassifier(task_type="GPU", devices='0:1')
        clf.fit(x_train, y_train, verbose=False)
        y_pred = clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Strict comparison: on a tie the earlier (smaller) configuration wins.
        if max_f1 < f1:
            max_f1 = f1
            best_params = {'factors': n_factors, 'iterations': n_iter}

print(best_params)
print(max_f1)

# Refit with the winning configuration and export one named column per latent factor.
model = implicit.als.AlternatingLeastSquares(factors=best_params['factors'], iterations=best_params['iterations'], **als_kwargs)
model.fit(mat_csr, show_progress=False)
u_factors = model.user_factors
usr_emb = pd.DataFrame(u_factors, columns=[f"userid_regionname_requestcnt_count_{k}" for k in range(u_factors.shape[1])])
usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('als', exist_ok=True)
usr_emb.to_csv('als/userid_regionname_requestcnt_count.csv', index=False)

{'factors': 64, 'iterations': 40}
0.24179573107470356


# part_of_day

In [78]:
import pyarrow as pa

# Per-user sums of one-hot part_of_day indicators, i.e. how many rows each user has in
# each part of the day.
# The dummy dtype is pinned to uint8 so the summed columns have the same integer type
# regardless of the installed pandas version (pandas >= 2.0 would otherwise emit bool columns).
part_of_day_dummies = pd.get_dummies(
    data.select(['user_id', 'part_of_day']).to_pandas(),
    columns=['part_of_day'],
    dtype=np.uint8,
)
# Sum every generated indicator column instead of hard-coding the expected category names:
# a category missing from the data no longer raises, and an unexpected one is not dropped.
# For the categories day / evening / morning / night this yields the same columns as before.
dummy_cols = [col for col in part_of_day_dummies.columns if col != 'user_id']
part_of_day_agg = pa.Table.from_pandas(part_of_day_dummies).\
    group_by(['user_id']).aggregate([(col, 'sum') for col in dummy_cols])
part_of_day_agg = part_of_day_agg.to_pandas()
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('als', exist_ok=True)
part_of_day_agg.to_csv("als/partofday_sum.csv", index=False)