# Imports

In [None]:
!pip install fastparquet catboost

In [1]:
import pandas as pd
import os 
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

In [2]:
# Mount Google Drive under /content/drive so the archive, attribute table and
# model files referenced by the later cells are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Extract the competition archive from Drive into /content/.
!unzip "/content/drive/MyDrive/CP/train_dataset_VK.zip" -d "/content/"

Archive:  /content/drive/MyDrive/CP/train_dataset_VK.zip
  inflating: /content/train.csv      
  inflating: /content/Baseline.ipynb  
  inflating: /content/submission.csv  
  inflating: /content/test.csv       


In [2]:
# Read a 10M-row window of /content/test.csv, skipping the header line plus the
# first 25M data rows. Column names are supplied directly to the reader;
# `x3` has no explicit dtype and is left to pandas' inference.
# NOTE(review): the frame is called `train` although it is read from test.csv —
# presumably the labelled rows of the test graphs are used for fitting; confirm.
dtypes = {
    'ego_id': 'category',
    'u': np.int16,
    'v': np.int16,
    't': np.float32,
    'x1': np.float32,
    'x2': np.float32,
}
train = pd.read_csv(
    '/content/test.csv',
    header=None,
    names=['ego_id', 'u', 'v', 't', 'x1', 'x2', 'x3'],
    skiprows=1+25_000_000,
    nrows=10_000_000,
    dtype=dtypes,
)
train

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,1056561955172,146,71,228.100006,8.521509,6.595780,0.0
1,1056561955172,0,62,123.400002,0.023873,0.000000,0.0
2,1056561955172,59,23,123.300003,0.000010,0.000000,1.0
3,1056561955172,99,78,,0.000119,0.000000,0.0
4,1056561955172,35,95,186.300003,1.453455,0.000000,0.0
...,...,...,...,...,...,...,...
9999995,1477468750531,132,149,,0.000000,0.693147,0.0
9999996,1477468750531,9,116,,,1.945910,0.0
9999997,1477468750531,127,132,,,0.000000,0.0
9999998,1477468750531,10,54,59.500000,15.074452,7.853993,0.0


# Feature Engineering

In [3]:
# Load the attribute table that is later joined onto `train` by (ego_id, u).
# Later cells read age, sex, city_id, school and university from it.
# NOTE(review): the dtype map also lists 'v'; presumably attr.csv has no such
# column and the entry is unused — confirm against the file.
dtypes = {'ego_id': 'category', 'u': np.int16, 'v': np.int16, 'age': np.float32, 'sex': np.int8}
df = pd.read_csv('/content/drive/MyDrive/hack/attr.csv', dtype=dtypes)

In [4]:
# Fill missing `t` with the mean `t` of the same ego network.
# The group mean comes from the built-in grouped aggregation instead of a
# Python lambda called once per ego_id, which is far slower on 10M rows.
# Egos whose `t` is entirely missing stay NaN, as before.
ego_mean_t = train.groupby('ego_id')['t'].transform('mean').astype(train['t'].dtype)
train['t'] = train['t'].fillna(ego_mean_t)
del ego_mean_t  # release the temporary 10M-row Series

In [5]:
# Attach node attributes twice: first for endpoint `u`, then for endpoint `v`.
# The second join matches train's (ego_id, v) against the attribute table's
# (ego_id, u), so overlapping columns come back with the `_x` (u side) and
# `_y` (v side) suffixes, including `u_x` / `u_y`.
train = train.merge(df, how='left', on=['ego_id', 'u'])
train = train.merge(df, how='left', left_on=['ego_id', 'v'], right_on=['ego_id', 'u'])

In [6]:
# Cast both sex columns to strings; they are passed to CatBoost as categorical
# features in the training cell.
for sex_col in ('sex_x', 'sex_y'):
    train[sex_col] = train[sex_col].astype(str)

In [7]:
# Restore the original `u` column name and discard the duplicate join key
# (`u_y`) produced by the second merge.
train = train.rename(columns={'u_x': 'u'}).drop(columns='u_y')

In [8]:
def aggregations(df):
    """Add group-wise aggregate feature columns to ``df`` in place.

    Every new column is a ``groupby(...).transform(...)`` result, so it has one
    value per input row. Groups are (ego_id, u), (ego_id, v) or ego_id alone.

    Expects the columns ``ego_id``, ``u``, ``v``, ``t``, ``x1``, ``x2``,
    ``age_x`` and ``age_y``. Returns ``None``; the frame passed in is mutated.
    Columns are created in a fixed order.
    """
    by_u = ['ego_id', 'u']
    by_v = ['ego_id', 'v']
    by_ego = ['ego_id']

    def add(name, keys, source, stat, dtype):
        # One feature: aggregate `source` within `keys`, broadcast back to rows.
        df[name] = df.groupby(keys)[source].transform(stat).astype(dtype)

    # Per-endpoint statistics of x2 and of both age columns (stored as float16).
    for stat in ('mean', 'median', 'max', 'std'):
        for col in ('x2', 'age_x', 'age_y'):
            add(f'u_{stat}_{col}', by_u, col, stat, np.float16)
            add(f'v_{stat}_{col}', by_v, col, stat, np.float16)

    # Per-endpoint statistics of t (stored as float16).
    for stat in ('mean', 'median', 'std'):
        add(f'u_{stat}_time', by_u, 't', stat, np.float16)
        add(f'v_{stat}_time', by_v, 't', stat, np.float16)

    # Number of non-null t values per endpoint.
    add('u_count', by_u, 't', 'count', np.float32)
    add('v_count', by_v, 't', 'count', np.float32)

    # Statistics of the opposite endpoint's count within each endpoint group.
    # The column labels are historical ('u_mean_median' is a median, etc.).
    for label, stat in (
        ('count_mean', 'mean'),
        ('mean_median', 'median'),
        ('mean_std', 'std'),
        ('mean_max', 'max'),
    ):
        add('u_' + label, by_u, 'v_count', stat, np.float32)
        add('v_' + label, by_v, 'u_count', stat, np.float32)

    # Ego-level statistics.
    # NOTE(review): the `ego_*_x2` columns are computed from `x1`, not `x2`.
    # Kept as-is because downstream cells select these column names; confirm
    # whether the source column or the label is the intended one.
    for stat in ('mean', 'max', 'median'):
        add(f'ego_{stat}_count_v', by_ego, 'v_count', stat, np.float32)
        add(f'ego_{stat}_count_u', by_ego, 'u_count', stat, np.float32)
        add(f'ego_{stat}_t', by_ego, 't', stat, np.float32)
        add(f'ego_{stat}_x2', by_ego, 'x1', stat, np.float32)


  

In [9]:
# Add the aggregate feature columns to `train` in place (the function returns nothing).
aggregations(train)

In [10]:
# Boolean flags: both endpoints share the same city / school / university,
# and the two values are not both the -1 placeholder.
for flag, attr in (
    ('same_city', 'city_id'),
    ('same_school', 'school'),
    ('same_univ', 'university'),
):
    train[flag] = (train[f'{attr}_x'] == train[f'{attr}_y']) & (
        (train[f'{attr}_x'] != -1) | (train[f'{attr}_y'] != -1)
    )

In [11]:
# The raw id columns are no longer needed once the same_* flags exist.
train = train.drop(
    columns=['city_id_y', 'city_id_x', 'school_y', 'school_x', 'university_x', 'university_y']
)

In [12]:
# Keep only rows with a known target `x1`; these are the rows used for fitting.
train_dr = train.dropna(subset=['x1'])

# Train

In [14]:
# Hold out 10% of the labelled rows for validation. The target `x1` and the
# node ids `u` / `v` are excluded from the feature matrix.
# NOTE(review): this is a row-level split, while several features are
# aggregates over whole ego networks (some derived from `x1`), so the
# validation score may be optimistic — consider a split grouped by ego_id.
X_train, X_val, y_train, y_val = train_test_split(
    train_dr.drop(columns=['x1', 'u', 'v']),
    train_dr['x1'],
    test_size=0.1,
    random_state=42,
)

In [16]:
# Show the feature columns (and their order) that go into the model.
X_train.columns

Index(['ego_id', 't', 'x2', 'x3', 'age_x', 'sex_x', 'age_y', 'sex_y',
       'u_mean_x2', 'v_mean_x2', 'u_mean_age_x', 'v_mean_age_x',
       'u_mean_age_y', 'v_mean_age_y', 'u_median_x2', 'v_median_x2',
       'u_median_age_x', 'v_median_age_x', 'u_median_age_y', 'v_median_age_y',
       'u_max_x2', 'v_max_x2', 'u_max_age_x', 'v_max_age_x', 'u_max_age_y',
       'v_max_age_y', 'u_std_x2', 'v_std_x2', 'u_std_age_x', 'v_std_age_x',
       'u_std_age_y', 'v_std_age_y', 'u_mean_time', 'v_mean_time',
       'u_median_time', 'v_median_time', 'u_std_time', 'v_std_time', 'u_count',
       'v_count', 'u_count_mean', 'v_count_mean', 'u_mean_median',
       'v_mean_median', 'u_mean_std', 'v_mean_std', 'u_mean_max', 'v_mean_max',
       'ego_mean_count_v', 'ego_mean_count_u', 'ego_mean_t', 'ego_mean_x2',
       'ego_max_count_v', 'ego_max_count_u', 'ego_max_t', 'ego_max_x2',
       'ego_median_count_v', 'ego_median_count_u', 'ego_median_t',
       'ego_median_x2', 'same_city', 'same_school', 'sam

In [15]:
# Fit a GPU CatBoost regressor on the training split, tracking RMSE on the
# held-out validation split.
cat_features = ['ego_id', 'sex_x', 'sex_y','same_city', 'same_school', 'same_univ']
model = CatBoostRegressor(
    eval_metric='RMSE',
    task_type='GPU',
    random_state=42,
    learning_rate=0.5,
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
)

0:	learn: 1.0251150	test: 1.0258482	best: 1.0258482 (0)	total: 553ms	remaining: 9m 12s
1:	learn: 0.8956167	test: 0.8959092	best: 0.8959092 (1)	total: 1.07s	remaining: 8m 53s
2:	learn: 0.8432755	test: 0.8440954	best: 0.8440954 (2)	total: 1.56s	remaining: 8m 39s
3:	learn: 0.8215391	test: 0.8232847	best: 0.8232847 (3)	total: 2.07s	remaining: 8m 35s
4:	learn: 0.8092928	test: 0.8114355	best: 0.8114355 (4)	total: 2.6s	remaining: 8m 36s
5:	learn: 0.8027236	test: 0.8050622	best: 0.8050622 (5)	total: 3.15s	remaining: 8m 41s
6:	learn: 0.7982243	test: 0.8004677	best: 0.8004677 (6)	total: 3.66s	remaining: 8m 39s
7:	learn: 0.7951425	test: 0.7973594	best: 0.7973594 (7)	total: 4.16s	remaining: 8m 35s
8:	learn: 0.7922564	test: 0.7944660	best: 0.7944660 (8)	total: 4.7s	remaining: 8m 37s
9:	learn: 0.7899917	test: 0.7922096	best: 0.7922096 (9)	total: 5.26s	remaining: 8m 41s
10:	learn: 0.7878495	test: 0.7900162	best: 0.7900162 (10)	total: 5.76s	remaining: 8m 37s
11:	learn: 0.7860370	test: 0.7881411	best: 

<catboost.core.CatBoostRegressor at 0x7efe70fa12d0>

In [None]:
# Drop the large training frames and force a garbage-collection pass before
# the prediction stage loads its own data.
del X_train, y_train, X_val, y_val, train, train_dr
gc.collect()

0

In [17]:
# Save the fitted model to Google Drive.
model.save_model('/content/drive/MyDrive/blend/catboost_new3.cbm')

# Predict

In [15]:
# Reload the model for inference.
# The path now matches the file written by the training section
# ('catboost_new3.cbm'); the previous 'catboost_new.cbm' pointed at a different,
# older artifact, so a full re-run produced 'mysubm3.csv' from a stale model.
model = CatBoostRegressor(eval_metric='RMSE', task_type='GPU', random_state=42)
model.load_model('/content/drive/MyDrive/blend/catboost_new3.cbm')

<catboost.core.CatBoostRegressor at 0x7fa632fb9c30>

In [18]:
# Load the pre-built feature table for the rows to be predicted.
# NOTE(review): this parquet file is not produced anywhere in this notebook —
# presumably it was built with the same feature steps as above; confirm.
subm_merge = pd.read_parquet('/content/drive/MyDrive/blend/subm_merge.parquet')

In [20]:
# Predict using the feature list stored in the model itself (training order)
# instead of a hand-copied list of 63 names that has to be kept in sync with
# the training cell manually.
missing_features = [name for name in model.feature_names_ if name not in subm_merge.columns]
assert not missing_features, f"subm_merge is missing model features: {missing_features}"
pred = model.predict(subm_merge[model.feature_names_])

In [21]:
# Store predictions clipped to [0, 30].
# Bracket assignment always creates or overwrites the column; attribute
# assignment (`subm_merge.x1 = ...`) only works when an `x1` column already
# exists and otherwise sets a plain Python attribute on the frame.
subm_merge['x1'] = pred.clip(0, 30)

In [22]:
# Write the submission file: edge identifiers plus the predicted x1, no index column.
subm_merge.to_csv('mysubm3.csv', columns=['ego_id', 'u', 'v', 'x1'], index=False)

In [23]:
# Copy the submission file to Google Drive so it outlives the Colab runtime.
# NOTE(review): `-r` and `-d` look unnecessary for a single regular file — confirm and drop.
!cp -r /content/mysubm3.csv -d /content/drive/MyDrive/blend/mysubm3.csv