### На LB данное решение дает ~ 0.74. Признаков всего 9 (это минимум 1/10 от возможных).

In [1]:
import pandas as pd
import numpy as np
from sksurv.ensemble import RandomSurvivalForest
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

In [2]:
def merge_by_concat(df1, df2, merge_on):
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1

### Подгружаем данные. Формируем датасет.

In [3]:
!ls './Data'

clients.csv	 report_dates.csv	  transactions.csv.zip
currency_rk.csv  sample_submit_naive.csv
mcc_codes.csv	 train.csv


In [4]:
clients = pd.read_csv('./Data/clients.csv')

In [5]:
clients.head()

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age
0,3,2,ОТ 101 ДО 500,0,3
1,9,1,БОЛЕЕ 1001,0,3
2,13,6,ОТ 501 ДО 1000,0,2
3,37,5,БОЛЕЕ 1001,0,2
4,41,1,ОТ 101 ДО 500,0,2


In [6]:
report_dates = pd.read_csv('./Data/report_dates.csv', parse_dates=['report_dt'])

In [7]:
report_dates.head()

Unnamed: 0,report,report_dt
0,1,2022-07-31 03:00:00
1,2,2022-08-31 03:00:00
2,3,2022-09-30 03:00:00
3,4,2022-10-31 03:00:00
4,5,2022-11-30 03:00:00


In [8]:
%%time
transactions = pd.read_csv('./Data/transactions.csv.zip', 
                           parse_dates=['transaction_dttm'], 
                           low_memory=False, compression='zip')

CPU times: user 15.4 s, sys: 693 ms, total: 16.1 s
Wall time: 16.1 s


In [9]:
transactions.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,3,3,1,-183.883957,2022-01-28 12:05:33
1,3,3,1,-3206.437012,2022-01-28 12:52:30
2,3,16,1,-153866.890625,2022-02-16 14:45:56
3,3,56,1,-15144.601562,2022-03-09 19:58:29
4,3,0,1,5297.908691,2022-03-12 18:11:31


In [10]:
clients=merge_by_concat(clients, report_dates, ['report'])

In [11]:
clients

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age,report_dt
0,3,2,ОТ 101 ДО 500,0,3,2022-08-31 03:00:00
1,9,1,БОЛЕЕ 1001,0,3,2022-07-31 03:00:00
2,13,6,ОТ 501 ДО 1000,0,2,2022-12-31 03:00:00
3,37,5,БОЛЕЕ 1001,0,2,2022-11-30 03:00:00
4,41,1,ОТ 101 ДО 500,0,2,2022-07-31 03:00:00
...,...,...,...,...,...,...
95995,562043,12,,0,2,2023-06-30 03:00:00
95996,562205,12,,0,1,2023-06-30 03:00:00
95997,562312,12,,0,0,2023-06-30 03:00:00
95998,562721,12,,0,2,2023-06-30 03:00:00


In [12]:
transactions=merge_by_concat(transactions, clients[['user_id','report_dt']], ['user_id'])

In [13]:
transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,report_dt
0,3,3,1,-183.883957,2022-01-28 12:05:33,2022-08-31 03:00:00
1,3,3,1,-3206.437012,2022-01-28 12:52:30,2022-08-31 03:00:00
2,3,16,1,-153866.890625,2022-02-16 14:45:56,2022-08-31 03:00:00
3,3,56,1,-15144.601562,2022-03-09 19:58:29,2022-08-31 03:00:00
4,3,0,1,5297.908691,2022-03-12 18:11:31,2022-08-31 03:00:00
...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,2023-06-30 03:00:00
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,2023-06-30 03:00:00
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,2023-06-30 03:00:00
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,2023-06-30 03:00:00


In [14]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13075023 entries, 0 to 13075022
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   user_id           int64         
 1   mcc_code          int64         
 2   currency_rk       int64         
 3   transaction_amt   float64       
 4   transaction_dttm  datetime64[ns]
 5   report_dt         datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(3)
memory usage: 598.5 MB


### Кодим новые признаки

In [17]:
# Количество транзакций, по клиентам
count_trans = transactions['user_id'].value_counts().to_frame().reset_index().\
rename(columns={"index":"user_id", "user_id":"count_trans"}).sort_values(["user_id"]).reset_index(drop=True)

In [18]:
count_trans

Unnamed: 0,user_id,count_trans
0,3,11
1,9,90
2,13,22
3,37,315
4,41,16
...,...,...
95995,562043,37
95996,562205,151
95997,562312,56
95998,562721,85


In [19]:
count_trans['count_trans'].describe()

count    96000.000000
mean       136.198156
std        148.584868
min         11.000000
25%         35.000000
50%         82.000000
75%        186.000000
max       1497.000000
Name: count_trans, dtype: float64

In [20]:
# Сумма транзакций, по клиентам
sum_trans = transactions.groupby(['user_id'])[['transaction_amt']].sum().reset_index().\
rename(columns={"transaction_amt":"sum_trans"}).sort_values(["user_id"]).reset_index(drop=True)

In [21]:
sum_trans

Unnamed: 0,user_id,sum_trans
0,3,13706.416641
1,9,-323434.666813
2,13,-124717.379150
3,37,-331859.599463
4,41,-108586.614166
...,...,...
95995,562043,-29581.256115
95996,562205,-40491.908630
95997,562312,-18537.821270
95998,562721,-164004.761685


In [22]:
sum_trans['sum_trans'].describe()

count    9.600000e+04
mean    -1.399246e+05
std      3.687130e+05
min     -1.795695e+07
25%     -2.168482e+05
50%     -8.570759e+04
75%     -6.480743e+03
max      1.134800e+07
Name: sum_trans, dtype: float64

In [23]:
# Сумма положительный транзакций
sum_trans_p = transactions[transactions['transaction_amt']>0].groupby(['user_id'])[['transaction_amt']].sum().\
reset_index().rename(columns={"transaction_amt":"sum_trans_p"}).sort_values(["user_id"]).reset_index(drop=True)

In [24]:
sum_trans_p

Unnamed: 0,user_id,sum_trans_p
0,3,186108.229797
1,13,128766.684326
2,37,10738.788574
3,42,72779.679138
4,46,5408.328873
...,...,...
69462,561824,1939.841522
69463,561908,936792.233398
69464,562205,1968.906334
69465,562721,34391.163893


In [25]:
sum_trans_p['sum_trans_p'].describe()

count    6.946700e+04
mean     1.261222e+05
std      2.924621e+05
min      1.092291e+00
25%      7.560175e+03
50%      3.728342e+04
75%      1.278209e+05
max      1.262594e+07
Name: sum_trans_p, dtype: float64

In [26]:
# Сумма отрицательных транзакций
sum_trans_m = transactions[transactions['transaction_amt']<0].groupby(['user_id'])[['transaction_amt']].sum().\
reset_index().rename(columns={"transaction_amt":"sum_trans_m"}).sort_values(["user_id"]).reset_index(drop=True)

In [27]:
sum_trans_m

Unnamed: 0,user_id,sum_trans_m
0,3,-172401.813156
1,9,-323434.666813
2,13,-253484.063477
3,37,-342598.388037
4,41,-108586.614166
...,...,...
95555,562043,-29581.256115
95556,562205,-42460.814964
95557,562312,-18537.821270
95558,562721,-198395.925579


In [28]:
sum_trans_m['sum_trans_m'].describe()

count    9.556000e+04
mean    -2.322529e+05
std      3.721220e+05
min     -1.810122e+07
25%     -2.819604e+05
50%     -1.387938e+05
75%     -5.155015e+04
max     -1.883282e+00
Name: sum_trans_m, dtype: float64

In [29]:
# Количество дней от транзакции до отчета, по транзакциям
transactions['diff_days'] = (transactions['report_dt']-transactions['transaction_dttm']) / np.timedelta64(1, 'D')

In [30]:
transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,report_dt,diff_days
0,3,3,1,-183.883957,2022-01-28 12:05:33,2022-08-31 03:00:00,214.621146
1,3,3,1,-3206.437012,2022-01-28 12:52:30,2022-08-31 03:00:00,214.588542
2,3,16,1,-153866.890625,2022-02-16 14:45:56,2022-08-31 03:00:00,195.509769
3,3,56,1,-15144.601562,2022-03-09 19:58:29,2022-08-31 03:00:00,174.292720
4,3,0,1,5297.908691,2022-03-12 18:11:31,2022-08-31 03:00:00,171.367002
...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,2023-06-30 03:00:00,101.630451
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,2023-06-30 03:00:00,101.617801
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,2023-06-30 03:00:00,101.462766
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,2023-06-30 03:00:00,101.461933


In [31]:
# Максимальная дата до отчета, по клиентам
max_day_trans = transactions.groupby(['user_id'])[['diff_days']].max().reset_index().\
rename(columns={"diff_days":"max_day_trans"}).sort_values(["user_id"]).reset_index(drop=True)

In [32]:
max_day_trans

Unnamed: 0,user_id,max_day_trans
0,3,214.621146
1,9,283.587488
2,13,282.616204
3,37,283.736042
4,41,256.469896
...,...,...
95995,562043,266.598877
95996,562205,280.398981
95997,562312,280.244144
95998,562721,280.883727


In [33]:
max_day_trans['max_day_trans'].describe()

count    96000.000000
mean       275.015360
std         18.260865
min        104.550081
25%        275.370174
50%        281.393802
75%        283.303084
max        283.750000
Name: max_day_trans, dtype: float64

In [34]:
# Минимальная дата до отчета, по клиентам
min_day_trans = transactions.groupby(['user_id'])[['diff_days']].min().reset_index().\
rename(columns={"diff_days":"min_day_trans"}).sort_values(["user_id"]).reset_index(drop=True)

In [35]:
min_day_trans

Unnamed: 0,user_id,min_day_trans
0,3,108.264062
1,9,102.289907
2,13,114.257708
3,37,104.713507
4,41,103.262766
...,...,...
95995,562043,142.873935
95996,562205,102.048947
95997,562312,122.261134
95998,562721,105.999363


In [36]:
min_day_trans['min_day_trans'].describe()

count    96000.000000
mean       108.122144
std         11.094320
min        101.250000
25%        101.662975
50%        103.546522
75%        109.354031
max        191.473646
Name: min_day_trans, dtype: float64

### Собираем все признаки

In [37]:
clients=merge_by_concat(clients, count_trans, ['user_id'])

In [38]:
clients=merge_by_concat(clients, sum_trans, ['user_id'])

In [39]:
clients=merge_by_concat(clients, sum_trans_p, ['user_id'])

In [40]:
clients=merge_by_concat(clients, sum_trans_m, ['user_id'])

In [41]:
clients=merge_by_concat(clients, max_day_trans, ['user_id'])

In [42]:
clients=merge_by_concat(clients, min_day_trans, ['user_id'])

In [43]:
clients

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age,report_dt,count_trans,sum_trans,sum_trans_p,sum_trans_m,max_day_trans,min_day_trans
0,3,2,ОТ 101 ДО 500,0,3,2022-08-31 03:00:00,11,13706.416641,186108.229797,-172401.813156,214.621146,108.264062
1,9,1,БОЛЕЕ 1001,0,3,2022-07-31 03:00:00,90,-323434.666813,,-323434.666813,283.587488,102.289907
2,13,6,ОТ 501 ДО 1000,0,2,2022-12-31 03:00:00,22,-124717.379150,128766.684326,-253484.063477,282.616204,114.257708
3,37,5,БОЛЕЕ 1001,0,2,2022-11-30 03:00:00,315,-331859.599463,10738.788574,-342598.388037,283.736042,104.713507
4,41,1,ОТ 101 ДО 500,0,2,2022-07-31 03:00:00,16,-108586.614166,,-108586.614166,256.469896,103.262766
...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,12,,0,2,2023-06-30 03:00:00,37,-29581.256115,,-29581.256115,266.598877,142.873935
95996,562205,12,,0,1,2023-06-30 03:00:00,151,-40491.908630,1968.906334,-42460.814964,280.398981,102.048947
95997,562312,12,,0,0,2023-06-30 03:00:00,56,-18537.821270,,-18537.821270,280.244144,122.261134
95998,562721,12,,0,2,2023-06-30 03:00:00,85,-164004.761685,34391.163893,-198395.925579,280.883727,105.999363


### Заполняем пропуски и кодируем признаки

In [44]:
clients['sum_trans_p'] = clients['sum_trans_p'].fillna(0)
clients['sum_trans_m'] = clients['sum_trans_m'].fillna(0)
clients['employee_count_nm'] = clients['employee_count_nm'].fillna(0)

In [45]:
clients['employee_count_nm'].unique()

array(['ОТ 101 ДО 500', 'БОЛЕЕ 1001', 'ОТ 501 ДО 1000', 'ДО 10', 0,
       'ОТ 11 ДО 50', 'ОТ 51 ДО 100', 'БОЛЕЕ 500', 'ОТ 11 ДО 30',
       'ОТ 31 ДО 50'], dtype=object)

In [46]:
clients.replace({'employee_count_nm':{'ОТ 101 ДО 500':1,'БОЛЕЕ 1001':2,'ОТ 501 ДО 1000':3,'ДО 10':4,
                                      'ОТ 11 ДО 50':5,'ОТ 51 ДО 100':6,'БОЛЕЕ 500':7,'ОТ 11 ДО 30':8,
                                      'ОТ 31 ДО 50':9}}, inplace=True)

In [47]:
clients['employee_count_nm'].value_counts()

0    36466
2    17833
1    14362
6     7314
3     6481
5     4413
4     3797
7     1996
8     1871
9     1467
Name: employee_count_nm, dtype: int64

In [48]:
clients['employee_count_nm'] = clients['employee_count_nm'].astype(np.int16)

### Делим данные на train и test

In [49]:
train = pd.read_csv('./Data/train.csv')

In [50]:
train

Unnamed: 0,user_id,target,time
0,3,0,77
1,13,0,86
2,37,0,89
3,41,0,57
4,42,0,84
...,...,...,...
63995,561824,0,91
63996,562043,0,75
63997,562312,0,91
63998,562721,0,29


In [51]:
clients.columns

Index(['user_id', 'report', 'employee_count_nm', 'bankemplstatus',
       'customer_age', 'report_dt', 'count_trans', 'sum_trans', 'sum_trans_p',
       'sum_trans_m', 'max_day_trans', 'min_day_trans'],
      dtype='object')

In [52]:
clients_columns = ['user_id', 'bankemplstatus', 'customer_age', 'count_trans', 'sum_trans', 'sum_trans_p',
                   'sum_trans_m', 'max_day_trans', 'min_day_trans', 'employee_count_nm']

In [53]:
train_columns = ['bankemplstatus', 'customer_age', 'count_trans', 'sum_trans', 'sum_trans_p',
                   'sum_trans_m', 'max_day_trans', 'min_day_trans', 'employee_count_nm']

In [54]:
df = merge_by_concat(clients[clients_columns], train, ['user_id'])

In [55]:
df

Unnamed: 0,user_id,bankemplstatus,customer_age,count_trans,sum_trans,sum_trans_p,sum_trans_m,max_day_trans,min_day_trans,employee_count_nm,target,time
0,3,0,3,11,13706.416641,186108.229797,-172401.813156,214.621146,108.264062,1,0.0,77.0
1,9,0,3,90,-323434.666813,0.000000,-323434.666813,283.587488,102.289907,2,,
2,13,0,2,22,-124717.379150,128766.684326,-253484.063477,282.616204,114.257708,3,0.0,86.0
3,37,0,2,315,-331859.599463,10738.788574,-342598.388037,283.736042,104.713507,2,0.0,89.0
4,41,0,2,16,-108586.614166,0.000000,-108586.614166,256.469896,103.262766,1,0.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,0,2,37,-29581.256115,0.000000,-29581.256115,266.598877,142.873935,0,0.0,75.0
95996,562205,0,1,151,-40491.908630,1968.906334,-42460.814964,280.398981,102.048947,0,,
95997,562312,0,0,56,-18537.821270,0.000000,-18537.821270,280.244144,122.261134,0,0.0,91.0
95998,562721,0,2,85,-164004.761685,34391.163893,-198395.925579,280.883727,105.999363,0,0.0,29.0


### Целевая переменная

In [56]:
df['time'] = df['time'].fillna(-1)
df['time'] = df['time'].astype(np.int32)

In [57]:
df['target'] = df['target'].fillna(-1)
df['target'] = df['target'].astype(np.int8)

In [58]:
X = df[df['time']!=-1][train_columns].copy()
y = df[df['time']!=-1][['target', 'time']].copy()

In [64]:
y['target'] = y['target'].astype(bool)

In [65]:
aux = [(e1,e2) for e1,e2 in  np.array(y)]

In [70]:
y = np.array(aux, dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [72]:
y_train

array([( True, 30.), (False, 86.), (False, 91.), ..., ( True, 20.),
       (False, 77.), (False, 91.)],
      dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

In [73]:
#help(RandomSurvivalForest)

### Тренировка 

In [93]:
%%time
rsf = RandomSurvivalForest(
    n_estimators=250, max_depth=8, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=333, 
    verbose=2, low_memory=True
)
rsf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 250
building tree 2 of 250
building tree 3 of 250
building tree 4 of 250
building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250
building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250
building tree 29 of 250


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   32.3s


building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250
building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 250
building tree 47 of 250
building tree 48 of 250
building tree 49 of 250
building tree 50 of 250
building tree 51 of 250
building tree 52 of 250
building tree 53 of 250
building tree 54 of 250
building tree 55 of 250
building tree 56 of 250
building tree 57 of 250
building tree 58 of 250
building tree 59 of 250
building tree 60 of 250
building tree 61 of 250
building tree 62 of 250
building tree 63 of 250
building tree 64 of 250
building tree 65 of 250
building tree 66 of 250
building tree 67 of 250
building tree 68 of 250
building tree 69 of 250
building tree 70 of 250
building tree 71

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.3min


building tree 151 of 250
building tree 152 of 250
building tree 153 of 250
building tree 154 of 250
building tree 155 of 250
building tree 156 of 250
building tree 157 of 250
building tree 158 of 250
building tree 159 of 250
building tree 160 of 250
building tree 161 of 250
building tree 162 of 250
building tree 163 of 250
building tree 164 of 250
building tree 165 of 250
building tree 166 of 250
building tree 167 of 250
building tree 168 of 250
building tree 169 of 250
building tree 170 of 250
building tree 171 of 250
building tree 172 of 250
building tree 173 of 250
building tree 174 of 250
building tree 175 of 250
building tree 176 of 250
building tree 177 of 250
building tree 178 of 250
building tree 179 of 250
building tree 180 of 250
building tree 181 of 250
building tree 182 of 250
building tree 183 of 250
building tree 184 of 250
building tree 185 of 250
building tree 186 of 250
building tree 187 of 250
building tree 188 of 250
building tree 189 of 250
building tree 190 of 250


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  5.9min finished


In [94]:
rsf.score(X_test, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    0.0s finished


0.7396798119909609

### Важность признаков

In [96]:
result = permutation_importance(rsf, X_test, y_test, n_repeats=3, random_state=33)
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Do

Unnamed: 0,importances_mean,importances_std
min_day_trans,0.05883716,0.001719456
employee_count_nm,0.04897188,0.004062839
customer_age,0.02326863,0.0006795993
sum_trans_m,0.02057574,0.002945689
sum_trans,0.01430488,0.001859042
count_trans,0.01244419,0.001811689
max_day_trans,0.008563378,0.0010398
sum_trans_p,0.005001936,0.0004638295
bankemplstatus,8.054046e-07,5.28828e-07


### Решение

In [97]:
X_pred = df[df['time']==-1][train_columns].copy()
X_pred

Unnamed: 0,bankemplstatus,customer_age,count_trans,sum_trans,sum_trans_p,sum_trans_m,max_day_trans,min_day_trans,employee_count_nm
1,0,3,90,-323434.666813,0.000000,-323434.666813,283.587488,102.289907,2
9,0,3,67,32793.204498,122847.822327,-90054.617828,260.055984,105.952083,5
10,0,1,12,49056.019875,52405.939453,-3349.919579,253.615683,104.469097,4
17,0,1,427,-572952.872180,112552.514648,-685505.386828,280.338704,101.447153,2
20,0,2,50,-49969.439034,0.000000,-49969.439034,282.725127,134.985833,0
...,...,...,...,...,...,...,...,...,...
95988,0,3,99,-71254.860472,0.000000,-71254.860472,276.356921,106.102477,0
95990,0,3,72,-3495.118294,69579.185341,-73074.303635,282.555150,111.490914,0
95993,0,2,39,-717608.803839,0.000000,-717608.803839,283.235301,105.251678,0
95994,0,2,56,778253.475967,936792.233398,-158538.757431,258.524664,101.367234,0


In [98]:
predictions = rsf.predict(X_pred)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 250 out of 250 | elapsed:    0.1s finished


In [99]:
predictions

array([1.01415056, 2.18783468, 2.32293071, ..., 3.01195427, 3.47605313,
       3.84247404])

In [100]:
submit = df[df['time']==-1][['user_id']].copy()

In [101]:
submit['predict'] = predictions

In [102]:
submit

Unnamed: 0,user_id,predict
1,9,1.014151
9,61,2.187835
10,62,2.322931
17,80,0.657391
20,88,10.925604
...,...,...
95988,561362,4.489318
95990,561419,4.746076
95993,561895,3.011954
95994,561908,3.476053


In [103]:
submit.to_csv('submission.csv',index=False)