In [3]:
import pandas as pd 
import zipfile
import datetime
import numpy as np

%matplotlib inline

# Итак, наша задача: научить модель предсказывать пользователей, прошедших через турникеты, то есть это задача мультиклассовой классификации

# начнем работу с нашими данными

In [4]:
train = pd.read_csv("data/train.csv")
train

Unnamed: 0.1,Unnamed: 0,user_id,ts,gate_id
0,0,18,2022-07-29 09:08:54,7
1,1,18,2022-07-29 09:09:54,9
2,2,18,2022-07-29 09:09:54,9
3,3,18,2022-07-29 09:10:06,5
4,4,18,2022-07-29 09:10:08,5
...,...,...,...,...
37513,37513,6,2022-12-31 20:38:56,11
37514,37514,6,2022-12-31 20:39:22,6
37515,37515,6,2022-12-31 20:39:23,6
37516,37516,6,2022-12-31 20:39:31,9


In [5]:
test = pd.read_csv("data/test.csv")
test

Unnamed: 0.1,Unnamed: 0,ts,gate_id,user_word
0,37518,2023-01-03 08:21:00,9,gini
1,37519,2023-01-03 08:21:00,9,gini
2,37520,2023-01-03 08:21:18,5,gini
3,37521,2023-01-03 08:21:19,5,gini
4,37522,2023-01-03 08:21:39,10,gini
...,...,...,...,...
7120,44638,2023-02-24 19:43:36,11,collinear
7121,44639,2023-02-24 19:44:00,4,collinear
7122,44640,2023-02-24 19:44:01,4,collinear
7123,44641,2023-02-24 19:44:09,9,collinear


In [6]:
#объединим данные и удалим ненужный сьтолбец Unnamed: 0)
data = pd.concat([train, test], axis = 0)
data = data.drop(columns = ["Unnamed: 0"])
data

Unnamed: 0,user_id,ts,gate_id,user_word
0,18.0,2022-07-29 09:08:54,7,
1,18.0,2022-07-29 09:09:54,9,
2,18.0,2022-07-29 09:09:54,9,
3,18.0,2022-07-29 09:10:06,5,
4,18.0,2022-07-29 09:10:08,5,
...,...,...,...,...
7120,,2023-02-24 19:43:36,11,collinear
7121,,2023-02-24 19:44:00,4,collinear
7122,,2023-02-24 19:44:01,4,collinear
7123,,2023-02-24 19:44:09,9,collinear


# теперь нам надо подготовить данные для того,чтобы можель их "воспринимала". Для этого: 
## 1.категориальные переменные gate_id,час и день недели закодируем
## 2.разобьем ts на дату(дальше мы выясним день недели) и время(преобразуем его в ко-во секунд с начала суток)

In [7]:
data["date"] = data.ts.str.split(' ').str[0]
data["time"] = data.ts.str.split(' ').str[1]

In [8]:
data

Unnamed: 0,user_id,ts,gate_id,user_word,date,time
0,18.0,2022-07-29 09:08:54,7,,2022-07-29,09:08:54
1,18.0,2022-07-29 09:09:54,9,,2022-07-29,09:09:54
2,18.0,2022-07-29 09:09:54,9,,2022-07-29,09:09:54
3,18.0,2022-07-29 09:10:06,5,,2022-07-29,09:10:06
4,18.0,2022-07-29 09:10:08,5,,2022-07-29,09:10:08
...,...,...,...,...,...,...
7120,,2023-02-24 19:43:36,11,collinear,2023-02-24,19:43:36
7121,,2023-02-24 19:44:00,4,collinear,2023-02-24,19:44:00
7122,,2023-02-24 19:44:01,4,collinear,2023-02-24,19:44:01
7123,,2023-02-24 19:44:09,9,collinear,2023-02-24,19:44:09


In [9]:
#используем дату для получения дня недели
data["date"] = pd.to_datetime(data["date"])
data["day_of_week"] = data["date"].dt.weekday

In [10]:
data["time"] = pd.to_timedelta(data["time"])
data['time_seconds'] = data['time'].dt.total_seconds()

In [11]:
data = data.drop(columns = ["date", "time"])
data

Unnamed: 0,user_id,ts,gate_id,user_word,day_of_week,time_seconds
0,18.0,2022-07-29 09:08:54,7,,4,32934.0
1,18.0,2022-07-29 09:09:54,9,,4,32994.0
2,18.0,2022-07-29 09:09:54,9,,4,32994.0
3,18.0,2022-07-29 09:10:06,5,,4,33006.0
4,18.0,2022-07-29 09:10:08,5,,4,33008.0
...,...,...,...,...,...,...
7120,,2023-02-24 19:43:36,11,collinear,4,71016.0
7121,,2023-02-24 19:44:00,4,collinear,4,71040.0
7122,,2023-02-24 19:44:01,4,collinear,4,71041.0
7123,,2023-02-24 19:44:09,9,collinear,4,71049.0


In [12]:
min_time = data["time_seconds"].min()
min_time

3172.0

In [13]:
data["time_seconds"] = data["time_seconds"] - min_time
data["time_seconds"] = data["time_seconds"].astype(int)
data

Unnamed: 0,user_id,ts,gate_id,user_word,day_of_week,time_seconds
0,18.0,2022-07-29 09:08:54,7,,4,29762
1,18.0,2022-07-29 09:09:54,9,,4,29822
2,18.0,2022-07-29 09:09:54,9,,4,29822
3,18.0,2022-07-29 09:10:06,5,,4,29834
4,18.0,2022-07-29 09:10:08,5,,4,29836
...,...,...,...,...,...,...
7120,,2023-02-24 19:43:36,11,collinear,4,67844
7121,,2023-02-24 19:44:00,4,collinear,4,67868
7122,,2023-02-24 19:44:01,4,collinear,4,67869
7123,,2023-02-24 19:44:09,9,collinear,4,67877


In [14]:
#разобьем наши данные обратно на train и test, user_word у train NaN ->
#можем оттолкнуться от этого(или наоборот от user_id в test == Nan)
train_idx = data["user_word"].isnull()
train = data.loc[train_idx]
test = data.loc[~train_idx]

In [15]:
#проверим размерность изначально test имел 37518 строк
#кол-во столбцов мы увеличили в ходе подготовки данных->не обращаем внимание 
#на их число 
train = train.drop(columns = ["user_word","ts"])
train

Unnamed: 0,user_id,gate_id,day_of_week,time_seconds
0,18.0,7,4,29762
1,18.0,9,4,29822
2,18.0,9,4,29822
3,18.0,5,4,29834
4,18.0,5,4,29836
...,...,...,...,...
37513,6.0,11,5,71164
37514,6.0,6,5,71190
37515,6.0,6,5,71191
37516,6.0,9,5,71199


In [16]:
#проверим размерность изначально test имел 7125 строк
#кол-во столбцов мы увеличили в ходе подготовки данных->не обращаем внимание 
#на их число 
test = test.drop(columns = ["user_id","ts"], axis = 1)
test

Unnamed: 0,gate_id,user_word,day_of_week,time_seconds
0,9,gini,1,26888
1,9,gini,1,26888
2,5,gini,1,26906
3,5,gini,1,26907
4,10,gini,1,26927
...,...,...,...,...
7120,11,collinear,4,67844
7121,4,collinear,4,67868
7122,4,collinear,4,67869
7123,9,collinear,4,67877


# У нас нет необходимости кодировать категориальные пременные, тк мы используем catboost. Достаточно просто указать номер их столбцов

In [17]:
cat_features = list((1,2))
cat_features

[1, 2]

In [18]:
from catboost.utils import create_cd
feature_names = dict()
for column, name in enumerate(train):
    if column == 0 or column == 1:
        continue
    feature_names[column-1] = name

create_cd(
    label = 0,
    cat_features = cat_features,
    feature_names = feature_names,
)

# Теперь мы можем разделить данные на признаки(X_) и целевую переменную(y_). Здесь всё ясно: надо предсказать user_id => это и есть целевая переменная, остальное признаки

In [19]:
X = train.drop(columns = ["user_id"])
X

Unnamed: 0,gate_id,day_of_week,time_seconds
0,7,4,29762
1,9,4,29822
2,9,4,29822
3,5,4,29834
4,5,4,29836
...,...,...,...
37513,11,5,71164
37514,6,5,71190
37515,6,5,71191
37516,9,5,71199


In [20]:
y = train["user_id"].astype(int)
y

0        18
1        18
2        18
3        18
4        18
         ..
37513     6
37514     6
37515     6
37516     6
37517     6
Name: user_id, Length: 37518, dtype: int32

### У нас уже есть test, но вот val нет, выделим её из train части. Так же нам надо провести нормализацию данных, чтобы избавиться от выбросов 

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.77, random_state=42)

In [22]:
#нормализуем время, чтобы избавится от выбросов
X_train["time_seconds"] = (X_train["time_seconds"] - X_train["time_seconds"].mean()) / X_train["time_seconds"].std()
X_val["time_seconds"] = (X_val["time_seconds"] - X_val["time_seconds"].mean()) / X_val["time_seconds"].std()

In [23]:
X_train

Unnamed: 0,gate_id,day_of_week,time_seconds
28106,9,5,1.155898
16084,5,0,-0.211524
4953,4,3,1.352398
32998,4,4,1.657030
26473,5,4,0.627849
...,...,...,...
16850,3,2,0.234158
6265,3,1,0.180399
11284,3,3,-0.886852
860,7,0,0.178861


In [24]:
X_val

Unnamed: 0,gate_id,day_of_week,time_seconds
15824,4,4,1.359357
7599,11,4,-0.176380
7954,3,0,-1.018624
25905,4,2,1.510635
35108,3,1,0.919525
...,...,...,...
971,4,0,0.936972
892,5,0,0.393610
32737,6,3,1.666810
22324,7,2,0.368359


In [25]:
y_train

28106    14
16084    12
4953     33
32998    49
26473    14
         ..
16850    53
6265     55
11284     9
860      12
15795    53
Name: user_id, Length: 28888, dtype: int32

In [26]:
y_val

15824    32
7599     54
7954     33
25905    54
35108    12
         ..
971       9
892      15
32737    33
22324    19
14640    29
Name: user_id, Length: 8630, dtype: int32

In [27]:
from catboost import Pool, CatBoostClassifier
train_pool = Pool(data = X_train, label = y_train, cat_features = [0,1]) 
val_pool = Pool(data = X_val, label = y_val, cat_features = [0,1]) 

In [28]:
model_clf = CatBoostClassifier(task_type = "GPU", iterations = 1000,learning_rate=0.5,loss_function='MultiClassOneVsAll',custom_loss = 'Accuracy')
model_clf.fit(train_pool, verbose = False, plot=True,eval_set = (X_train,y_train))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1e08ec23e10>

In [29]:
model_clf.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,day_of_week,34.557099
1,gate_id,32.984216
2,time_seconds,32.458685


In [30]:
model_clf.tree_count_

1000

In [31]:
y_pred = model_clf.predict(train_pool)
y_val_pred = model_clf.predict(val_pool)

In [32]:
# Сделаем словарь для val, набодобие того, который нужно предсказать

user_dict_val = dict() 
inverse_user_dict_val = dict()

y_val_list = list(y_val.unique())
for i in range(len(y_val_list)):
    user_dict_val[y_val_list[i]] = 'user_'+str(i)
    inverse_user_dict_val['user_'+str(i)] = y_val_list[i]

In [33]:
# Сделаем словарь для train, набодобие того, который нужно предсказать

user_dict_train = dict() 
inverse_user_dict_train = dict()

y_train_list = list(y.unique())
for i in range(len(y_train_list)):
    user_dict_train[y_train_list[i]] = 'train_user_'+str(i)
    inverse_user_dict_train['train_user_'+str(i)] = y_train_list[i]

In [34]:
user_dict_val

{32: 'user_0',
 54: 'user_1',
 33: 'user_2',
 12: 'user_3',
 19: 'user_4',
 9: 'user_5',
 49: 'user_6',
 17: 'user_7',
 3: 'user_8',
 37: 'user_9',
 23: 'user_10',
 55: 'user_11',
 0: 'user_12',
 6: 'user_13',
 50: 'user_14',
 53: 'user_15',
 20: 'user_16',
 47: 'user_17',
 56: 'user_18',
 11: 'user_19',
 39: 'user_20',
 29: 'user_21',
 40: 'user_22',
 34: 'user_23',
 48: 'user_24',
 18: 'user_25',
 15: 'user_26',
 43: 'user_27',
 46: 'user_28',
 14: 'user_29',
 1: 'user_30',
 42: 'user_31',
 26: 'user_32',
 24: 'user_33',
 27: 'user_34',
 35: 'user_35',
 57: 'user_36',
 31: 'user_37',
 25: 'user_38',
 10: 'user_39',
 45: 'user_40',
 41: 'user_41',
 7: 'user_42',
 30: 'user_43',
 22: 'user_44',
 36: 'user_45',
 28: 'user_46',
 5: 'user_47',
 44: 'user_48',
 2: 'user_49',
 8: 'user_50',
 52: 'user_51',
 38: 'user_52',
 51: 'user_53',
 4: 'user_54'}

In [35]:
inverse_user_dict_val

{'user_0': 32,
 'user_1': 54,
 'user_2': 33,
 'user_3': 12,
 'user_4': 19,
 'user_5': 9,
 'user_6': 49,
 'user_7': 17,
 'user_8': 3,
 'user_9': 37,
 'user_10': 23,
 'user_11': 55,
 'user_12': 0,
 'user_13': 6,
 'user_14': 50,
 'user_15': 53,
 'user_16': 20,
 'user_17': 47,
 'user_18': 56,
 'user_19': 11,
 'user_20': 39,
 'user_21': 29,
 'user_22': 40,
 'user_23': 34,
 'user_24': 48,
 'user_25': 18,
 'user_26': 15,
 'user_27': 43,
 'user_28': 46,
 'user_29': 14,
 'user_30': 1,
 'user_31': 42,
 'user_32': 26,
 'user_33': 24,
 'user_34': 27,
 'user_35': 35,
 'user_36': 57,
 'user_37': 31,
 'user_38': 25,
 'user_39': 10,
 'user_40': 45,
 'user_41': 41,
 'user_42': 7,
 'user_43': 30,
 'user_44': 22,
 'user_45': 36,
 'user_46': 28,
 'user_47': 5,
 'user_48': 44,
 'user_49': 2,
 'user_50': 8,
 'user_51': 52,
 'user_52': 38,
 'user_53': 51,
 'user_54': 4}

In [36]:
# Превратим y_val (y) в напободие того, что нужно предсказывать

y_val_word = y_val.copy()

for y_val_id in y_val_list:
    y_val_word.loc[y_val == y_val_id] = user_dict_val[y_val_id]
    
y_train_word = y_train.copy()

for y_train_id in y_train_list:
    y_train_word.loc[y == y_train_id] = user_dict_train[y_train_id]

In [37]:
val_words = pd.DataFrame()

val_words['word'] = y_val_word
val_words['true'] = y_val
val_words['preds'] = y_val_pred

train_words = pd.DataFrame()

train_words['word'] = y_train_word
train_words['true'] = y_train
train_words['preds'] = y_pred

In [38]:
val_words

Unnamed: 0,word,true,preds
15824,user_0,32,37
7599,user_1,54,55
7954,user_2,33,37
25905,user_1,54,54
35108,user_3,12,23
...,...,...,...
971,user_5,9,48
892,user_26,15,11
32737,user_2,33,33
22324,user_4,19,53


In [39]:
# Предскажем самым частотным user_id

comp_df = pd.DataFrame(val_words.groupby('word')['preds'].agg(lambda x: x.value_counts().index[0]))

comp_df_train = pd.DataFrame(train_words.groupby('word')['preds'].agg(lambda x: x.value_counts().index[0]))

In [40]:
val_words.groupby('word')['preds'].count()

word
user_0     243
user_1     245
user_10     54
user_11    506
user_12    292
user_13    416
user_14    227
user_15    303
user_16     34
user_17    315
user_18     30
user_19    280
user_2     307
user_20    245
user_21    236
user_22     56
user_23     51
user_24    160
user_25    379
user_26    397
user_27     22
user_28    120
user_29    164
user_3     480
user_30    324
user_31     71
user_32     72
user_33     96
user_34    137
user_35    141
user_36     97
user_37     45
user_38     60
user_39      6
user_4     430
user_40      9
user_41     36
user_42     12
user_43      3
user_44     28
user_45     11
user_46     13
user_47      1
user_48      1
user_49     11
user_5     230
user_50      5
user_51      2
user_52      2
user_53      1
user_54      1
user_6     306
user_7     142
user_8     226
user_9     549
Name: preds, dtype: int64

In [41]:
for idx in comp_df.index:
    comp_df.loc[idx, 'true'] = inverse_user_dict_val[idx]
    
comp_df = comp_df.astype(int)

for idx in comp_df_train.index:
    comp_df_train.loc[idx, 'true'] = inverse_user_dict_train[idx]
    
comp_df_train = comp_df_train.astype(int)

In [42]:
comp_df

Unnamed: 0_level_0,preds,true
word,Unnamed: 1_level_1,Unnamed: 2_level_1
user_0,32,32
user_1,54,54
user_10,11,23
user_11,55,55
user_12,0,0
user_13,6,6
user_14,32,50
user_15,55,53
user_16,37,20
user_17,47,47


In [43]:
comp_df['comp'] = comp_df['preds'] == comp_df['true']

comp_df_train['comp'] = comp_df_train['preds'] == comp_df_train['true']

In [44]:
# Веса юзеров мы не знаем, давайте возьмем равные веса для простоты

comp_df['norm'] = 1

comp_df_train['norm'] = 1

In [45]:
true_answers = (comp_df['comp'] * comp_df['norm']).sum()
total_answers = comp_df['norm'].sum()
precent_true = round((true_answers/total_answers)*100, 1)

In [46]:
print('Оценка val', true_answers, total_answers, precent_true)

Оценка val 23 55 41.8


In [47]:
true_answers_train = (comp_df_train['comp'] * comp_df_train['norm']).sum()
total_answers_train = comp_df_train['norm'].sum()
precent_true_train = round((true_answers_train/total_answers_train)*100, 1)
print('Оценка train', true_answers_train, total_answers_train, precent_true_train)

Оценка train 50 56 89.3


In [48]:
X_test = test.drop(columns = ["user_word"])
X_test = X_test[X_test.columns].astype(int)
X_test["time_seconds"] = (X_test["time_seconds"] - X_test["time_seconds"].mean()) / X_test["time_seconds"].std()
X_test

Unnamed: 0,gate_id,day_of_week,time_seconds
0,9,1,-1.697222
1,9,1,-1.697222
2,5,1,-1.695901
3,5,1,-1.695828
4,10,1,-1.694360
...,...,...,...
7120,11,4,1.307948
7121,4,4,1.309709
7122,4,4,1.309783
7123,9,4,1.310370


In [49]:
y_test = train["user_id"].astype(int)
y_test

0        18
1        18
2        18
3        18
4        18
         ..
37513     6
37514     6
37515     6
37516     6
37517     6
Name: user_id, Length: 37518, dtype: int32

In [50]:
test_pool = Pool(data = X_test, label = None,cat_features= [0,1]) 

In [51]:
y_test_pred = model_clf.predict(test_pool)

In [52]:
test_words = pd.DataFrame()

test_words['word'] = test['user_word']
test_words['preds'] = y_test_pred

In [53]:
test_words

Unnamed: 0,word,preds
0,gini,47
1,gini,47
2,gini,3
3,gini,3
4,gini,18
...,...,...
7120,collinear,50
7121,collinear,49
7122,collinear,49
7123,collinear,33


In [54]:
comp_df_test = pd.DataFrame(test_words.groupby('word')['preds'].agg(lambda x: x.value_counts().index[0]))

In [55]:
comp_df_test

Unnamed: 0_level_0,preds
word,Unnamed: 1_level_1
aucroc,12
binary,12
blue,18
categorical,14
coefficient,15
collinear,12
distributed,11
epsilon,15
f1,37
fit,1


In [56]:
comp_df_test.to_csv('answer.csv') 

In [57]:
import zipfile

with zipfile.ZipFile('answer.zip', 'w') as zip_file:
    zip_file.write('answer.csv')