# Импорты

In [141]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier # Катбустер
from sklearn.model_selection import train_test_split # Для разделения выборок
from sklearn.metrics import log_loss, accuracy_score # Метрики
import ast # Abstract Syntax Trees
from sklearn.ensemble import RandomForestClassifier # Рандомный лес
from sklearn.linear_model import LogisticRegression # Логистическая регрессия
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score # Метрики

# Подгрузка

In [142]:
geo_info = pd.read_csv('./VK_data/geo_info.csv', sep=';')
referer_vectors = pd.read_csv('./VK_data/referer_vectors.csv', sep=';')
test_users = pd.read_csv('./VK_data/test_users.csv', sep=';')
test = pd.read_csv('./VK_data/test.csv', sep=';')
train_labels = pd.read_csv('./VK_data/train_labels.csv', sep=';')
train = pd.read_csv('./VK_data/train.csv', sep=';')

# Работа с датафреймами

## train

In [143]:
# Объединяем датафреймы
train = train.merge(geo_info, on='geo_id', how='left')
train = train.merge(train_labels, on='user_id', how='left')
train = train.merge(referer_vectors, on='referer', how='left')

In [144]:
# Извлечение информации о браузере, ОС и их версиях
def safe_literal_eval(x):
    if pd.isna(x):
        return {}
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return {}

train['parsed_user_agent'] = train['user_agent'].apply(safe_literal_eval)
train['browser'] = train['parsed_user_agent'].apply(lambda x: x.get('browser', None))
train['browser_version'] = train['parsed_user_agent'].apply(lambda x: x.get('browser_version', None))
train['os'] = train['parsed_user_agent'].apply(lambda x: x.get('os', None))
train['os_version'] = train['parsed_user_agent'].apply(lambda x: x.get('os_version', None))

train.drop(columns=['parsed_user_agent'], inplace=True)

In [145]:
# Список колонок векторов
vector_columns = ['component0', 'component1', 'component2', 'component3', 'component4',
       'component5', 'component6', 'component7', 'component8', 'component9']

In [146]:
# Смена типа данных
for col in vector_columns:
    train[col] = train[col].astype(str)

In [147]:
train.dtypes # Проверим что смена типа данных прошла успешно

request_ts           int64
user_id             object
referer             object
geo_id               int64
user_agent          object
country_id          object
region_id           object
timezone_id         object
target             float64
component0          object
component1          object
component2          object
component3          object
component4          object
component5          object
component6          object
component7          object
component8          object
component9          object
browser             object
browser_version     object
os                  object
os_version          object
dtype: object

In [148]:
train.columns

Index(['request_ts', 'user_id', 'referer', 'geo_id', 'user_agent',
       'country_id', 'region_id', 'timezone_id', 'target', 'component0',
       'component1', 'component2', 'component3', 'component4', 'component5',
       'component6', 'component7', 'component8', 'component9', 'browser',
       'browser_version', 'os', 'os_version'],
      dtype='object')

In [149]:
train.head()

Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent,country_id,region_id,timezone_id,target,component0,...,component4,component5,component6,component7,component8,component9,browser,browser_version,os,os_version
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'...",c31b4e,470e75,f6155e,0.0,11731,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,119.0.0,Android,10
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'...",c31b4e,44520b,e56e80,0.0,11731,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,111.0.0,Android,10
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version...",c31b4e,616bb9,af47f1,0.0,12498,...,11608,3106,-2188,10573,3347,21870,Yandex Browser,20.12.5,Android,11
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'...",c31b4e,3c9dca,e56e80,0.0,11731,...,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,119.0.0,Android,10
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version...",c31b4e,776e76,10b7947,0.0,11731,...,-8992,9381,-3496,-3120,-899,16817,Yandex Browser,18.11.1,Android,4.4.4


In [150]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 759972 entries, 0 to 759971
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   request_ts       759972 non-null  int64  
 1   user_id          759972 non-null  object 
 2   referer          759972 non-null  object 
 3   geo_id           759972 non-null  int64  
 4   user_agent       759971 non-null  object 
 5   country_id       759972 non-null  object 
 6   region_id        697990 non-null  object 
 7   timezone_id      759972 non-null  object 
 8   target           601290 non-null  float64
 9   component0       759972 non-null  object 
 10  component1       759972 non-null  object 
 11  component2       759972 non-null  object 
 12  component3       759972 non-null  object 
 13  component4       759972 non-null  object 
 14  component5       759972 non-null  object 
 15  component6       759972 non-null  object 
 16  component7       759972 non-null  obje

In [151]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   request_ts  150000 non-null  int64 
 1   user_id     150000 non-null  object
 2   referer     150000 non-null  object
 3   geo_id      150000 non-null  int64 
 4   user_agent  150000 non-null  object
dtypes: int64(2), object(3)
memory usage: 5.7+ MB


In [152]:
# Избавимся от ненужных теперь столбцов
train = train.drop(['geo_id', 'user_agent', 'referer'], axis=1)

In [153]:
# train = train.dropna() 

In [154]:
train.head()

Unnamed: 0,request_ts,user_id,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,browser,browser_version,os,os_version
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,c31b4e,470e75,f6155e,0.0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,119.0.0,Android,10
1,1700986581,46a5f128fd569c764a92c2eaa788095e,c31b4e,44520b,e56e80,0.0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,111.0.0,Android,10
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,c31b4e,616bb9,af47f1,0.0,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870,Yandex Browser,20.12.5,Android,11
3,1700992803,af735816ca19115431ae3d89518c8c91,c31b4e,3c9dca,e56e80,0.0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,Chrome Mobile,119.0.0,Android,10
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,c31b4e,776e76,10b7947,0.0,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,Yandex Browser,18.11.1,Android,4.4.4


In [155]:
# Избавимся от ненужных теперь столбцов
train = train.drop(['browser_version', 'os_version'], axis=1)

In [None]:
train.head()

## test

In [156]:
# Объединяем датафреймы
test = test.merge(geo_info, on='geo_id', how='left')
test = test.merge(train_labels, on='user_id', how='left')
test = test.merge(referer_vectors, on='referer', how='left')

In [157]:
# Извлечение информации о браузере, ОС и их версиях
def safe_literal_eval(x):
    if pd.isna(x):
        return {}
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return {}

test['parsed_user_agent'] = test['user_agent'].apply(safe_literal_eval)
test['browser'] = test['parsed_user_agent'].apply(lambda x: x.get('browser', None))
test['browser_version'] = test['parsed_user_agent'].apply(lambda x: x.get('browser_version', None))
test['os'] = test['parsed_user_agent'].apply(lambda x: x.get('os', None))
test['os_version'] = test['parsed_user_agent'].apply(lambda x: x.get('os_version', None))

test.drop(columns=['parsed_user_agent'], inplace=True)

In [158]:
# Список колонок векторов
vector_columns = ['component0', 'component1', 'component2', 'component3', 'component4',
       'component5', 'component6', 'component7', 'component8', 'component9']

In [159]:
# Смена типа данных
for col in vector_columns:
    test[col] = test[col].astype(str)

In [160]:
#test = test.dropna()

In [161]:
# Избавимся от ненужных теперь столбцов
test = test.drop(['geo_id', 'user_agent', 'referer'], axis=1)

In [162]:
# Избавимся от ненужных теперь столбцов
test = test.drop(['browser_version', 'os_version'], axis=1)

# Предобработка

In [163]:
object_columns = train.select_dtypes(include=['object']).columns
object_columns

Index(['user_id', 'country_id', 'region_id', 'timezone_id', 'component0',
       'component1', 'component2', 'component3', 'component4', 'component5',
       'component6', 'component7', 'component8', 'component9', 'browser',
       'os'],
      dtype='object')

In [164]:
train.isna().sum()

request_ts          0
user_id             0
country_id          0
region_id       61982
timezone_id         0
target         158682
component0          0
component1          0
component2          0
component3          0
component4          0
component5          0
component6          0
component7          0
component8          0
component9          0
browser             1
os                  1
dtype: int64

In [165]:
cat_features = ['country_id', 'region_id', 'timezone_id', 'os', 'browser']
train[cat_features] = train[cat_features].replace({None: 'Other', '': 'Other'})
test[cat_features] = test[cat_features].fillna('Other')

In [166]:
train.isna().sum()

request_ts          0
user_id             0
country_id          0
region_id           0
timezone_id         0
target         158682
component0          0
component1          0
component2          0
component3          0
component4          0
component5          0
component6          0
component7          0
component8          0
component9          0
browser             0
os                  0
dtype: int64

In [167]:
train = train.dropna()

In [168]:
train.isna().sum()

request_ts     0
user_id        0
country_id     0
region_id      0
timezone_id    0
target         0
component0     0
component1     0
component2     0
component3     0
component4     0
component5     0
component6     0
component7     0
component8     0
component9     0
browser        0
os             0
dtype: int64

In [169]:
train.dtypes

request_ts       int64
user_id         object
country_id      object
region_id       object
timezone_id     object
target         float64
component0      object
component1      object
component2      object
component3      object
component4      object
component5      object
component6      object
component7      object
component8      object
component9      object
browser         object
os              object
dtype: object

In [170]:
test.isna().sum()

request_ts          0
user_id             0
country_id          0
region_id           0
timezone_id         0
target         143138
component0          0
component1          0
component2          0
component3          0
component4          0
component5          0
component6          0
component7          0
component8          0
component9          0
browser             0
os                  0
dtype: int64

In [171]:
test = train.dropna()
test.isna().sum()

request_ts     0
user_id        0
country_id     0
region_id      0
timezone_id    0
target         0
component0     0
component1     0
component2     0
component3     0
component4     0
component5     0
component6     0
component7     0
component8     0
component9     0
browser        0
os             0
dtype: int64

In [172]:
# Сменим тип данных с float
#train['target'] = train['target'].astype(int)
#test['target'] = test['target'].astype(int)

In [173]:
train[object_columns] = train[object_columns].apply(pd.Categorical)

In [174]:
train[object_columns] = train[object_columns].apply(lambda x: x.cat.codes)

In [175]:
print(train.dtypes)

request_ts       int64
user_id          int32
country_id       int16
region_id        int16
timezone_id      int16
target         float64
component0       int16
component1       int16
component2       int16
component3       int16
component4       int16
component5       int16
component6       int16
component7       int16
component8       int16
component9       int32
browser           int8
os                int8
dtype: object


In [176]:
train.head()

Unnamed: 0,request_ts,user_id,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,browser,os
0,1701011363,491215,103,116,205,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
1,1700986581,138243,103,111,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
2,1701011071,176874,103,160,96,0.0,6099,22995,9295,11207,9182,19068,2460,12669,20864,25074,56,0
3,1700992803,342422,103,99,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
4,1701021666,106200,103,200,22,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,56,0


In [177]:
object_columns_test = test.select_dtypes(include=['object']).columns
test[object_columns_test] = test[object_columns_test].apply(pd.Categorical)
test[object_columns_test] = test[object_columns_test].apply(lambda x: x.cat.codes)


In [178]:
train.head()

Unnamed: 0,request_ts,user_id,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,browser,os
0,1701011363,491215,103,116,205,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
1,1700986581,138243,103,111,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
2,1701011071,176874,103,160,96,0.0,6099,22995,9295,11207,9182,19068,2460,12669,20864,25074,56,0
3,1700992803,342422,103,99,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
4,1701021666,106200,103,200,22,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,56,0


In [179]:
test.head()

Unnamed: 0,request_ts,user_id,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,browser,os
0,1701011363,491215,103,116,205,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
1,1700986581,138243,103,111,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
2,1701011071,176874,103,160,96,0.0,6099,22995,9295,11207,9182,19068,2460,12669,20864,25074,56,0
3,1700992803,342422,103,99,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
4,1701021666,106200,103,200,22,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,56,0


# Разделение выборок

In [180]:
X = train.drop(columns=['target'])
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [181]:
print(X_train.dtypes)

request_ts     int64
user_id        int32
country_id     int16
region_id      int16
timezone_id    int16
component0     int16
component1     int16
component2     int16
component3     int16
component4     int16
component5     int16
component6     int16
component7     int16
component8     int16
component9     int32
browser         int8
os              int8
dtype: object


In [182]:
X_train.dtypes

request_ts     int64
user_id        int32
country_id     int16
region_id      int16
timezone_id    int16
component0     int16
component1     int16
component2     int16
component3     int16
component4     int16
component5     int16
component6     int16
component7     int16
component8     int16
component9     int32
browser         int8
os              int8
dtype: object

# Catbooster

In [183]:
model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           random_seed=42,
                           loss_function='Logloss',
                           od_type='Iter',
                           use_best_model=False,
                           od_wait=500,
                           verbose=100)

model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=250)

0:	learn: 0.6757048	test: 0.6756228	best: 0.6756228 (0)	total: 66.2ms	remaining: 1m 6s
100:	learn: 0.5049127	test: 0.5034425	best: 0.5034425 (100)	total: 6.5s	remaining: 57.9s
200:	learn: 0.4826258	test: 0.4822665	best: 0.4822665 (200)	total: 12.8s	remaining: 50.7s
300:	learn: 0.4701938	test: 0.4709715	best: 0.4709715 (300)	total: 19.2s	remaining: 44.6s
400:	learn: 0.4623658	test: 0.4642933	best: 0.4642933 (400)	total: 25.5s	remaining: 38.1s
500:	learn: 0.4558533	test: 0.4589120	best: 0.4589120 (500)	total: 32.1s	remaining: 32s
600:	learn: 0.4509262	test: 0.4551383	best: 0.4551383 (600)	total: 38.4s	remaining: 25.5s
700:	learn: 0.4468106	test: 0.4523175	best: 0.4523175 (700)	total: 44.1s	remaining: 18.8s
800:	learn: 0.4430988	test: 0.4497279	best: 0.4497279 (800)	total: 50.3s	remaining: 12.5s
900:	learn: 0.4396310	test: 0.4472889	best: 0.4472889 (900)	total: 56.8s	remaining: 6.24s
999:	learn: 0.4365781	test: 0.4453110	best: 0.4453110 (999)	total: 1m 2s	remaining: 0us

bestTest = 0.4453

<catboost.core.CatBoostClassifier at 0x29149138190>

In [184]:
# Получение лучших метрик
best_metrics = model.get_best_score()

print("Лучшие метрики:")
for metric_name, value in best_metrics.items():
    print(f"{metric_name}: {value}")

# Метрики на валидационной выборке
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred_class = model.predict(X_val)

print("\nМетрики на валидационной выборке:")
print(f"AUC-ROC: {roc_auc_score(y_val, y_pred_proba)}")
print(f"Precision: {precision_score(y_val, y_pred_class)}")
print(f"Recall: {recall_score(y_val, y_pred_class)}")
print(f"F1-score: {f1_score(y_val, y_pred_class)}")

Лучшие метрики:
learn: {'Logloss': 0.4365780878632235}
validation: {'Logloss': 0.4453109612725933}

Метрики на валидационной выборке:
AUC-ROC: 0.8734354239255968
Precision: 0.8053728935416558
Recall: 0.7971950288134416
F1-score: 0.8012630954561711


In [185]:
test.head()

Unnamed: 0,request_ts,user_id,country_id,region_id,timezone_id,target,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,browser,os
0,1701011363,491215,103,116,205,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
1,1700986581,138243,103,111,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
2,1701011071,176874,103,160,96,0.0,6099,22995,9295,11207,9182,19068,2460,12669,20864,25074,56,0
3,1700992803,342422,103,99,179,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,7,0
4,1701021666,106200,103,200,22,0.0,5280,24890,17644,1661,7145,26016,3924,5568,10317,19591,56,0


In [186]:
y_test = test['target'] 
X_test = test.drop(columns='target')
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] 

In [187]:
# Вычисление метрик
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Вывод результатов
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

Accuracy: 0.8155
Precision: 0.8113
Recall: 0.8007
F1 Score: 0.8060
ROC AUC: 0.8780
