!pip install pandas numpy sklearn catboost

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import catboost as ctb


### Задание 1

Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php

#### Решение

!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'

In [2]:
import zipfile
archive = 'bank.zip'
with zipfile.ZipFile(archive, 'r') as zip_file:
    zip_file.extract('bank-full.csv', '.')

In [3]:
df = pd.read_csv('bank-full.csv', sep=';')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
df.y.replace(('yes', 'no'), (1, 0), inplace=True)
df.y.value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [6]:
df.rename({'y': 'target'}, axis=1, inplace=True)

 Имеем дисбаланс классов целевой переменной

### Задание 2

*Сделать feature engineering

#### Решение

In [7]:
for i in ['housing', 'loan', 'default']:
    df[i].replace(('yes', 'no'), (1, 0), inplace=True)
    
df = df.drop(columns=['poutcome', 'day', 'contact'])

df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,month,duration,campaign,pdays,previous,target
0,58,management,married,tertiary,0,2143,1,0,may,261,1,-1,0,0
1,44,technician,single,secondary,0,29,1,0,may,151,1,-1,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,may,76,1,-1,0,0
3,47,blue-collar,married,unknown,0,1506,1,0,may,92,1,-1,0,0
4,33,unknown,single,unknown,0,1,0,0,may,198,1,-1,0,0
5,35,management,married,tertiary,0,231,1,0,may,139,1,-1,0,0
6,28,management,single,tertiary,0,447,1,1,may,217,1,-1,0,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,may,380,1,-1,0,0
8,58,retired,married,primary,0,121,1,0,may,50,1,-1,0,0
9,43,technician,single,secondary,0,593,1,0,may,55,1,-1,0,0


In [8]:
# Делим на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], stratify=df['target'], test_size=0.3, random_state=42)

### Задание 3

Обучить любой классификатор (какой вам нравится)

#### Решение

In [9]:
cat_feats = ['job', 'marital', 'education', 'month']

model = ctb.CatBoostClassifier(cat_features=cat_feats)
model.fit(X_train, y_train, verbose=False)

y_pred = model.predict(X_test)
    
f1 = f1_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='binary')
rec = recall_score(y_test, y_pred, average='binary')

metrics = pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]})

In [10]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.492776,0.687669,0.621285,0.408318


### Задание 4

Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть

#### Решение

In [11]:
df_sample = df.copy()

pos_ind = df_sample.loc[df_sample['target'] == 1].sample(frac=0.2).index
unlab_ind = df_sample[~df.index.isin(pos_ind)].index

df_sample.loc[pos_ind, 'is_labeled'] = 1
df_sample.loc[unlab_ind, 'is_labeled'] = 0
df_sample['is_labeled'] = df_sample['is_labeled'].astype(int)



In [12]:
df_sample.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,month,duration,campaign,pdays,previous,target,is_labeled
0,58,management,married,tertiary,0,2143,1,0,may,261,1,-1,0,0,0
1,44,technician,single,secondary,0,29,1,0,may,151,1,-1,0,0,0
2,33,entrepreneur,married,secondary,0,2,1,1,may,76,1,-1,0,0,0
3,47,blue-collar,married,unknown,0,1506,1,0,may,92,1,-1,0,0,0
4,33,unknown,single,unknown,0,1,0,0,may,198,1,-1,0,0,0
5,35,management,married,tertiary,0,231,1,0,may,139,1,-1,0,0,0
6,28,management,single,tertiary,0,447,1,1,may,217,1,-1,0,0,0
7,42,entrepreneur,divorced,tertiary,1,2,1,0,may,380,1,-1,0,0,0
8,58,retired,married,primary,0,121,1,0,may,50,1,-1,0,0,0
9,43,technician,single,secondary,0,593,1,0,may,55,1,-1,0,0,0


### Задание 5

Применить random negative sampling для построения классификатора в новых условиях

#### Решение

In [13]:
rns_df = df_sample.sample(frac=1)

pos_sample = rns_df[rns_df['is_labeled'] == 1]
neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]

train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]

In [14]:
model = ctb.CatBoostClassifier(cat_features=cat_feats)
model.fit(train_samples.iloc[:, :-2],
          train_samples['is_labeled'], verbose=False)

<catboost.core.CatBoostClassifier at 0x130698d00>

In [15]:
y_pred = model.predict(test_samples.iloc[:, :-2])
    
f1 = f1_score(test_samples['target'], y_pred)
roc = roc_auc_score(test_samples['target'], y_pred)
prec = precision_score(test_samples['target'], y_pred, average='binary')
rec = recall_score(test_samples['target'], y_pred, average='binary')

In [16]:
metrics = pd.concat((metrics, pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]})))

### Задание 6

Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

#### Решение

In [17]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.492776,0.687669,0.621285,0.408318
0,0.473168,0.829183,0.329352,0.839942


По roc-auc RNS оказался лучше, при этом у RNS сильно увеличился recall и упал precision.

### Задание 7

*Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

#### Решение

In [19]:
rns_metrics = pd.DataFrame(columns=['f1', 'roc-auc', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.9, 9)

for frac in fracs:
    df_sample = df.copy()

    pos_ind = df_sample.loc[df_sample['target'] == 1].sample(frac=frac).index
    unlab_ind = df_sample[~df.index.isin(pos_ind)].index

    df_sample.loc[pos_ind, 'is_labeled'] = 1
    df_sample.loc[unlab_ind, 'is_labeled'] = 0
    df_sample['is_labeled'] = df_sample['is_labeled'].astype(int)

    rns_df = df_sample.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]

    model = ctb.CatBoostClassifier(cat_features=cat_feats)
    model.fit(train_samples.iloc[:, :-2],
              train_samples['is_labeled'], verbose=False)
    
    y_pred = model.predict(test_samples.iloc[:, :-2])
    
    f1 = f1_score(test_samples['target'], y_pred)
    roc = roc_auc_score(test_samples['target'], y_pred)
    prec = precision_score(test_samples['target'], y_pred, average='binary')
    rec = recall_score(test_samples['target'], y_pred, average='binary')

    rns_metrics = pd.concat((rns_metrics, pd.DataFrame({'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]})))

In [20]:
rns_metrics.index = fracs
rns_metrics

Unnamed: 0,f1,roc-auc,precision,recall
0.1,0.466158,0.82144,0.319434,0.862179
0.2,0.476928,0.83585,0.330594,0.855693
0.3,0.450007,0.837297,0.30548,0.854091
0.4,0.423326,0.836245,0.28264,0.842871
0.5,0.3723,0.837096,0.237788,0.8572
0.6,0.33089,0.841905,0.204741,0.862016
0.7,0.270468,0.84618,0.159798,0.879755
0.8,0.205879,0.847748,0.116737,0.870934
0.9,0.116516,0.842714,0.0625,0.858351


Roc-auc меняется незначительно. F1 выше всего при сэмплинге 10-20%.