Домашнее задание
1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score



In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    models_results['F1'].append(f1 * 100.0)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    models_results['ROC'].append(roc * 100.0)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    models_results['Recall'].append(rec * 100.0)
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    models_results['Precision'].append(prc * 100.0)
    print("precision: %.2f%%" % (prc * 100.0))     

**Breast Cancer Coimbra Data Set**

Ссылка на датасет: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Coimbra

In [4]:
DATASET_PATH = './dataR2.csv'

In [5]:
df = pd.read_csv(DATASET_PATH)
df.head(3)

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1


In [6]:
print(df.shape)

(116, 10)


In [7]:
df['Classification'].value_counts()

2    64
1    52
Name: Classification, dtype: int64

В описании к датасету дано, что 1 - это здоровый человек, 2 - пациент. Разметим данный признак с помощью бинарных переменных.

In [8]:
df['Is_desease'] = 0
df.loc[df['Classification'] == 2, 'Is_desease'] = 1
df.head(3)

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification,Is_desease
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1,0
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1,0
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1,0


In [9]:
df.columns

Index(['Age', 'BMI', 'Glucose', 'Insulin', 'HOMA', 'Leptin', 'Adiponectin',
       'Resistin', 'MCP.1', 'Classification', 'Is_desease'],
      dtype='object')

In [10]:
features = ['Age', 'BMI', 'Glucose', 'Insulin', 'HOMA', 'Leptin', 'Adiponectin',
       'Resistin', 'MCP.1']

In [11]:
x_data = df[features]
y_data = df['Is_desease']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

models_results = {
    'model': [],
    'F1': [],
    'ROC': [],
    'Recall': [],
    'Precision': [],
}

**Random Forest Classifier**

In [12]:
model = RandomForestClassifier(random_state=42)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [13]:
evaluate_results(y_test, y_predict)
models_results['model'].append('RandomForestClassifier')

Classification results:
f1: 81.25%
roc: 73.95%
recall: 76.47%
precision: 86.67%


**Random Negative Sampling**

In [14]:
mod_data = df[[*features, 'Is_desease']]
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 16/64 as positives and unlabeling the rest


In [15]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    100
 1     16
Name: class_test, dtype: int64


In [16]:
mod_data.head(10)

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Is_desease,class_test
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,0,-1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,0,-1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,0,-1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,0,-1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,0,-1
5,49,22.854458,92,3.226,0.732087,6.8317,13.67975,10.3176,530.41,0,-1
6,89,22.7,77,4.69,0.890787,6.964,5.589865,12.9361,1256.083,0,-1
7,76,23.8,118,6.47,1.883201,4.311,13.25132,5.1042,280.694,0,-1
8,73,22.0,97,3.35,0.801543,4.47,10.358725,6.28445,136.855,0,-1
9,75,23.0,83,4.952,1.013839,17.127,11.57899,7.0913,318.302,0,-1


In [17]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [18]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(16, 11) (16, 11)


In [19]:
model = RandomForestClassifier(random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)
models_results['model'].append('RandomNegativeSampling_25%')

Classification results:
f1: 66.04%
roc: 58.52%
recall: 87.50%
precision: 53.03%


In [20]:
pd.DataFrame(data=models_results).sort_values('F1', ascending=False)

Unnamed: 0,model,F1,ROC,Recall,Precision
0,RandomForestClassifier,81.25,73.94958,76.470588,86.666667
1,RandomNegativeSampling_25%,66.037736,58.522727,87.5,53.030303


**Лучшей моделью оказался RandomForestClassifier до PU learning**

Попробуем разные варианты доли позитивного класса

In [21]:
share_P = [0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5]

In [22]:
for s in share_P:
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(s * len(pos_ind)))
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    x_data = mod_data.iloc[:,:-2].values 
    y_labeled = mod_data.iloc[:,-1].values
    y_positive = mod_data.iloc[:,-2].values
    mod_data = mod_data.sample(frac=1)
    neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
    sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
    pos_sample = mod_data[mod_data['class_test']==1]    
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    model = RandomForestClassifier(random_state=42)
    model.fit(sample_train.iloc[:,:-2].values, 
              sample_train.iloc[:,-2].values)
    y_predict = model.predict(sample_test.iloc[:,:-2].values)
    evaluate_results(sample_test.iloc[:,-2].values, y_predict)
    model_name = 'RandomNegativeSampling' + '_' + str(int(s * 100)) + '%'
    models_results['model'].append(model_name)
    
pd.DataFrame(data=models_results).sort_values('F1', ascending=False)

Classification results:
f1: 68.29%
roc: 50.00%
recall: 100.00%
precision: 51.85%
Classification results:
f1: 69.44%
roc: 54.63%
recall: 92.59%
precision: 55.56%
Classification results:
f1: 74.81%
roc: 63.59%
recall: 96.08%
precision: 61.25%
Classification results:
f1: 62.99%
roc: 49.70%
recall: 93.02%
precision: 47.62%
Classification results:
f1: 69.39%
roc: 62.22%
recall: 94.44%
precision: 54.84%
Classification results:
f1: 78.38%
roc: 77.72%
recall: 87.88%
precision: 70.73%
Classification results:
f1: 64.94%
roc: 63.87%
recall: 96.15%
precision: 49.02%
Classification results:
f1: 71.88%
roc: 72.92%
recall: 95.83%
precision: 57.50%
Classification results:
f1: 63.33%
roc: 64.69%
recall: 95.00%
precision: 47.50%


Unnamed: 0,model,F1,ROC,Recall,Precision
0,RandomForestClassifier,81.25,73.94958,76.470588,86.666667
7,RandomNegativeSampling_35%,78.378378,77.723178,87.878788,70.731707
4,RandomNegativeSampling_15%,74.80916,63.594771,96.078431,61.25
9,RandomNegativeSampling_45%,71.875,72.916667,95.833333,57.5
3,RandomNegativeSampling_10%,69.444444,54.62963,92.592593,55.555556
6,RandomNegativeSampling_30%,69.387755,62.222222,94.444444,54.83871
2,RandomNegativeSampling_5%,68.292683,50.0,100.0,51.851852
1,RandomNegativeSampling_25%,66.037736,58.522727,87.5,53.030303
8,RandomNegativeSampling_40%,64.935065,63.866397,96.153846,49.019608
10,RandomNegativeSampling_50%,63.333333,64.6875,95.0,47.5


Результаты получились случайные, вывод о увеличении или уменьшении качества моделей в зависимости от доли Р сделать трудно.

Думаю, наиболее предпочтителен на практике  2-step approach