In [1]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('citrus.csv')
data.tail()

Unnamed: 0,name,diameter,weight,red,green,blue
9995,grapefruit,15.35,253.89,149,77,20
9996,grapefruit,15.41,254.67,148,68,7
9997,grapefruit,15.59,256.5,168,82,20
9998,grapefruit,15.92,260.14,142,72,11
9999,grapefruit,16.45,261.51,152,74,2


In [14]:
data.iloc[:, 0].value_counts()

0    5000
1    5000
Name: name, dtype: int64

Заменим name на бинарную переменную

In [11]:
data['name'].replace('grapefruit', 1, inplace=True)

In [13]:
data['name'].replace('orange', 0, inplace=True)

In [15]:
data.head()

Unnamed: 0,name,diameter,weight,red,green,blue
0,0,2.96,86.76,172,85,2
1,0,3.91,88.05,166,78,3
2,0,4.42,95.17,156,81,2
3,0,4.47,95.6,163,81,4
4,0,4.48,95.76,161,72,9


In [21]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,1:]
y_data = data.iloc[:,0]

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [25]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(X_train, y_train)
y_predict = model.predict(X_test)





In [26]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 98.36%
roc: 98.35%
recall: 99.40%
precision: 97.35%


Представим, что нам неизвестны негативы и часть позитивов

In [27]:
mod_data = data.copy()
pos_ind = np.where(mod_data.iloc[:,0].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1250/5000 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [29]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    8750
 1    1250
Name: class_test, dtype: int64


In [33]:
x_data = mod_data.iloc[:,1:-1].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,0].values # original class

In [32]:

mod_data.iloc[:,0]

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: name, Length: 10000, dtype: int64

random negative sampling

In [35]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1250, 7) (1250, 7)


In [43]:
 sample_train.iloc[:,0]

8561    1
6710    1
942     0
8426    1
9876    1
       ..
1897    0
6283    1
7879    1
3975    0
3872    0
Name: name, Length: 2500, dtype: int64

In [41]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,1:-1].values, 
          sample_train.iloc[:,0].values)
y_predict = model.predict(sample_test.iloc[:,1:-1].values)
evaluate_results(sample_test.iloc[:,0].values, y_predict)


Classification results:
f1: 94.52%
roc: 95.52%
recall: 98.32%
precision: 91.00%


