In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
data = data.drop(['id','Unnamed: 32'], axis = 1) # удаляем идентификаторы пациентов и пустой столбец

names_of_cols = ['radius', 'texture', 
                 'perimeter', 'area', 
                 'smoothness', 'compactness', 
                 'concavity', 'concave points',
                 'symmetry', 'fractal_dimension']

for name in names_of_cols:
    data[name] = sum(data[name + '_mean'], data[name + '_worst'])/2
    data.drop(columns=name + '_mean', axis=1, inplace=True)
    data.drop(columns=name + '_worst', axis=1, inplace=True)

data.drop(columns=['radius', 'area'], axis=1, inplace=True)
data.drop(columns=['radius_se', 'area_se'], axis=1, inplace=True)

In [None]:
data = data.replace({'diagnosis':{'M':1, 'B':0}})
data['diagnosis'] = pd.to_numeric(data['diagnosis'])


k = 8 
corr = data.corr()
cols = corr.nlargest(k, 'diagnosis')['diagnosis'].index.tolist() # 10 наиболее скореллированных признаков с целевой переменной
data

In [None]:
cm = np.corrcoef(data[cols].values.T)
plt.figure(figsize=(10,7))
sns.set(font_scale=1.25)
sns.heatmap(cm, yticklabels=cols, xticklabels=cols, 
            square=True, annot=True, 
            cmap=sns.color_palette('coolwarm',1000), vmin=0, center=0.5)
plt.show()

In [6]:
#sns.pairplot(data[cols], hue='diagnosis')

In [7]:
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(data.loc[:, 'texture_se':], 
                                                    data['diagnosis'], 
                                                    test_size=0.3, random_state=729)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.metrics import precision_recall_fscore_support

In [78]:

neighbors = range(1, 50)

report = {'precision':[],'recall':[],'fscore':[],'k':[]}

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_train)
    preds_test = knn.predict(X_test)
    
    scorings = precision_recall_fscore_support(preds_test, y_test, average='macro')
    report['precision'].append(scorings[0])
    report['recall'].append(scorings[1])
    report['fscore'].append(scorings[2])
    report['k'].append(k)
    
report = pd.DataFrame(report)

for i in report.columns[:-1]:
    print('\nBest', i,'\n{:.3f}'.format(report.iloc[report[i].argmax()][i]) ,
            '\nK =', int(report.iloc[report[i].argmax()]['k']),  "\n" + "==" * 20)


Best precision 
0.911 
K = 9 

Best recall 
0.949 
K = 9 

Best fscore 
0.926 
K = 9 


In [76]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
        'n_neighbors': range(1, 50),
        'p': range(1, 4)
}

knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

print("Best CV score: {:.3f}, best CV k: {}".format(
    grid_search.best_score_, grid_search.best_estimator_.n_neighbors)
) 


test_predictions = grid_search.best_estimator_.predict(X_test)
print(f"Resulting test score: {precision_recall_fscore_support(test_predictions, y_test, average='macro')}")

Fitting 5 folds for each of 147 candidates, totalling 735 fits
Best CV score: 0.935, best CV k: 5
Resulting test score: (0.8978510895883778, 0.9345454545454546, 0.9124699767689097, None)
CPU times: total: 28.4 s
Wall time: 5.82 s
