In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 1-Read Data

In [2]:
# Diagnosis labels used in the dataset:
#   M = malignant tumor
#   B = benign tumor

data = pd.read_csv("data/dataset.csv")
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# Remove the columns that carry no predictive information: the record id and
# the "Unnamed: 32" column. The frame is rebound instead of mutated in place,
# and errors="ignore" makes the cell safe to re-run: dropping in place raises
# KeyError on the second execution because the columns are already gone.
data = data.drop(columns=["id", "Unnamed: 32"], errors="ignore")

In [4]:
# Keep only the rows whose diagnosis is "M" and summarise the resulting frame.
is_malignant = data["diagnosis"] == "M"
M = data[is_malignant]
M.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 567
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                212 non-null    object 
 1   radius_mean              212 non-null    float64
 2   texture_mean             212 non-null    float64
 3   perimeter_mean           212 non-null    float64
 4   area_mean                212 non-null    float64
 5   smoothness_mean          212 non-null    float64
 6   compactness_mean         212 non-null    float64
 7   concavity_mean           212 non-null    float64
 8   concave points_mean      212 non-null    float64
 9   symmetry_mean            212 non-null    float64
 10  fractal_dimension_mean   212 non-null    float64
 11  radius_se                212 non-null    float64
 12  texture_se               212 non-null    float64
 13  perimeter_se             212 non-null    float64
 14  area_se                  2

In [5]:
# Keep only the rows whose diagnosis is "B" and summarise the resulting frame.
is_benign = data["diagnosis"] == "B"
B = data[is_benign]
B.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 19 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                357 non-null    object 
 1   radius_mean              357 non-null    float64
 2   texture_mean             357 non-null    float64
 3   perimeter_mean           357 non-null    float64
 4   area_mean                357 non-null    float64
 5   smoothness_mean          357 non-null    float64
 6   compactness_mean         357 non-null    float64
 7   concavity_mean           357 non-null    float64
 8   concave points_mean      357 non-null    float64
 9   symmetry_mean            357 non-null    float64
 10  fractal_dimension_mean   357 non-null    float64
 11  radius_se                357 non-null    float64
 12  texture_se               357 non-null    float64
 13  perimeter_se             357 non-null    float64
 14  area_se                  

# 2-Visualize Data

In [6]:
%matplotlib qt5
plt.scatter(M.radius_mean, M.texture_mean, color="red", label="malignant",alpha= 0.3)
plt.scatter(B.radius_mean, B.texture_mean, color="green", label="benign",alpha= 0.3)
plt.xlabel("radius_mean")
plt.ylabel("texture_mean")
plt.legend()
plt.show() 

# 3-Preprocessing

In [7]:
# Pull the diagnosis labels out as an array and peek at positions 15-24
# to see what the values look like.
y = data["diagnosis"].values
y[15:25]

array(['M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M'], dtype=object)

In [8]:
# Encode the target as integers: 1 = malignant ("M"), 0 = everything else (benign).
# An existing 1 is also accepted so the cell is idempotent: testing only for "M"
# turns every already-encoded label into 0 when the cell is executed a second time.
data.diagnosis = [1 if each in ("M", 1) else 0 for each in data.diagnosis]
y = data.diagnosis.values
y[15:25]

array([1, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=int64)

In [9]:
# The feature matrix is every column except the target, which already lives in y.
x = data.drop(columns=["diagnosis"])

In [10]:
# Min-max normalization: rescale every feature column to the [0, 1] range.
# DataFrame.min() / .max() reduce per column on every pandas version, whereas
# np.min(x) / np.max(x) forward axis=None, which pandas >= 2.0 treats as a
# reduction over the whole frame (a single scalar) and would silently turn this
# into one global rescale instead of a per-feature one.
# NOTE: the statistics are computed over all rows, i.e. before the train/test split.
col_min = x.min()
col_max = x.max()
x = (x - col_min) / (col_max - col_min)

# 4-Split Data

In [11]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 5-Create KNN Model

In [12]:
# Build a k-nearest-neighbours classifier and train it on the training split.
k = 3  # value passed as n_neighbors
# fit() returns the estimator itself, so the fitted model can be bound directly.
knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
knn

KNeighborsClassifier(n_neighbors=3)

# 6-Predict Data

In [13]:
# Predict a label for every row of the test split; the array is in x_test row order,
# so position i holds the prediction for the i-th test row.
prediction = knn.predict(x_test)
prediction

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [14]:
# What fraction of the predictions on the test split is correct?
test_accuracy = knn.score(x_test, y_test)
print(" {} nn score: {} ".format(k, test_accuracy))

 3 nn score: 0.9707602339181286 


# 7-Finding the Best K Value

In [15]:
# Fit one KNN classifier per candidate k and record its accuracy on the test split.
# NOTE: k is tuned against x_test / y_test here, so the best score found below is
# an optimistic estimate of how the model will do on unseen data.
k_values = range(1, 50)  # candidate k: 1..49 (the upper bound is exclusive)
score_list = []
for i in k_values:
    knn2 = KNeighborsClassifier(n_neighbors = i)
    knn2.fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))

# Report the winner from the recorded scores instead of reading it off the figure.
best_k = k_values[int(np.argmax(score_list))]  # argmax keeps the smallest k on ties
print("best k: {} (accuracy: {})".format(best_k, max(score_list)))

plt.plot(k_values, score_list)  # x = candidate k values, y = matching accuracies
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()