In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 1- Read Data

In [2]:
data = pd.read_csv("data/dataset.csv")

# 2-Data Preprocessing

In [3]:
data.drop(["id","Unnamed: 32"], axis=1, inplace = True) # remove unnecessary columns

In [4]:
data.diagnosis = [ 1 if each == "M" else 0 for each in data.diagnosis]
y = data.diagnosis.values
x = data.drop(["diagnosis"], axis = 1) # axis=1, tüm sütundaki verileri siler.

In [5]:
y[15:25] # Yalnızca öÖrnek olsun diye herhangi bir aralık print edildi.

array([1, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=int64)

In [6]:
# normalization
x = (x - np.min(x))/(np.max(x)-np.min(x))
x.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


# 3- Visualize Data

In [7]:
M = data[y == 1] # malignant = M  kotu huylu tumor
B = data[y == 0] # benign    = B  iyi huylu tumor

In [8]:
# Örnek farklı görsel kıyaslamalar
%matplotlib qt5
plt.scatter(M.radius_mean, M.texture_mean, color="red", label="malignant",alpha= 0.3)
plt.scatter(B.radius_mean, B.texture_mean, color="black", label="benign", alpha= 0.3)
plt.xlabel("radius_mean")
plt.ylabel("texture_mean")
plt.legend()
plt.savefig("sample_fig.png")
plt.show()

In [9]:
# Örnek farklı görsel kıyaslamalar
%matplotlib qt5
plt.scatter(M.radius_mean, M.perimeter_mean, color="red", label="malignant",alpha= 0.9)
plt.scatter(B.radius_mean, B.perimeter_mean, color="blue", label="benign", alpha= 0.1)
plt.xlabel("radius_mean")
plt.ylabel("perimeter_mean")
plt.legend()
plt.savefig("sample_fig2.png")
plt.show()

In [10]:
# Örnek farklı görsel kıyaslamalar
%matplotlib qt5
plt.scatter(M.radius_mean, M.area_mean, color="red", label="malignant",alpha= 0.3)
plt.scatter(B.radius_mean, B.area_mean, color="blue", label="benign", alpha= 0.3)
plt.xlabel("radius_mean")
plt.ylabel("area_mean")
plt.legend()
plt.savefig("sample_fig3.png")
plt.show()

# 4-Split Data

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15, random_state = 42)

# 5-Create Model (DT Classification)

In [12]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

DecisionTreeClassifier()

# 6-Test(Predict) Data

#### Note

Aşağıdaki sonucun yorumu:

- Okunan x_test içerisindeki 86 sample'ın %91.8'ini, yani 79 tanesini  doğru tahmin etti. 
 
- 86 sample'ın %8.2'ini yani yaklaşık 7 tanesini ise yanlış tahmin etti.

In [13]:
print("accuracy: ", dt.score(x_test, y_test)) 

accuracy:  0.9186046511627907
