In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style="darkgrid")
import warnings
warnings.filterwarnings("ignore") 

# Machine learning libraries that I'll use in this study 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

# Preparing for Data wrangling

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 5 rows of df.
# A few of the columns are not needed for this study.
df.head(5)

In [None]:
# 'id' and 'Unnamed: 32' are not needed in this study, so both columns are dropped.
# The frame is reassigned instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Count the missing values in every column.
# (The original author observed no missing values here.)
df.isnull().sum()

In [None]:
# Show dtype and non-null count for each column.
# The DataFrame loaded above is named `df`; `data` was never defined and
# raised a NameError on a fresh kernel.
df.info()

# Data Wrangling 

## Gathering, Assessing, Cleaning

## Prediction 

### Split Train and Test

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2) and train on the other 80%.
# Keeping the two sets apart matters: the test set is what we use to measure the
# model's performance as a number, so it must never be used for training.
# Otherwise the reported score would not reflect how the model really performs.
# Why random_state is set: https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn
train, test = train_test_split(df, random_state=2019, test_size=0.2)

# 'diagnosis' (malignant or benign) is what we want to predict, so it is the
# y variable. Every other column is used as an x variable, which is why
# 'diagnosis' is removed from x_train and x_test.
y_train = train['diagnosis']
x_train = train.drop(columns=['diagnosis'])

y_test = test['diagnosis']
x_test = test.drop(columns=['diagnosis'])

print(len(train), len(test))

We got 455 rows for training and 114 rows for testing.

In [None]:
### SVM 

In [None]:
# Build the support vector classifier and learn from the training set
# (fit returns the fitted estimator itself).
model = svm.SVC(gamma='scale').fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = model.predict(x_test)

# metrics.accuracy_score compares the predictions (y_pred) with the actual
# labels (y_test) and returns the fraction that match; shown here as a percentage.
svm_accuracy = metrics.accuracy_score(y_test, y_pred) * 100
print('SVM: %.2f' % svm_accuracy)

In [None]:
So we got 91.23%. That means 91.23% of our predictions match the actual labels in y_test.

In [None]:
### DecisionTreeClassifier

In [None]:
# A fixed random_state makes the fitted tree, and therefore the printed
# accuracy, the same on every run. The seed matches the one used for the
# train/test split.
model = DecisionTreeClassifier(random_state=2019)
model.fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = model.predict(x_test)

# Share of test rows whose predicted label matches the actual one, as a percentage.
print('DecisionTreeClassifier: %.2f' % (metrics.accuracy_score(y_test, y_pred)*100))


In [None]:
### KNeighborsClassifier

In [None]:
# Fit a k-nearest-neighbours classifier with default settings on the training set.
model = KNeighborsClassifier().fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = model.predict(x_test)

# Share of test rows whose predicted label matches the actual one, as a percentage.
knn_accuracy = metrics.accuracy_score(y_test, y_pred) * 100
print('KNeighborsClassifier: %.2f' % knn_accuracy)

In [None]:
### LogisticRegression

In [None]:
# Parameter reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model = LogisticRegression(max_iter=2000, solver='lbfgs').fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = model.predict(x_test)

# Share of test rows whose predicted label matches the actual one, as a percentage.
logreg_accuracy = metrics.accuracy_score(y_test, y_pred) * 100
print('LogisticRegression: %.2f' % logreg_accuracy)


In [None]:
### RandomForestClassifier

In [None]:
# A fixed random_state makes the forest reproducible. Without it the feature
# importances read from this model, and the top-5 feature selection built on
# them, can change from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=2019)
model.fit(x_train, y_train)

# Predict the labels of the test set.
y_pred = model.predict(x_test)

# Share of test rows whose predicted label matches the actual one, as a percentage.
print('RandomForestClassifier: %.2f' % (metrics.accuracy_score(y_test, y_pred)*100))


In [None]:
### Compute Feature Importances

In [None]:
# model.feature_importances_ tells which features matter most to the fitted model.
# NOTE: this assumes `model` is still the RandomForestClassifier fitted in the
# previous cell and that x_train still holds all feature columns.
importances = model.feature_importances_

# Pair every training column with its importance and sort from most to least important.
features = pd.Series(importances, index=x_train.columns)
features = features.sort_values(ascending=False)
print(features)

In [None]:
### Extract Top 5 Features

In [None]:
# `features` is sorted by importance (descending), so the first five index
# labels are the five most important feature names.
# (Series.keys() is just an alias for Series.index.)
top_5_features = features.index[:5]

print(top_5_features)

In [None]:
### SVM(Top 5)

In [None]:
# Train the same SVM as before, but only on the five most important features.
model = svm.SVC(gamma='scale').fit(x_train[top_5_features], y_train)

# Predict the labels of the test set using the same five columns.
y_pred = model.predict(x_test[top_5_features])

# Share of test rows whose predicted label matches the actual one, as a percentage.
svm_top5_accuracy = metrics.accuracy_score(y_test, y_pred) * 100
print('SVM(Top5): %.2f' % svm_top5_accuracy)

In [None]:
### Cross Validation (principle version)


In [None]:
model = svm.SVC(gamma='scale')

# KFold only uses random_state when shuffle=True. Passing a seed while shuffle
# is left at its default (False) raises a ValueError in scikit-learn >= 0.24,
# so shuffling is enabled to make the seed meaningful.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)
# Iterations: K=5

# One accuracy value per fold.
accs = []

# Fold-local names are used so the hold-out split created earlier
# (x_train / x_test / y_train / y_test / y_pred) is not overwritten.
for train_index, test_index in cv.split(df[top_5_features]):
    fold_train = df.iloc[train_index]
    fold_test = df.iloc[test_index]

    # Learn on the training part of this fold, predict the held-out part.
    model.fit(fold_train[top_5_features], fold_train.diagnosis)
    fold_pred = model.predict(fold_test[top_5_features])

    # Accuracy is symmetric, but (y_true, y_pred) is the documented argument order.
    accs.append(metrics.accuracy_score(fold_test.diagnosis, fold_pred))

print(accs)
    

In [None]:
### Cross Validation (simple version)

In [None]:
model = svm.SVC(gamma='scale')

# KFold only uses random_state when shuffle=True. Passing a seed while shuffle
# is left at its default (False) raises a ValueError in scikit-learn >= 0.24.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

# cross_val_score applies the cross-validation scheme (KFold in our case),
# fits the model on each training fold and returns one score per fold.
# x variable: df[top_5_features], y variable: df.diagnosis
accs = cross_val_score(model, df[top_5_features], df.diagnosis, cv=cv)
print(accs)

In [None]:
### Test all Models

In [None]:
# Candidate classifiers keyed by display name. The dict is called `models` so
# the loop variable `model` below no longer shadows the collection it iterates.
# Tree-based models get a fixed random_state so their scores are reproducible.
models = {
    'SVM': svm.SVC(gamma='scale'),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=2019),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=2000),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=2019),
}

# KFold only uses random_state when shuffle=True. Passing a seed while shuffle
# is left at its default (False) raises a ValueError in scikit-learn >= 0.24.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

# Print the mean 5-fold score of every model on the top-5 features, as a percentage.
for name, model in models.items():
    scores = cross_val_score(model, df[top_5_features], df.diagnosis, cv=cv)

    print('%s:%.2f%%' % (name, np.mean(scores)*100))



In [None]:
### Normalized Dataset

In [None]:
# Candidate classifiers keyed by display name. The dict is called `models` so
# the loop variable `model` below no longer shadows the collection it iterates.
# Tree-based models get a fixed random_state so their scores are reproducible.
models = {
    'SVM': svm.SVC(gamma='scale'),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=2019),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=2000),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=2019),
}

# KFold only uses random_state when shuffle=True. Passing a seed while shuffle
# is left at its default (False) raises a ValueError in scikit-learn >= 0.24.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

for name, model in models.items():
    # Scale every feature to the range 0..1 inside a pipeline, so the scaler is
    # fitted on the training folds only. Scaling the whole dataset before
    # cross-validation would leak the min/max of the held-out fold into training.
    scaled_model = make_pipeline(MinMaxScaler(feature_range=(0, 1)), model)
    scores = cross_val_score(scaled_model, df[top_5_features], df.diagnosis, cv=cv)

    print('%s:%.2f%%' % (name, np.mean(scores)*100))