In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style="darkgrid")
import warnings
warnings.filterwarnings("ignore") 

# Machine learning libraries that I'll use in this study 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

# Introduction 

 Breast cancer is cancer that forms in the cells of the breasts.

 After skin cancer, breast cancer is the most common cancer diagnosed in women in the United States. Breast cancer can occur in both men and women, but it's far more common in women. Substantial support for breast cancer awareness and research funding has helped create advances in the diagnosis and treatment of breast cancer. 

Breast cancer survival rates have increased, and the number of deaths associated with this disease is steadily declining, largely due to factors such as earlier detection, a new personalized approach to treatment and a better understanding of the disease.

In this study you'll get a CSV file from the 'breast-cancer-wisconsin-data' dataset. From it you will get information about the diagnosis together with specific measurements as numeric values. The goal of this study is to predict whether a breast tumour is **benign or malignant** by using the numeric feature columns.

# Data Wrangling

## 1. Gathering  

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)

## 2. Assessing & Cleaning  

In [None]:
# Preview the first rows of df (head() shows 5 rows by default).
# A few of the displayed columns are not needed for this study.
df.head()

In [None]:
# 'id' and 'Unnamed: 32' are not needed in this study, so both columns are removed.
# The result is re-assigned (no inplace mutation) and labels that are already
# gone are ignored, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Count the missing values in every column.
# Author's observation from the output: there are no missing values.
df.isna().sum()

In [None]:
# Print the summary information pandas reports for each column of df.
df.info()

In [None]:
# Dimensions of df as (number of rows, number of columns).
df.shape

In [None]:
# Labels of all columns that remain in df.
df.columns

> We have a total of 31 columns, consisting of 1 categorical and 30 quantitative columns. 
So we are going to use `diagnosis` as the final result. That means we will predict `diagnosis` from the different cancer factors (the 30 quantitative columns). 

Of course it would be best to use all the factor columns to predict which `diagnosis` each patient probably has. But that takes a lot of time, and it is also really **hard to read at a glance**. So what we're going to do now is copy the original file and filter the columns that I want to use. 

**Notice**
- Copying the file is also an important step, because keeping the original file makes it easier to use that file again later. 

**Columns information**
- mean : average
- se (standard error) : quantifies the variation in the means from multiple sets of measurements. In other words, the standard error is the standard deviation of the sample mean, estimated as the standard deviation divided by $\sqrt{n}$.
- standard deviation : quantifies the variation within a set of measurements 
- worst : the "worst" (largest) value of each feature; in this dataset it is the mean of the three largest values measured for that feature

The confusing thing about **standard error** versus **standard deviation** is that the standard error can be estimated from a single set of measurements, even though it describes the means of multiple sets. Thus, even if you only have a single set of measurements, you are often given the option to plot the standard error. 

## What is/are the main feature(s) of interest in your dataset?
- diagnosis 

## What features in the dataset do you think will help support your investigation into your feature(s) of interest?

- all the other columns  

## Which features in the dataset do you think are unnecessary?
- None; we will use all columns as factors of breast cancer 

# Data Visualization 

### 1. Univariate Exploration of data

Using only one variable at a time to visualize the **df** table

In [None]:
# Count plot: number of rows per diagnosis class.
fig, ax = plt.subplots(figsize=(10, 5))

# Pin the bar order so the legend labels below always match the bars instead
# of depending on which class happens to appear first in the data.
sns.countplot(data=df, x='diagnosis', order=['M', 'B'], ax=ax)

ax.set_title('Diagnosis counting'.title(), fontsize=14, weight="bold")
ax.set_xlabel('Type of diagnosis'.title(), fontsize=14, weight="bold")
ax.set_ylabel('Count'.title(), fontsize=14, weight="bold")

# One legend entry per bar, listed in the same order as `order` above.
ax.legend(['malignant', 'benign'], loc='center right', bbox_to_anchor=(1.2, 0.93),
          title="Diagnosis", title_fontsize=14);

> There are a lot more benign than malignant cases, so this is called `imbalanced data`. This is actually not an extreme case. If the imbalance were too strong, we would have to balance the two types of diagnosis in order to get a reliable prediction later.

In [None]:
# Donut chart: share of each diagnosis class in the dataset.
plt.figure(figsize=(15, 7))

# Count the rows per diagnosis; value_counts() orders classes by frequency,
# so the wedge order depends on the data.
sorted_counts = df['diagnosis'].value_counts()

# With autopct set, plt.pie returns (wedges, label texts, percentage texts).
wedges, label_texts, pct_texts = plt.pie(
    sorted_counts, labels=sorted_counts.index, startangle=90,
    counterclock=False, pctdistance=0.8,
    wedgeprops={'width': 0.4}, autopct='%1.0f%%')

plt.title('Proportion of malignant and benign'.title(),
          fontsize=14, weight="bold")

# Derive the legend text from the actual wedge order rather than assuming
# that benign is always the most frequent class.
legend_names = {'B': 'Benign(B)', 'M': 'Malignant(M)'}
plt.legend(wedges,
           [legend_names.get(code, code) for code in sorted_counts.index],
           bbox_to_anchor=(1, 0.9));

> As you can see on the pie chart benign possesses 63% of the total dataset.

In [None]:
# distplot = histogram + KDE curve
# Example: radius_mean
fig, ax = plt.subplots(figsize=(15, 7))
x = df.radius_mean
bins = np.arange(0, 30, 1)
sns.distplot(x, bins=bins, color='black', ax=ax)

ax.set_title('radius mean Histogram'.title(), fontsize=14, weight="bold")
ax.set_xlabel('radius mean range'.title(), fontsize=14, weight="bold")

# When the KDE curve is drawn, distplot normalises the histogram, so the
# y axis is a density, not a count or a percentage.
ax.set_ylabel('Density'.title(), fontsize=14, weight="bold");

> The distribution is approximately normal.

In [None]:
# Histogram + KDE curve of symmetry_mean.
fig, ax = plt.subplots(figsize=(15, 7))
x = df.symmetry_mean
bins = np.arange(0, 1, 0.01)
sns.distplot(x, color='black', bins=bins, ax=ax)

ax.set_title('Symmetry mean Histogram'.title(), fontsize=14, weight="bold")
ax.set_xlabel('symmetry mean range'.title(), fontsize=14, weight="bold")

# When the KDE curve is drawn, distplot normalises the histogram, so the
# y axis is a density rather than a count.
ax.set_ylabel('Density'.title(), fontsize=14, weight="bold");

> Normally distributed and symmetric, but the mean value isn't located at the center of the plot. 

In [None]:
# Histogram + KDE curve of concavity_mean.
fig, ax = plt.subplots(figsize=(15, 7))
x = df.concavity_mean
bins = np.arange(0, 1, 0.01)
sns.distplot(x, color='black', bins=bins, ax=ax)

ax.set_title('concavity mean Histogram'.title(), fontsize=14, weight="bold")
ax.set_xlabel('concavity mean range'.title(), fontsize=14, weight="bold")

# When the KDE curve is drawn, distplot normalises the histogram, so the
# y axis is a density rather than a count.
ax.set_ylabel('Density'.title(), fontsize=14, weight="bold");

> Right-skewed distribution 

### 2.Bivariate Exploration of Data

Using two variables to visualize the **df** table

In [None]:
# Separate the target column from the feature columns.
y = df['diagnosis']
x = df.iloc[:, 1:]  # every column after the first one

# Standardization (z-score): subtract each column's mean, then divide by its
# standard deviation.
stand = x.sub(x.mean()).div(x.std())

> Why are we doing standardization? Because the **mean, se and worst** columns have values of very different magnitudes, it's hard to compare them as raw data. That's why we standardize: it makes the columns comparable.


## $$ z_{score} = \frac {(x- \mu)}{\sigma}$$


- Z = standard score or z score; this score tells you how many standard deviations your value is from the mean.
- x = observed value
- $\mu$ = mean value of dataset
- $\sigma$ = standard deviation of dataset

In [None]:
# There are 30 sub features, so they are visualised in 3 groups; this is the
# first group (standardized columns 0-9).
first_group = stand.iloc[:, 0:10]

# Melt the wide table (diagnosis + standardized columns) into a long table
# called `data`, so that different numeric columns can share one graph.
#   id_vars    : column(s) to use as identifier variables
#   var_name   : name to use for the 'variable' column
#   value_name : name to use for the 'value' column
data = pd.concat([y, first_group], axis=1).melt(id_vars="diagnosis",
                                                var_name="features",
                                                value_name='value')

fig, ax = plt.subplots(figsize=(15, 7))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data,
               split=True, inner="quart", ax=ax)

ax.set_title('Sub features with standardization(first 10 features)'.title(),
             fontsize=14, weight="bold")
ax.set_xlabel('Sub features'.title(), fontsize=14, weight="bold")
ax.set_ylabel('z score'.title(), fontsize=14, weight="bold")

plt.xticks(rotation=45);


In [None]:
# Second group of sub features (standardized columns 10-19).
second_group = stand.iloc[:, 10:20]

# Melt the wide table (diagnosis + standardized columns) into a long table
# called `data`, so that different numeric columns can share one graph.
#   id_vars    : column(s) to use as identifier variables
#   var_name   : name to use for the 'variable' column
#   value_name : name to use for the 'value' column
data = pd.concat([y, second_group], axis=1).melt(id_vars="diagnosis",
                                                 var_name="features",
                                                 value_name='value')

fig, ax = plt.subplots(figsize=(15, 7))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data,
               split=True, inner="quart", ax=ax)

ax.set_title('Sub features with standardization(Second 10 features)'.title(),
             fontsize=14, weight="bold")
ax.set_xlabel('Sub features'.title(), fontsize=14, weight="bold")
ax.set_ylabel('z score'.title(), fontsize=14, weight="bold")

plt.xticks(rotation=45);


In [None]:
# Last group of sub features (standardized columns 20-29).
# The upper bound is 30, not 31: the group holds 10 columns like the other
# two groups, matching the "last 10 features" title below.
data = pd.concat([y, stand.iloc[:, 20:30]], axis=1)

# Melt the wide table (diagnosis + standardized columns) into a long table
# called `data`, so that different numeric columns can share one graph.
#   id_vars    : column(s) to use as identifier variables
#   var_name   : name to use for the 'variable' column
#   value_name : name to use for the 'value' column
data = pd.melt(data, id_vars="diagnosis",
               var_name="features",
               value_name='value')

plt.figure(figsize=(15, 7))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data,
               split=True, inner="quart")

plt.title('Sub features with standardization(last 10 features)'.title(),
          fontsize=14, weight="bold")
plt.xlabel('Sub features'.title(), fontsize=14, weight="bold")
plt.ylabel('z score'.title(), fontsize=14, weight="bold")

plt.xticks(rotation=45);


In [None]:
## find out proper plotting ....

### 3.Multivariate Exploration of Data

In [None]:
# PairGrid
# FacetGrid
# Scatterplot with size or shape  

## Prediction 

> In this last part we'll do a prediction. For that we'll use **SVM, RandomForest, DecisionTree, KNN and LogisticRegression**, which are among the most popular and fundamental machine learning algorithms in data science. If you are not familiar with them, please check how each of the algorithms above works before you dive into the last part of this study.

### Split Train and Test

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2) and train on the other 80%.
# Splitting matters: the test set is used only to examine the prediction model
# and measure its performance, so it must never be used for training.
# Why random_state is set:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn
train, test = train_test_split(df, test_size=0.2, random_state=2019)

# `diagnosis` (malignant or benign) is what we want to predict, so it is the
# y variable; all remaining columns are the x variables. That is why the
# column is removed from x_train and x_test.
target_column = 'diagnosis'
x_train, y_train = train.drop(columns=[target_column]), train[target_column]
x_test, y_test = test.drop(columns=[target_column]), test[target_column]

print(len(train), len(test))

We got 455 rows for training and 114 rows for testing

In [None]:
### SVM 

In [None]:
# Fit a support vector classifier on the training split (fit() returns the
# fitted estimator itself).
model = svm.SVC(gamma='scale').fit(x_train, y_train)

# Predict the labels of the held-out test split.
y_pred = model.predict(x_test)

# accuracy_score compares the predictions (y_pred) with the true test labels
# (y_test) and measures how often they agree; shown as a percentage.
svm_accuracy = metrics.accuracy_score(y_pred, y_test) * 100
print('SVM: %.2f' % svm_accuracy)

In [None]:
# So we got 91.23%. That means our predictions match the y_test labels for 91.23% of the test rows.

In [None]:
### DecisionTreeClassifier

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed (same seed as the train/test split) so the tree, and
# therefore the reported accuracy, is reproducible across runs.
model = DecisionTreeClassifier(random_state=2019)
model.fit(x_train, y_train)

# Predict the held-out test split and report the accuracy as a percentage.
y_pred = model.predict(x_test)

print('DecisionTreeClassifier: %.2f' % (metrics.accuracy_score(y_test, y_pred)*100))


In [None]:
### KNeighborsClassifier

In [None]:
# Fit k-nearest neighbours with its default settings on the training split.
model = KNeighborsClassifier().fit(x_train, y_train)

# Predict the held-out test split and report the accuracy as a percentage.
y_pred = model.predict(x_test)
knn_accuracy = metrics.accuracy_score(y_pred, y_test) * 100
print('KNeighborsClassifier: %.2f' % knn_accuracy)

In [None]:
### LogisticRegression

In [None]:
# Logistic regression with the lbfgs solver and up to 2000 iterations.
# About the parameters:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model = LogisticRegression(solver='lbfgs', max_iter=2000).fit(x_train, y_train)

# Predict the held-out test split and report the accuracy as a percentage.
y_pred = model.predict(x_test)
logreg_accuracy = metrics.accuracy_score(y_pred, y_test) * 100
print('LogisticRegression: %.2f' % logreg_accuracy)


In [None]:
### RandomForestClassifier

In [None]:
# Fit a random forest of 100 trees on the training split.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible; the feature importances and the top-5 selection computed from
# this model in the following cells would otherwise change from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=2019)
model.fit(x_train, y_train)

# Predict the held-out test split and report the accuracy as a percentage.
y_pred = model.predict(x_test)

print('RandomForestClassifier: %.2f' % (metrics.accuracy_score(y_test, y_pred)*100))


In [None]:
### Compute Feature Importances

In [None]:
# model.feature_importances_ holds one numeric importance score per input
# column of the fitted model. Pair each score with its training column name
# in a pandas Series, then rank from most to least important.
importances = pd.Series(model.feature_importances_, index=x_train.columns)
features = importances.sort_values(ascending=False)
print(features)

In [None]:
### Extract Top 5 Features

In [None]:
# `features` is sorted by importance, so its first five index labels are the
# five most important columns (Series.keys() is an alias for Series.index).
top_5_features = features.index[:5]

print(top_5_features)

In [None]:
### SVM(Top 5)

In [None]:
# Train the SVM again, this time on the five most important columns only.
train_top5 = x_train[top_5_features]
test_top5 = x_test[top_5_features]
model = svm.SVC(gamma='scale').fit(train_top5, y_train)

# Predict the held-out test split and report the accuracy as a percentage.
y_pred = model.predict(test_top5)
svm_top5_accuracy = metrics.accuracy_score(y_pred, y_test) * 100
print('SVM(Top5): %.2f' % svm_top5_accuracy)

In [None]:
### Cross Validation (principle version)


In [None]:
# Manual 5-fold cross-validation of the SVM on the top-5 feature columns.
model = svm.SVC(gamma='scale')

# K = 5 folds. shuffle=True is required for random_state to have any effect;
# recent scikit-learn versions raise a ValueError when random_state is set
# while shuffle is left at False.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

cv_features = df[top_5_features]
cv_target = df.diagnosis

accs = []

for train_index, test_index in cv.split(cv_features):
    # Fold-local names are used so the hold-out split created earlier
    # (x_train, x_test, y_train, y_test) is not overwritten by the last fold.
    x_fold_train = cv_features.iloc[train_index]
    y_fold_train = cv_target.iloc[train_index]

    x_fold_test = cv_features.iloc[test_index]
    y_fold_test = cv_target.iloc[test_index]

    model.fit(x_fold_train, y_fold_train)
    fold_pred = model.predict(x_fold_test)
    # accuracy_score(y_true, y_pred); accuracy is symmetric, so the argument
    # order does not change the value.
    accs.append(metrics.accuracy_score(y_fold_test, fold_pred))

print(accs)
    

In [None]:
### Cross Validation (simple version)

In [None]:
# Same 5-fold cross-validation as the manual loop, done by cross_val_score.
model = svm.SVC(gamma='scale')

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set while
# shuffle is left at False.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

# cross_val_score applies the cross-validation scheme (here KFold), fits the
# model on each training fold and returns one score per fold.
# x variable: df[top_5_features], y variable: df.diagnosis
accs = cross_val_score(model, df[top_5_features], df.diagnosis, cv=cv)
print(accs)

In [None]:
### Test all Models

In [None]:
# Candidate classifiers, keyed by the name printed in the report.
# The dict is called `models` so the loop variable below does not shadow it.
# The tree-based estimators get a fixed random_state so scores are reproducible.
models = {
    'SVM': svm.SVC(gamma='scale'),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=2019),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=2000),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=2019),
}

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set while
# shuffle is left at False.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

# Report the mean 5-fold accuracy (in percent) of every classifier on the
# top-5 feature columns.
for name, estimator in models.items():
    scores = cross_val_score(estimator, df[top_5_features], df.diagnosis, cv=cv)

    print('%s:%.2f%%' % (name, np.mean(scores)*100))



In [None]:
### Normalized Dataset

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Candidate classifiers, keyed by the name printed in the report.
# The dict is called `models` so the loop variable below does not shadow it.
# The tree-based estimators get a fixed random_state so scores are reproducible.
models = {
    'SVM': svm.SVC(gamma='scale'),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=2019),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=2000),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=2019),
}

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set while
# shuffle is left at False.
cv = KFold(n_splits=5, shuffle=True, random_state=2019)

for name, estimator in models.items():
    # Scale each feature to the range [0, 1] inside a pipeline, so that
    # cross_val_score fits the scaler on the training folds only. Fitting the
    # scaler on the whole dataset beforehand would leak information about the
    # validation fold into training.
    scaled_estimator = make_pipeline(MinMaxScaler(feature_range=(0, 1)), estimator)
    scores = cross_val_score(scaled_estimator, df[top_5_features], df.diagnosis, cv=cv)

    print('%s:%.2f%%' % (name, np.mean(scores)*100))