# Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

# Import Data

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv", sep=",")

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
train,test = train_test_split(data,test_size=0.2)

In [5]:
train_x = train.drop(["Outcome"],axis=1)
train_y = train["Outcome"]
test_x = test.drop(["Outcome"],axis=1)
test_y = test["Outcome"]

# Create Pipelines

3 model için Pipeline oluşturulması sağlanır.

- MinMax Scaler ile data preprocessing
- PCA ile boyut indirgeme
- Modelin eğitilmesi

## Decision Tree Pipeline (CART)

In [6]:
DecisionTreePipeline = Pipeline([('pca', PCA(n_components = 3)), 
                                 ('myscaler', MinMaxScaler()), 
                                 ('decision_tree', DecisionTreeClassifier())], verbose = True)

## Logistic Regression Pipeline (CART)

In [7]:
LogisticPipeline = Pipeline([('pca', PCA(n_components = 3)), 
                                 ('myscaler', MinMaxScaler()), 
                                 ('logistic_regression', LogisticRegression())], verbose = True)

## Random Forest Pipeline

In [8]:
RandomForestPipeline = Pipeline([('pca', PCA(n_components = 3)), 
                                 ('myscaler', MinMaxScaler()), 
                                 ('random_forest', RandomForestClassifier())], verbose = True)

# Model Training and Testing

In [9]:
Pipelines = [DecisionTreePipeline,LogisticPipeline,RandomForestPipeline]

En iyi modelin seçimi için değişkenler tanımlanır.

In [10]:
classifier = 0.0
accuracy = 0.0
pipeline = ""

Pipeline için bir dictionary oluşturulur ve model eğitilir.

In [11]:
pipelineDic = {0:'Decision Tree',1:'Logistic Regression', 2:'Random Forest'}

for pipe in Pipelines:
    pipe.fit(train_x,train_y)

[Pipeline] ............... (step 1 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing myscaler, total=   0.0s
[Pipeline] ..... (step 3 of 3) Processing decision_tree, total=   0.0s
[Pipeline] ............... (step 1 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing myscaler, total=   0.0s
[Pipeline]  (step 3 of 3) Processing logistic_regression, total=   0.0s
[Pipeline] ............... (step 1 of 3) Processing pca, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing myscaler, total=   0.0s
[Pipeline] ..... (step 3 of 3) Processing random_forest, total=   0.1s


Her bir model için test accuracy değerlerinin hesaplanması sağlanır.

In [12]:
for i,model in enumerate(Pipelines):
    print("{} Test Accuracy: {}".format(pipelineDic[i],model.score(test_x,test_y)))

Decision Tree Test Accuracy: 0.7207792207792207
Logistic Regression Test Accuracy: 0.7792207792207793
Random Forest Test Accuracy: 0.7402597402597403


In [18]:
for i,model in enumerate(Pipelines):
    if model.score(test_x,test_y)>accuracy:
        accuracy = model.score(test_x,test_y)
        pipeline = model
        classifier = i
print('En iyi model: {}'.format(pipelineDic[classifier]))

En iyi model: Logistic Regression
