# Installing the libraries

In [3]:
#!pip3 install pandas
#!pip3 install impyute

Collecting impyute
  Downloading https://files.pythonhosted.org/packages/37/28/86829f67c9affb847facaab94687761d3555539ec675f7577778c5b2680a/impyute-0.0.8-py2.py3-none-any.whl
Collecting numpy (from impyute)
  Downloading https://files.pythonhosted.org/packages/c1/e2/4db8df8f6cddc98e7d7c537245ef2f4e41a1ed17bf0c3177ab3cc6beac7f/numpy-1.16.3-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)
[K    100% |████████████████████████████████| 17.3MB 67kB/s 
[?25hCollecting scipy (from impyute)
  Downloading https://files.pythonhosted.org/packages/72/4c/5f81e7264b0a7a8bd570810f48cd346ba36faedbd2ba255c873ad556de76/scipy-1.3.0-cp36-cp36m-manylinux1_x86_64.whl (25.2MB)
[K    100% |████████████████████████████████| 25.2MB 42kB/s 
[?25hCollecting scikit-learn (from impyute)
  Downloading https://files.pythonhosted.org/packages/85/04/49633f490f726da6e454fddc8e938bbb5bfed2001681118d3814c219b723/scikit_learn-0.21.2-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
[K    100% |████████████████████████████████| 6.7MB 

# Importing needed libraries

In [28]:
import pandas as pd
import numpy as np

import sys
# Raise the Python interpreter's maximum recursion depth (a Python-level
# setting, not an operating-system limit).
# NOTE(review): presumably needed by a recursive routine used during
# imputation below -- confirm, and lower the value if it is not required.
sys.setrecursionlimit(100000)

from impyute.imputation.cs import fast_knn
import eli5

from sklearn.neighbors import KNeighborsClassifier


# Exploratory Data Analysis (EDA)

In [69]:
# Load the raw diabetes dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='diabetes_dataset.csv')
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
4,5,116.0,74.0,,,25.6,0.201,30,0


In [26]:
# Show the dtype pandas inferred for each column of the loaded frame.
data.dtypes

Pregnancies                   int64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

### Taking a look at the **missing** data

In [11]:
# Per-column count of missing (NaN) entries in the raw dataset.
data.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

Here we can see that data is missing in the *Glucose* (5), *BloodPressure* (35), *SkinThickness* (227), *Insulin* (374) and *BMI* (11) columns, with *SkinThickness* and *Insulin* affected the most. All of these columns hold **numeric** values.

# Data Imputation:

## Imputation Using k-NN:

In [17]:
# Impute the missing values with impyute's fast_knn, using k=30 neighbours.
# The whole frame is passed as a NumPy array, so every column of `data`
# (including 'Outcome') is visible to the imputer.
imputed_training = fast_knn(data.values, k=30)

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [18]:
# Wrap the imputed array in a DataFrame that reuses the original column labels.
imputed_data = pd.DataFrame(data=imputed_training, columns=data.columns)

# Sanity check: per-column count of values that are still missing.
imputed_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## Imputation Using Multivariate Imputation by Chained Equations (MICE)

In [70]:
 from impyute.imputation.cs import mice

# Reading dataset with missing data again
data = pd.read_csv('diabetes_dataset.csv')

# start the MICE training
imputed_training = mice(data.values)

imputed_data = pd.DataFrame(imputed_training, columns=data.columns)

# Checking again if there is any missing value.
imputed_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Normalizing the data:

In [71]:
from sklearn import preprocessing

# Feature matrix: every imputed column apart from the 'Outcome' label.
X = imputed_data.drop(columns='Outcome')

# normalize(..., norm='l1') rescales each row (sample) so that the absolute
# values of its features sum to 1.
data_norm = preprocessing.normalize(X, norm='l1')

# Restore the column labels and append the untouched label column.
data_normalized = pd.DataFrame(data_norm, columns=X.columns).assign(
    Outcome=imputed_data['Outcome']
)

In [72]:
# Load the application (test) set that has to be scored.
test_data = pd.read_csv('diabetes_app.csv')

# Keep exactly the feature columns used for training, in the same order.
# A missing column raises KeyError here instead of silently producing a
# mis-shaped file.
test_features = test_data[X.columns]

# Normalise the TEST rows. The previous version passed the training matrix
# `X` here, so the exported file contained training data instead of the
# application set. Row-wise L1 normalisation is stateless, so nothing has to
# be fitted on the training data first.
data_test_norm = preprocessing.normalize(test_features, norm='l1')

# Restore the column labels and export without the index column.
data_test_normalized = pd.DataFrame(data_test_norm, columns=test_features.columns)
data_test_normalized.to_csv('diabetes_app_franca.csv', index=False)

## Imputation Using Random Forest Imputation (MissForest)

In [21]:
from missingpy import MissForest

# Reload the raw dataset (still containing NaNs) so this imputer starts from
# the original values.
data = pd.read_csv('diabetes_dataset.csv')

# Fit a MissForest imputer (default settings) on the whole frame and fill the
# missing entries in a single fit_transform call.
imputer = MissForest()
imputed_training = imputer.fit_transform(data)

# Wrap the imputed array in a DataFrame that reuses the original column labels.
imputed_data = pd.DataFrame(data=imputed_training, columns=data.columns)

# Sanity check: per-column count of values that are still missing.
imputed_data.isnull().sum()

Iteration: 0
Iteration: 1
Iteration: 2


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Feature importance in model

In [32]:
# PermutationImportance lives in eli5.sklearn; it was used below without ever
# being imported.
from eli5.sklearn import PermutationImportance

print('\n - Lendo o arquivo com o dataset sobre diabetes')
data = pd.read_csv('diabetes_dataset_franca.csv')

# Build X and y for the machine-learning algorithm.
print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset')
# To change which columns are considered, edit the list below.
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = data[feature_cols]
y = data.Outcome

# Build the predictive model for this dataset.
print(' - Criando modelo preditivo')
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

# Permutation importance of each feature for the fitted k-NN model.
# The previous version referenced `svc`, `X_test` and `y_test`, none of which
# are defined in the visible notebook, so the cell raised NameError on a fresh
# kernel. No hold-out split exists here, so importance is measured on the
# training data; prefer a held-out set once one is available.
# random_state pins the shuffles so the reported weights are reproducible.
perm = PermutationImportance(neigh, random_state=42).fit(X, y)

# Pass the column names explicitly so the table shows them instead of x0..x7.
eli5.show_weights(perm, feature_names=feature_cols)


 - Lendo o arquivo com o dataset sobre diabetes
 - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset
 - Criando modelo preditivo


# Correlation matrix and (currently commented-out) export of 'diabetes_dataset_franca.csv'

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the normalised frame
# (including 'Outcome'), rendered as a matrix image.
corr_matrix = data_normalized.corr()
plt.matshow(corr_matrix)
plt.show()
# data_normalized.to_csv('diabetes_dataset_franca.csv')

# Running Aydano's script to send to server:

In [74]:
# Run the external script diabetes_csv.py with python3 through IPython's `!`
# shell escape. The script's source is not part of this notebook.
!python3 diabetes_csv.py


 - Lendo o arquivo com o dataset sobre diabetes
 - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset
 - Criando modelo preditivo
 - Aplicando modelo e enviando para o servidor
 - Resposta do servidor:
 {"status":"success","dev_key":"Vov\u00f3Learn","accuracy":0.5816326530612245,"old_accuracy":0.75} 



0.74489795918367