# Data cleaning Notebook

### Libraries

In [1]:
import pandas as pd
import numpy as np

### Data import

In [2]:
# Read the raw diabetes dataset from disk and print a summary of its
# columns, non-null counts and dtypes.
ruta = 'data/diabetes_prediction_dataset.csv'
data = pd.read_csv(filepath_or_buffer=ruta)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [3]:
# Column order used for the rest of the notebook: the two text columns
# first, then the remaining columns in their original relative order.
cols = [
    'gender', 'smoking_history',
    'age', 'hypertension', 'heart_disease',
    'bmi', 'HbA1c_level', 'blood_glucose_level',
    'diabetes',
]
data = data.loc[:, cols]

# Report how many distinct values each column holds.
for col, n_levels in data.nunique().items():
    print(f'Columna {col}:{n_levels} subniveles')

Columna gender:3 subniveles
Columna smoking_history:6 subniveles
Columna age:102 subniveles
Columna hypertension:2 subniveles
Columna heart_disease:2 subniveles
Columna bmi:4247 subniveles
Columna HbA1c_level:18 subniveles
Columna blood_glucose_level:18 subniveles
Columna diabetes:2 subniveles


In [4]:
# Print the distinct values of every selected column so the categories
# and numeric ranges can be inspected before cleaning.
for col in cols:
    trycol = pd.unique(data[col])
    print(trycol)

['Female' 'Male' 'Other']
['never' 'No Info' 'current' 'former' 'ever' 'not current']
[80.   54.   28.   36.   76.   20.   44.   79.   42.   32.   53.   78.
 67.   15.   37.   40.    5.   69.   72.    4.   30.   45.   43.   50.
 41.   26.   34.   73.   77.   66.   29.   60.   38.    3.   57.   74.
 19.   46.   21.   59.   27.   13.   56.    2.    7.   11.    6.   55.
  9.   62.   47.   12.   68.   75.   22.   58.   18.   24.   17.   25.
  0.08 33.   16.   61.   31.    8.   49.   39.   65.   14.   70.    0.56
 48.   51.   71.    0.88 64.   63.   52.    0.16 10.   35.   23.    0.64
  1.16  1.64  0.72  1.88  1.32  0.8   1.24  1.    1.8   0.48  1.56  1.08
  0.24  1.4   0.4   0.32  1.72  1.48]
[0 1]
[1 0]
[25.19 27.32 23.45 ... 59.42 44.39 60.52]
[6.6 5.7 5.  4.8 6.5 6.1 6.  5.8 3.5 6.2 4.  4.5 9.  7.  8.8 8.2 7.5 6.8]
[140  80 158 155  85 200 145 100 130 160 126 159  90 260 220 300 280 240]
[0 1]


### Patterns

Let's proceed to define the ranges with which we are going to establish the patterns.

*   Age: 25 - 80
*   Gender: Male & Female
*   bmi: 6 different categories (check web)
*   smoking_history: 4 different categories (never, current, former, ever)

In [5]:
# Smoking_history
# Integer code assigned to each smoking category that is kept.
smoking_codes = {'never': 0, 'current': 1, 'former': 2, 'ever': 3}

# 'not current' is folded into 'former'.
data['smoking_history'] = data['smoking_history'].str.replace('not current','former', regex = False)

# Discard rows with unknown smoking history and rows whose gender is 'Other'.
# One boolean mask replaces the two drop-by-index-label passes.
keep_rows = (data['smoking_history'] != 'No Info') & (data['gender'] != 'Other')
data = data.loc[keep_rows].copy()

# Encode with an explicit mapping and an explicit integer dtype. Chained
# .replace(str, int) calls on an object column only produce an integer column
# through implicit downcasting, which newer pandas versions deprecate.
# A category missing from the mapping becomes NaN and makes astype() raise,
# so unexpected values fail loudly instead of slipping through.
data['smoking_history'] = data['smoking_history'].map(smoking_codes).astype('int64')

trycol = data['smoking_history'].unique()
print(trycol)

[0 1 2 3]


In [6]:
# Age
# Keep only rows whose age lies in the closed range [25, 80].
age_in_range = data['age'].between(25, 80)
data = data.loc[age_in_range]

In [7]:
# BMI
# Bin edges for the six BMI classes. Each class is the right-closed interval
# (lower edge, upper edge], i.e. the same "> lower & <= upper" boundaries as
# before: <=18.5 -> 0, (18.5, 24.9] -> 1, (24.9, 29.9] -> 2,
# (29.9, 34.9] -> 3, (34.9, 39.9] -> 4, >39.9 -> 5.
# The last class is open-ended so a BMI above 99.9 is still binned instead of
# being left in the column as a raw, un-encoded value.
bmi_edges = [-np.inf, 18.5, 24.9, 29.9, 34.9, 39.9, np.inf]

# labels=False makes pd.cut return the position of the bin (0-5) in a single
# pass, which removes the need for temporary sentinel codes.
# The cast keeps the column as float64, the dtype this step produced before.
data['bmi'] = pd.cut(data['bmi'], bins=bmi_edges, labels=False).astype('float64')

trycol = data['bmi'].unique()
print(trycol)

[2. 1. 3. 5. 4. 0.]


In [8]:
# Gender
# Integer code for each gender that is kept; rows with gender 'Other' are
# dropped earlier in the notebook.
gender_codes = {'Male': 0, 'Female': 1}

# Explicit mapping plus explicit dtype: .replace(str, int) on an object column
# only yields an integer column through implicit downcasting, which newer
# pandas versions deprecate. Any value missing from the mapping becomes NaN
# and makes astype() raise rather than passing through unnoticed.
data['gender'] = data['gender'].map(gender_codes).astype('int64')

Ranges where it is 100% certain that you have diabetes.
* BGL > 160
* HbA1c > 6.5

The other values are not indicative of diabetes


Targets are encoded as -1 / +1 for an ADALINE neuron in the output layer.

In [9]:
# Class balance of the 'diabetes' column as it stands at this point.
count = data.loc[:, 'diabetes'].value_counts()
print(f'Conteo de valores en la columna: \n{count}')

Conteo de valores en la columna: 
diabetes
0    47191
1     6947
Name: count, dtype: int64


In [10]:
# Rows with a blood glucose level above 126 or an HbA1c level above 6.5 are
# labelled as diabetic regardless of their original label.
# NOTE(review): the notebook prose lists "BGL > 160" while this cell uses 126
# - confirm which threshold is intended.
BGL_condicion = data['blood_glucose_level'].gt(126) | data['HbA1c_level'].gt(6.5)

# Force the label to 1 where the condition holds, then recode the remaining
# negatives from 0 to -1.
data['diabetes'] = data['diabetes'].mask(BGL_condicion, 1).replace({0: -1})
print(data['diabetes'].unique())

[ 1 -1]


We check that the data are not biased towards a single class.

In [11]:
# Class balance of the 'diabetes' column after relabelling.
count = data.loc[:, 'diabetes'].value_counts()
print(f'Conteo de valores en la columna: \n{count}')

Conteo de valores en la columna: 
diabetes
 1    37631
-1    16507
Name: count, dtype: int64


In [12]:
# Number of distinct values per column after cleaning.
# The per-column unique() array was computed but never used, so it is dropped.
for col in cols:
    print(f'Columna {col}:{data[col].nunique()} subniveles')

Columna gender:2 subniveles
Columna smoking_history:4 subniveles
Columna age:56 subniveles
Columna hypertension:2 subniveles
Columna heart_disease:2 subniveles
Columna bmi:6 subniveles
Columna HbA1c_level:18 subniveles
Columna blood_glucose_level:18 subniveles
Columna diabetes:2 subniveles


In [13]:
# Print a summary of the cleaned frame: row count, per-column non-null
# counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54138 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               54138 non-null  int64  
 1   smoking_history      54138 non-null  int64  
 2   age                  54138 non-null  float64
 3   hypertension         54138 non-null  int64  
 4   heart_disease        54138 non-null  int64  
 5   bmi                  54138 non-null  float64
 6   HbA1c_level          54138 non-null  float64
 7   blood_glucose_level  54138 non-null  int64  
 8   diabetes             54138 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 4.1 MB


### Data exportation

In case fewer patterns are required, a random sample of 2000 rows is drawn.

In [14]:
# Draw a random subset of 2000 rows. A fixed random_state makes the draw
# repeatable, so a fresh "Restart & Run All" selects the same rows and the
# exported files do not change from run to run.
data_clean = data.sample(n=2000, random_state=42)

# Class balance inside the sample.
count = data_clean['diabetes'].value_counts()
print(f'Conteo de valores en la columna: {count}')

Conteo de valores en la columna: diabetes
 1    1430
-1     570
Name: count, dtype: int64


The data must be normalized before it is fed to the neuron.

The per-column maximum values printed below are the ones that will be used to scale the input entered through the GUI.

In [15]:
# Scale every column of the sample by that column's own maximum and print the
# maximum that was used.
for col in cols:
    # Series.max() skips missing values; the previous unique().max() went
    # through a NumPy array, where a single NaN turns the maximum into NaN
    # and therefore the whole scaled column into NaN.
    max_value = data_clean[col].max()
    print(max_value)

    # A column whose maximum is 0 would be divided by zero; dividing by 1
    # instead leaves its values unchanged while still converting to float.
    divisor = max_value if max_value != 0 else 1
    data_clean[col] = data_clean[col] / divisor

1
3
80.0
1
1
5.0
9.0
300
1


In [16]:
# Print the maximum of every column after scaling, then a summary of the
# normalized frame.
for col, column in data_clean[cols].items():
    max_value = column.unique().max()
    print(max_value)

data_clean.info()

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 10412 to 4673
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               2000 non-null   float64
 1   smoking_history      2000 non-null   float64
 2   age                  2000 non-null   float64
 3   hypertension         2000 non-null   float64
 4   heart_disease        2000 non-null   float64
 5   bmi                  2000 non-null   float64
 6   HbA1c_level          2000 non-null   float64
 7   blood_glucose_level  2000 non-null   float64
 8   diabetes             2000 non-null   float64
dtypes: float64(9)
memory usage: 156.2 KB


In [None]:
# 70 % of the sampled rows go to training; a fixed random_state keeps the
# split identical between runs.
training = data_clean.sample(frac=0.7, random_state=42)

# Everything that was not selected for training becomes the test set.
testing = data_clean.drop(training.index)

In [None]:
# Destination files for the two splits.
ruta1 = 'data/training.csv'
ruta2 = 'data/testing.csv'

# Write each split to its own CSV file.
# NOTE(review): the frame index is written as an extra first column -
# confirm that whatever reads these files expects it.
for frame, destination in ((training, ruta1), (testing, ruta2)):
    frame.to_csv(destination)