# Trabalho Prático - Python para Ciência de Dados

IFMG - Instituto Federal de Educação, Ciência e Tecnologia de Minas Gerais - Especialização em Inteligência Artificial  

Mat.: Programação Genética 

Prof.: Carlos Alexandre Silva

Aluno: Christhian da S. Gonçalves 

## 01 - Sobre o Conjunto de Dados

O conjunto de dados de previsão de diabetes é uma coleção de dados médicos e demográficos de pacientes, juntamente com seu status de diabetes (positivo ou negativo). Os dados incluem características como idade, sexo, índice de massa corporal (IMC), hipertensão, doenças cardíacas, histórico de tabagismo, nível de HbA1c e nível de glicose no sangue.

Link: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset

## 02 - Análise exploratória dos dados

### 2.1 - Importando Bibliotecas

In [172]:
#Importando bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import BinaryEncoder

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from random import randint
from sklearn import svm

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import tpot
from tpot import TPOTClassifier
#permite que os gráficos do Matplotlib sejam exibidos diretamente no notebook
%matplotlib inline
#import warnings #controla o tratamento de avisos (warnings) durante a execução do código
#warnings.filterwarnings("ignore") #ignora avisos durante a execução do código

### 2.2 - Importando base de dados

In [173]:
# Load the diabetes-prediction dataset from its GitHub mirror and preview the first rows.
DATASET_URL = 'https://raw.githubusercontent.com/christhiangoncalves/Machine_Learning_Genetic_Programming/main/Diabetes_Dataset/diabetes_prediction_dataset.csv'
df_diabete = pd.read_csv(DATASET_URL)
df_diabete.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


### 2.3 - Analisando dados

In [174]:
# Print each column's dtype and non-null count.
df_diabete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [175]:
# Descriptive statistics, transposed so that each dataset column becomes a row.
df_diabete.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,100000.0,41.885856,22.51684,0.08,24.0,43.0,60.0,80.0
hypertension,100000.0,0.07485,0.26315,0.0,0.0,0.0,0.0,1.0
heart_disease,100000.0,0.03942,0.194593,0.0,0.0,0.0,0.0,1.0
bmi,100000.0,27.320767,6.636783,10.01,23.63,27.32,29.58,95.69
HbA1c_level,100000.0,5.527507,1.070672,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,100000.0,138.05806,40.708136,80.0,100.0,140.0,159.0,300.0
diabetes,100000.0,0.085,0.278883,0.0,0.0,0.0,0.0,1.0


In [176]:
# Count the null values in each column.
df_diabete.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [177]:
# Count the rows that are exact duplicates of an earlier row.
df_diabete.duplicated().sum()

3854

In [178]:
# Duplicate removal is currently disabled, so the 3854 duplicated rows remain in the dataset.
#df_diabete.drop_duplicates(inplace=True)

### 2.4 Plotando Gráficos

In [179]:
# Count plots for gender, hypertension, heart disease and smoking history.
# They are disabled by wrapping the code in a string literal; being the last
# expression of the cell, the string itself is what the notebook echoes.
'''sns.countplot(x="gender", data=df_diabete)
plt.title("Gender Distribution")
plt.show()

sns.countplot(x="hypertension", data=df_diabete)
plt.title("Hypertension Distribution")
plt.show()

sns.countplot(x="heart_disease", data=df_diabete)
plt.title("Heart Disease Distribution")
plt.show()

sns.countplot(x="smoking_history", data=df_diabete)
plt.title("Smoking History Distribution")
plt.show()'''

'sns.countplot(x="gender", data=df_diabete)\nplt.title("Gender Distribution")\nplt.show()\n\nsns.countplot(x="hypertension", data=df_diabete)\nplt.title("Hypertension Distribution")\nplt.show()\n\nsns.countplot(x="heart_disease", data=df_diabete)\nplt.title("Heart Disease Distribution")\nplt.show()\n\nsns.countplot(x="smoking_history", data=df_diabete)\nplt.title("Smoking History Distribution")\nplt.show()'

In [180]:
# Integer-encode the two text columns (gender and smoking history).
le = LabelEncoder()
# Category frequencies before encoding, shown next to the encoded counts below.
df_temp = df_diabete.groupby('smoking_history')['smoking_history'].count()
for raw_column in ("gender", "smoking_history"):
    # The same encoder is refit for each column, so after the loop it only
    # holds the classes of the last column (smoking_history).
    df_diabete[raw_column + "_encoded"] = le.fit_transform(df_diabete[raw_column])

# The original text columns are dropped now that encoded versions exist.
df_diabete.drop(columns=["smoking_history", "gender"], inplace=True)

encoded_counts = df_diabete.groupby('smoking_history_encoded')['smoking_history_encoded'].count()
display(df_temp, encoded_counts)


smoking_history
No Info        35816
current         9286
ever            4004
former          9352
never          35095
not current     6447
Name: smoking_history, dtype: int64

smoking_history_encoded
0    35816
1     9286
2     4004
3     9352
4    35095
5     6447
Name: smoking_history_encoded, dtype: int64

In [181]:
# Pair plot of diabetes against age, hypertension and heart disease, disabled
# by wrapping the code in a string literal. The column lists plot1, plot2 and
# plot3 are only written inside this string, so they are never actually defined.
'''plot1 = ['diabetes','age', 'hypertension','heart_disease']
plot2 = ['diabetes','bmi','HbA1c_level','blood_glucose_level']
plot3 = ['diabetes','gender_encoded','smoking_history_encoded']
sns.pairplot(df_diabete[plot1],hue= 'diabetes')'''

"plot1 = ['diabetes','age', 'hypertension','heart_disease']\nplot2 = ['diabetes','bmi','HbA1c_level','blood_glucose_level']\nplot3 = ['diabetes','gender_encoded','smoking_history_encoded']\nsns.pairplot(df_diabete[plot1],hue= 'diabetes')"

Pode-se verificar as seguintes relações:
- Quanto maior a idade, maior a frequência de diabetes.
- Quanto maior a idade, e havendo hipertensão, maior é a incidência de diabetes.
- Quanto maior a idade, e havendo doença do coração, maior é a incidência de diabetes.

In [182]:
# Disabled pair plot for the plot2 columns; the call is kept only as a string literal.
'''sns.pairplot(df_diabete[plot2],hue= 'diabetes')'''

"sns.pairplot(df_diabete[plot2],hue= 'diabetes')"

É possível notar uma grande correlação entre os rótulos de dados acima, onde temos:
- Quanto maior a glicose e o nível de HbA1c (hemoglobina glicada) no sangue, maior o índice de diabéticos.
- O bmi (índice de massa corporal) não possui uma relação forte com a quantidade de diabéticos.

In [183]:
# Disabled pair plot for the plot3 columns; the call is kept only as a string literal.
'''sns.pairplot(df_diabete[plot3],hue= 'diabetes')'''

"sns.pairplot(df_diabete[plot3],hue= 'diabetes')"

Pode-se verificar as seguintes relações:
- O índice de diabetes é independente do gênero.
- O índice de diabetes é independente de a pessoa ser fumante ou não, porém ocorre um leve aumento de diabetes entre os ex-fumantes e os não fumantes.

In [184]:
# Correlation heatmap of all columns, disabled by wrapping the code in a string literal.
'''plt.figure(figsize=(12,8))
sns.heatmap(df_diabete.corr(),annot=True,cmap='coolwarm')
plt.title('Correlation Heatmap')'''

"plt.figure(figsize=(12,8))\nsns.heatmap(df_diabete.corr(),annot=True,cmap='coolwarm')\nplt.title('Correlation Heatmap')"

## 03 - Modelo de programação genética

In [185]:
# Display names of the baseline classifiers; entry i labels models[i].
classifiers = [
    'LinearSVM',
    'RadialSVM',
    'Logistic',
    'RandomForest',
    'AdaBoost',
    'DecisionTree',
    'KNeighbors',
    'GradientBoosting',
]

# Estimator instances, listed in the same order as `classifiers`.
models = [
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=200, random_state=0),
    AdaBoostClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    KNeighborsClassifier(),
    GradientBoostingClassifier(random_state=0),
]


def acc_score(df,label):
    """Fit every baseline model and rank them by test accuracy.

    The data is partitioned with ``split(df, label)``; each estimator in the
    module-level ``models`` list is fitted on the training part (in place, so
    the list ends up holding fitted estimators) and scored on the test part.

    Args:
        df: Feature table passed to ``split``.
        label: Target values passed to ``split``.

    Returns:
        A DataFrame with the columns "Classificador" and "Acurácia", sorted
        by accuracy in descending order and re-indexed from 0.
    """
    X_train, X_test, Y_train, Y_test = split(df, label)

    accuracies = []
    for estimator in models:
        estimator.fit(X_train, Y_train)
        accuracies.append(accuracy_score(Y_test, estimator.predict(X_test)))

    ranking = pd.DataFrame({"Classificador": classifiers, "Acurácia": accuracies})
    ranking = ranking.sort_values(by="Acurácia", ascending=False)
    return ranking.reset_index(drop=True)

def plot(score,x,y,c = "b"):
    """Draw a point plot of accuracy per generation.

    Args:
        score: Sequence of accuracies, one per generation.
        x: Lower limit of the y axis.
        y: Upper limit of the y axis.
        c: Colour passed to seaborn's ``pointplot``.
    """
    # One x position per score, numbered from 1 (previously hard-coded to 5 generations).
    gen = list(range(1, len(score) + 1))
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Geração", ylabel="Acurácia")
    ax.set(ylim=(x,y))


def initilization_of_population(size,n_feat):
    """Create the initial population of random feature masks.

    This function used to live inside a string literal, so ``generations``
    could not call it. It is a real definition again, and uses the builtin
    ``bool`` dtype because the ``np.bool`` alias no longer exists in NumPy.

    Args:
        size: Number of chromosomes to create.
        n_feat: Length of each chromosome (one gene per feature).

    Returns:
        A list of ``size`` boolean arrays of length ``n_feat``; in each one,
        ``int(0.3 * n_feat)`` genes are False and the rest True, in random
        order.
    """
    population = []
    for _ in range(size):
        chromosome = np.ones(n_feat, dtype=bool)
        # Switch off 30% of the genes, then shuffle so the disabled positions are random.
        chromosome[:int(0.3*n_feat)] = False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population


def fitness_score(population):
    """Score each chromosome by test accuracy and return both lists best-first.

    Every chromosome is used as a column selector (``.iloc[:, chromosome]``):
    ``logmodel`` is refitted on the selected training columns and its accuracy
    on the same columns of the test set becomes the chromosome's score.

    NOTE(review): ``logmodel``, ``X_train``, ``X_test``, ``Y_train`` and
    ``Y_test`` are read from module scope rather than passed in; confirm they
    are defined before this function is called.

    Args:
        population: Sequence of equally sized chromosomes.

    Returns:
        ``(scores, chromosomes)``: two lists in matching order, sorted by
        descending score.
    """
    accuracies = []
    for mask in population:
        logmodel.fit(X_train.iloc[:, mask], Y_train)
        predicted = logmodel.predict(X_test.iloc[:, mask])
        accuracies.append(accuracy_score(Y_test, predicted))

    accuracies = np.array(accuracies)
    candidates = np.array(population)
    # argsort is ascending; walking it backwards yields best-first order.
    best_first = np.argsort(accuracies)[::-1]
    return list(accuracies[best_first]), list(candidates[best_first, :])


def selection(pop_after_fit,n_parents):
    """Keep the first ``n_parents`` individuals of an already ranked population.

    Args:
        pop_after_fit: Population ordered best-first.
        n_parents: Number of leading individuals to keep.

    Returns:
        A new list holding the same chromosome objects, in the same order.
    """
    return [pop_after_fit[rank] for rank in range(n_parents)]


def crossover(pop_after_sel):
    """Extend the selected parents with one child per consecutive parent pair.

    Parents are paired as (0, 1), (2, 3), ...; each child takes the first half
    of the first parent and the second half of the second parent.

    The result is built in a new list, so the caller's list is left untouched.
    With an odd number of parents, the last one has no partner and is carried
    over without producing a child.

    Args:
        pop_after_sel: List of parent chromosomes (sliceable sequences of
            equal length).

    Returns:
        A new list containing all parents followed by the generated children.
    """
    pop_nextgen = list(pop_after_sel)
    # Stop at len - 1 so an unpaired trailing parent is never indexed as i + 1.
    for i in range(0, len(pop_after_sel) - 1, 2):
        first, second = pop_after_sel[i], pop_after_sel[i + 1]
        cut = len(first) // 2
        pop_nextgen.append(np.concatenate((first[:cut], second[cut:])))
    return pop_nextgen


def mutation(pop_after_cross,mutation_rate,n_feat):
    """Return a mutated copy of every chromosome in the population.

    For each chromosome, ``int(mutation_rate * n_feat)`` positions are drawn
    with ``randint(0, n_feat - 1)`` and the gene at each drawn position is
    negated. Positions are drawn with replacement, so a position drawn twice
    ends up back at its original value.

    Each chromosome is copied before its genes are flipped, so the input
    chromosomes (and anything else still referencing them) are not modified.

    Args:
        pop_after_cross: Sequence of chromosomes supporting ``.copy()`` and
            item assignment (lists or NumPy arrays).
        mutation_rate: Fraction of ``n_feat`` that sets the number of draws.
        n_feat: Number of genes per chromosome.

    Returns:
        A list with one mutated copy per input chromosome, in input order.
    """
    mutation_range = int(mutation_rate*n_feat)
    pop_next_gen = []
    for chromosome in pop_after_cross:
        mutated = chromosome.copy()
        for _ in range(mutation_range):
            pos = randint(0, n_feat - 1)
            mutated[pos] = not mutated[pos]
        pop_next_gen.append(mutated)
    return pop_next_gen

def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, Y_train, Y_test):
    """Run the genetic feature-selection loop and collect the best of each generation.

    Each generation is scored with ``fitness_score``, reduced to the top
    ``n_parents`` with ``selection``, extended with ``crossover`` and altered
    with ``mutation`` to form the next population.

    Args:
        df, label: Accepted but not used inside this function.
        size: Number of chromosomes in the initial population.
        n_feat: Number of genes (features) per chromosome.
        n_parents: Number of top chromosomes kept by ``selection``.
        mutation_rate: Rate forwarded to ``mutation``.
        n_gen: Number of generations to run.
        X_train, X_test, Y_train, Y_test: Accepted but not used inside this
            function; ``fitness_score`` is called with the population only.

    Returns:
        ``(best_chromo, best_score)``: the best chromosome and its score for
        every generation, in generation order.
    """
    best_chromo = []
    best_score = []
    population_nextgen = initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Melhor score da geração',i+1,':',scores[:1])  #2
        # Snapshot the winner before the population goes through selection,
        # crossover and mutation, so later in-place changes to the chromosome
        # objects cannot alter the recorded best (it must match scores[0]).
        best_chromo.append(np.copy(pop_after_fit[0]))
        best_score.append(scores[0])
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
    return best_chromo,best_score

In [186]:
def split(df,label):
    """Partition features and labels into train and test parts.

    Uses ``train_test_split`` with ``test_size=0.25`` and ``random_state=42``.

    Returns:
        The tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    return tuple(train_test_split(df, label, test_size=0.25, random_state=42))

In [187]:
# Separate the target from the features and hold out 30% of the rows for testing.
y = df_diabete['diabetes']
x = df_diabete.drop(columns=['diabetes'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Report the size of the full table (rows, then columns including the target).
n_rows, n_cols = df_diabete.shape
print("Dataset de diabetes:\n",n_rows,"Registros\n",n_cols,"Atributos")

Dataset de diabetes:
 100000 Registros
 9 Atributos


In [191]:
# Create the TPOT classifier with 2 generations and verbosity level 2.
# NOTE(review): this rebinds the name `tpot`, which the imports cell bound to
# the tpot module; the module is no longer reachable under that name afterwards.
tpot = TPOTClassifier(generations=2, verbosity=2)

In [192]:
# Run the TPOT pipeline search on the training split.
tpot.fit(x_train,y_train)

                                                                                
                                                                              
TPOT closed during evaluation in one generation.
                                                                              
                                                                              
TPOT closed prematurely. Will use the current best pipeline.
                                                                              
Best pipeline: XGBClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), learning_rate=0.1, max_depth=2, min_child_weight=14, n_estimators=100, n_jobs=1, subsample=0.35000000000000003, verbosity=0)


In [None]:
# Fit every baseline classifier on the features/target and show them ranked by test accuracy.
score1 = acc_score(x,y)
score1

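Apenas os classificadores KNeighbors e RadialSVM não apresentaram 100% de acurácia. Observação: a célula acima aparece sem saída nesta versão do notebook, portanto esse resultado vem de uma execução anterior; acurácia de 100% em quase todos os modelos costuma indicar vazamento do rótulo nos atributos e deve ser conferida antes de ser reportada.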