# Atividade 01

Equipe: ehtudoifelse.


Integrantes: Felipe Vasconcelos; Taigo Italo

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import requests

In [None]:
# Data: load the labelled diabetes dataset.
data = pd.read_csv('../diabetes_dataset.csv')

# Build the feature matrix X and the target vector y for the learning algorithm.
print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset')

# To change which columns are considered, edit the list below.
feature_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = data.loc[:, feature_cols]
y = data['Outcome']

In [3]:
def fillna_linear_reg(model, x):
    """Fill a missing ``SkinThickness`` in one row with a prediction from BMI.

    Args:
        model: Fitted regressor whose ``predict`` accepts a single-feature
            2-D input (``[[bmi]]``) and returns a sequence of predictions.
        x: Row-like mapping (e.g. a ``pandas.Series``) holding ``'BMI'`` and
            ``'SkinThickness'`` entries. It is modified in place.

    Returns:
        The same ``x`` object. ``SkinThickness`` is replaced only when it is
        missing and ``BMI`` is available; in every other case the row is
        returned untouched.

    Raises:
        Whatever ``model.predict`` raises. Errors are no longer swallowed, so
        a misconfigured model is reported instead of silently skipping rows.
    """
    if not pd.isna(x['SkinThickness']):
        return x

    if pd.isna(x['BMI']):
        # Without the predictor there is nothing to estimate from; keep the
        # gap so a later fill step (median/mean) can handle it.
        return x

    x['SkinThickness'] = model.predict([[x['BMI']]])[0]
    return x

def clusteringBloodPressure(x):
    """Map a blood-pressure reading to a bucket index in 0..3.

    Buckets are ``x < 60`` -> 0, ``x < 70`` -> 1, ``x < 90`` -> 2 and
    everything else (including values that compare false, such as NaN) -> 3.
    """
    for bucket, upper_bound in enumerate((60, 70, 90)):
        if x < upper_bound:
            return bucket
    return 3
    
def clusteringAge(x):
    """Map an age to a bucket index in 0..3.

    Buckets are ``x < 25`` -> 0, ``x < 40`` -> 1, ``x < 55`` -> 2 and
    everything else (including values that compare false, such as NaN) -> 3.
    """
    for bucket, upper_bound in enumerate((25, 40, 55)):
        if x < upper_bound:
            return bucket
    return 3
    
def clusteringGlucose(x):
    """Map a glucose reading to a bucket index in 0..3.

    Buckets are ``x < 70`` -> 0, ``x < 100`` -> 1, ``x < 126`` -> 2 and
    everything else (including values that compare false, such as NaN) -> 3.
    """
    for bucket, upper_bound in enumerate((70, 100, 126)):
        if x < upper_bound:
            return bucket
    return 3

def preprocessing_cluster_step(settings, data, feature_cols):
    """Replace selected numeric columns by their bucket index.

    A column is bucketed only when its switch in ``settings`` is truthy and
    the column is listed in ``feature_cols``.

    Args:
        settings: Mapping with the keys ``'AgeClustering'``,
            ``'BloodPressureClustering'`` and ``'GlucoseClustering'``.
        data: DataFrame to transform. It is modified in place.
        feature_cols: Names of the columns used as features.

    Returns:
        The same ``data`` object that was passed in.
    """
    def _enabled(flag, column):
        # The switch is read first, so a missing settings key still raises.
        return settings[flag] and column in feature_cols

    if _enabled('AgeClustering', 'Age'):
        data['Age'] = data['Age'].apply(clusteringAge)

    if _enabled('BloodPressureClustering', 'BloodPressure'):
        data['BloodPressure'] = data['BloodPressure'].apply(clusteringBloodPressure)

    if _enabled('GlucoseClustering', 'Glucose'):
        data['Glucose'] = data['Glucose'].apply(clusteringGlucose)

    return data


def _fill_skin_thickness_by_regression(frame):
    """Impute missing SkinThickness in ``frame`` from BMI with a linear fit on its complete rows."""
    complete = frame[['BMI', 'SkinThickness']].dropna()
    if complete.empty:
        # Nothing to fit on; leave the gaps for the median/mean steps.
        return frame

    model = LinearRegression()
    model.fit(complete['BMI'].to_numpy().reshape(-1, 1), complete['SkinThickness'])

    # Only rows with a known BMI can be predicted; the others stay missing.
    fillable = frame['SkinThickness'].isna() & frame['BMI'].notna()
    if fillable.any():
        bmi = frame.loc[fillable, 'BMI'].to_numpy().reshape(-1, 1)
        frame.loc[fillable, 'SkinThickness'] = model.predict(bmi)
    return frame


def _fill_missing_with(frame, columns, statistic):
    """Fill missing values of each column with that column's ``statistic`` ('median' or 'mean')."""
    for column in columns:
        fill_value = getattr(frame[column], statistic)()
        # Assign the result back instead of using a chained ``inplace=True``
        # call, which does not update ``frame`` under pandas Copy-on-Write.
        frame[column] = frame[column].fillna(fill_value)
    return frame


def preprocessing_fill_step(settings, data, feature_cols, outcomeCol):
    """Impute missing feature values separately for each outcome class.

    The steps run in this order, each only if enabled in ``settings``:
    regression fill of ``SkinThickness`` from ``BMI``, median fill, mean fill.
    Because the median fill runs first, the mean fill only affects values the
    median fill could not replace.

    Args:
        settings: Mapping with the keys ``'SkinThicknessLRFill'``,
            ``'MedianFill'`` and ``'MeanFill'``.
        data: DataFrame holding ``feature_cols`` and ``outcomeCol``. The
            regression fill additionally reads the ``'BMI'`` column. The
            input frame is not modified.
        feature_cols: Names of the columns to impute.
        outcomeCol: Name of the class column; only rows whose value equals
            0 or 1 are kept.

    Returns:
        A new DataFrame with the class-0 rows followed by the class-1 rows,
        each keeping its original index labels.
    """
    groups = [
        data[data[outcomeCol] == 0].copy(),
        data[data[outcomeCol] == 1].copy(),
    ]

    if settings['SkinThicknessLRFill'] and 'SkinThickness' in feature_cols:
        groups = [_fill_skin_thickness_by_regression(group) for group in groups]

    if settings['MedianFill']:
        groups = [_fill_missing_with(group, feature_cols, 'median') for group in groups]

    if settings['MeanFill']:
        groups = [_fill_missing_with(group, feature_cols, 'mean') for group in groups]

    return pd.concat(groups, axis=0)


def preprocessing_train(settings, data, feature_cols, norm_cols, zscore_params=None, outcomeCol='Outcome', outlierZscore=3):
    """Run the training-side preprocessing pipeline and fit the z-score parameters.

    The pipeline is: per-class missing-value fill, optional z-score
    normalisation, optional bucketing of selected columns.

    Args:
        settings: Mapping of pipeline switches; ``'ZScoreNormalization'`` is
            read here, the remaining keys by the fill and cluster steps.
        data: DataFrame containing ``feature_cols`` and ``outcomeCol``. Only a
            copy of those columns is processed; the input is not modified.
        feature_cols: Names of the feature columns.
        norm_cols: Candidate columns for z-score normalisation; only those
            also present in ``feature_cols`` are normalised.
        zscore_params: Ignored. Kept for backward compatibility; the
            parameters are always recomputed from ``data``.
        outcomeCol: Name of the class column.
        outlierZscore: Currently unused.

    Returns:
        A tuple ``(processed, params)`` where ``params`` maps each normalised
        column to ``{"mean": ..., "std": ...}`` as applied to ``processed``.
    """
    data = preprocessing_fill_step(settings, data[feature_cols + [outcomeCol]].copy(), feature_cols, outcomeCol)

    # Always start from a fresh dict: the incoming argument is not used, and
    # a shared mutable default must never be handed back to the caller.
    fitted_params = {}
    if settings['ZScoreNormalization']:
        for column in [name for name in norm_cols if name in feature_cols]:
            mean = data[column].mean()
            std = data[column].std()
            if pd.isna(std) or std == 0:
                # A constant (or single-row) column would otherwise become
                # NaN/inf after the division; scale by 1 so it maps to zeros.
                std = 1.0
            fitted_params[column] = {"mean": mean, "std": std}
            data[column] = (data[column] - mean) / std

    data = preprocessing_cluster_step(settings, data, feature_cols)
    return data, fitted_params

def preprocessing_test(settings, data, feature_cols, norm_cols, zscore_params):
    """Apply the already-fitted preprocessing to unlabelled data.

    Args:
        settings: Mapping of pipeline switches; ``'ZScoreNormalization'`` is
            read here, the clustering switches by the cluster step.
        data: DataFrame to transform. It is modified in place.
        feature_cols: Names of the feature columns.
        norm_cols: Candidate columns for z-score normalisation; only those
            also present in ``feature_cols`` are normalised.
        zscore_params: Mapping ``column -> {"mean": ..., "std": ...}`` with an
            entry for every column that gets normalised.

    Returns:
        The result of the cluster step applied to ``data``.
    """
    if settings['ZScoreNormalization']:
        to_normalise = [name for name in norm_cols if name in feature_cols]
        for name in to_normalise:
            stats = zscore_params[name]
            centred = data[name] - stats["mean"]
            data[name] = centred / stats["std"]

    return preprocessing_cluster_step(settings, data, feature_cols)

In [4]:
data = pd.read_csv('diabetes_dataset.csv')

# Pipeline switches, listed in the order the steps are executed.
settings = dict(
    # Missing-value handling
    SkinThicknessLRFill=True,
    MedianFill=True,
    MeanFill=False,
    # Scaling
    ZScoreNormalization=True,
    # Bucketing
    AgeClustering=False,
    BloodPressureClustering=True,
    GlucoseClustering=False,
)

feature_cols = ['BMI', 'Glucose', 'DiabetesPedigreeFunction', 'Age', 'BloodPressure', 'SkinThickness']

# Columns that are bucketed must not be z-scored, so drop them from the
# normalisation candidates.
bucketed_cols = set()
if settings["GlucoseClustering"]:
    bucketed_cols.add('Glucose')
if settings["BloodPressureClustering"]:
    bucketed_cols.add('BloodPressure')
norm_cols = [
    name
    for name in ['Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'SkinThickness', 'Insulin']
    if name not in bucketed_cols
]

data, zscore_params = preprocessing_train(settings, data, feature_cols, norm_cols)

# Keep only fully populated rows for training.
selected = data[feature_cols + ["Outcome"]].copy().dropna()
X = selected[feature_cols]
y = selected['Outcome']

In [5]:
# Build the predictive model for the prepared dataset.
print(' - Criando modelo preditivo')
neigh = KNeighborsClassifier(n_neighbors=3)

# Keys are the number of folds (as strings); values are per-fold accuracy arrays.
scores = {f'{n}': cross_val_score(neigh, X, y, cv=n, scoring="accuracy") for n in [2, 3, 5, 10, 20]}

neigh.fit(X, y)
# NOTE: key '1' is the accuracy on the training data itself, not a
# cross-validation result. score() returns a scalar, which may be a plain
# Python float without .mean()/.std(); wrap it in a one-element array so the
# report loop below works for every entry.
scores['1'] = np.atleast_1d(neigh.score(X, y))

for k in sorted(scores, key=int):
    print(f'k{k} cross validation: {scores[k].mean():.3f} +/- {scores[k].std():.3f}, between {scores[k].max():.3f} and {scores[k].min():.3f}')


 - Criando modelo preditivo
k1 cross validation: 0.841 +/- 0.000, between 0.841 and 0.841
k2 cross validation: 0.713 +/- 0.003, between 0.717 and 0.710
k3 cross validation: 0.689 +/- 0.035, between 0.738 and 0.663
k5 cross validation: 0.690 +/- 0.036, between 0.757 and 0.649
k10 cross validation: 0.683 +/- 0.041, between 0.741 and 0.596
k20 cross validation: 0.671 +/- 0.074, between 0.793 and 0.517


In [7]:
# Predict the outcome for the unlabelled application file.
data_app = pd.read_csv('diabetes_app.csv')
# preprocessing_test transforms data_app in place and returns that same
# frame, so `processed` and `data_app` refer to the same processed data.
processed = preprocessing_test(settings, data_app, feature_cols, norm_cols, zscore_params)
y_pred = neigh.predict(processed[feature_cols])

#### Show the decision boundary when exactly two features are used

In [None]:
if len(feature_cols) == 2:
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    first_col, second_col = feature_cols

    cmap_light = ListedColormap(['mistyrose', 'lavender'])   # background: predicted region
    cmap_bold = ListedColormap(['red', 'blue'])              # training points
    cmap_pred = ListedColormap(['deeppink', 'deepskyblue'])  # predicted application points

    # Evaluate the classifier on a regular mesh over
    # [x_min, x_max] x [y_min, y_max] so each mesh point can be coloured by class.
    h = 0.05
    x_min, x_max = X[first_col].min() - 1, X[first_col].max() + 1
    y_min, y_max = X[second_col].min() - 1, X[second_col].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = neigh.predict(mesh_points).reshape(xx.shape)

    # Draw the coloured regions, then both point sets on top.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax.scatter(X[first_col], X[second_col], c=y, s=5, cmap=cmap_bold)
    ax.scatter(processed[first_col], processed[second_col], s=5, c=y_pred, cmap=cmap_pred)
    ax.set_xlabel(first_col)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylabel(second_col)
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title("classification")
    plt.show()

#### Send the predictions to the evaluation server

In [8]:
# Display the class labels predicted for the rows of diabetes_app.csv.
y_pred

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
      dtype=int64)

In [9]:
URL = "https://aydanomachado.com/mlclass/01_Preprocessing.php"
DEV_KEY = "ehtudoifelse"
SEND_IT = True  # set to False to run the notebook without submitting
REQUEST_TIMEOUT = 30  # seconds; without a timeout the request can block forever

if SEND_IT:
    # The server expects the team key and the predictions as a JSON array.
    payload = {'dev_key': DEV_KEY, 'predictions': pd.Series(y_pred).to_json(orient='values')}
    r = requests.post(url=URL, data=payload, timeout=REQUEST_TIMEOUT)
    print(" - Resposta do servidor:\n", r.text, "\n")

 - Resposta do servidor:
 {"status":"success","dev_key":"ehtudoifelse","accuracy":0.6224489795918368,"old_accuracy":0.62244897959184} 

