In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

import category_encoders as ce

np.random.seed(1)

In [2]:
#https://www.kaggle.com/klmsathishkumar/predict-your-bmi-here
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Imc

In [30]:
# Import dataset
data_imc = pd.read_csv('Datasets/imc/imc.csv')
data_imc

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3
...,...,...,...,...
495,Female,150,153,5
496,Female,184,121,4
497,Female,141,136,5
498,Male,150,95,5


In [31]:
# get target
y = data_imc['Index']

In [5]:
data_imc.drop(columns=['Index'])

Unnamed: 0,Gender,Height,Weight
0,Male,174,96
1,Male,189,87
2,Female,185,110
3,Female,195,104
4,Male,149,61
...,...,...,...
495,Female,150,153
496,Female,184,121
497,Female,141,136
498,Male,150,95


# Preprocessing

In [6]:
numbers = data_imc
numbers = numbers.drop(columns = ['Gender'])
columns = numbers.columns

In [32]:
#Scale Data

In [7]:
scaler = StandardScaler()

In [8]:
numbers = scaler.fit_transform(numbers)

In [9]:
numbers = pd.DataFrame(numbers, columns = columns)
numbers.head(3)

Unnamed: 0,Height,Weight,Index
0,0.247939,-0.309117,0.186157
1,1.164872,-0.587322,-1.291278
2,0.920357,0.123647,0.186157


In [33]:
# Encode Categorical Data

In [10]:
mapping = {"Male":0,"Female":1}

In [11]:
gender = ordinal_encoding(data_imc['Gender'],"Gender",mapping)
gender.head(3)

Unnamed: 0,Gender
0,0
1,0
2,1


In [34]:
# Create new dataframe with processed data
X = pd.concat([gender, numbers], axis = 1)
X.head(3)

Unnamed: 0,Gender,Height,Weight,Index
0,0,0.247939,-0.309117,0.186157
1,0,1.164872,-0.587322,-1.291278
2,1,0.920357,0.123647,0.186157


# Split Data

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Models

In [14]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()

In [15]:
discriminative.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [16]:
disc_score = discriminative.score(X_test, y_test)
disc_score

0.9878787878787879

In [17]:
generative.fit(X_train, y_train)

GaussianNB()

In [18]:
gen_score = generative.score(X_test, y_test)
gen_score

1.0

In [19]:
X_new = X.mask(np.random.random(X.shape) < .2)

In [20]:
X_new

Unnamed: 0,Gender,Height,Weight,Index
0,0.0,0.247939,,0.186157
1,,,,-1.291278
2,1.0,0.920357,0.123647,0.186157
3,1.0,1.531645,,-0.552561
4,0.0,-1.280283,,
...,...,...,...,...
495,,-1.219155,1.452850,0.924875
496,1.0,0.859228,0.463676,0.186157
497,1.0,,0.927351,0.924875
498,0.0,-1.219155,-0.340029,0.924875


In [21]:
X_new.median()

Gender    1.000000
Height   -0.057706
Weight    0.030912
Index     0.186157
dtype: float64

In [22]:
X_new= X_new.fillna(X_new.median())
X_new

Unnamed: 0,Gender,Height,Weight,Index
0,0.0,0.247939,0.030912,0.186157
1,1.0,-0.057706,0.030912,-1.291278
2,1.0,0.920357,0.123647,0.186157
3,1.0,1.531645,0.030912,-0.552561
4,0.0,-1.280283,0.030912,0.186157
...,...,...,...,...
495,1.0,-1.219155,1.452850,0.924875
496,1.0,0.859228,0.463676,0.186157
497,1.0,-0.057706,0.927351,0.924875
498,0.0,-1.219155,-0.340029,0.924875


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=0)

In [24]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()

In [25]:
discriminative.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [26]:
new_disc_score = discriminative.score(X_test, y_test)
new_disc_score

0.793939393939394

In [27]:
generative.fit(X_train, y_train)

GaussianNB()

In [28]:
new_gen_score = generative.score(X_test, y_test)
new_gen_score

0.8606060606060606

# Results

In [29]:
print(f'Diferencia de rendimientos\nDiscriminative: {round(new_disc_score-disc_score, 3)}\nGenerative: {round(new_gen_score-gen_score, 3)}')

Diferencia de rendimientos
Discriminative: -0.194
Generative: -0.139
