In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
import category_encoders as ce

np.random.seed(1)



In [16]:
#https://www.kaggle.com/klmsathishkumar/predict-your-bmi-here
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Import Data

In [17]:
data_wine = pd.read_csv('Datasets/wine/wine.csv')
data_wine = data_wine.sample(n=1000, random_state=1)
data_wine = data_wine.dropna()

In [18]:
y = data_wine['quality']

In [19]:
data_wine = data_wine.drop(columns = ['quality'])

# Preprocess

In [20]:
scaler = StandardScaler()

In [21]:
columns = data_wine.columns

In [22]:
data_wine = scaler.fit_transform(data_wine)

In [23]:
data_wine = pd.DataFrame(data_wine, columns = columns)

In [24]:
nan_wine = data_wine.mask(np.random.random(data_wine.shape) < .1)
mod_wine = nan_wine.fillna(nan_wine.median())
nan_wine = pd.concat([y, nan_wine], axis = 1)
nan_wine = nan_wine.dropna()
y_nan = nan_wine['quality']
nan_wine = nan_wine.drop(columns=['quality'])

#Split Data

In [25]:
X_train_nan, X_test_nan, y_train_nan, y_test_nan = train_test_split(nan_wine, y_nan, test_size=0.33, random_state=0)
X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(mod_wine, y, test_size=0.33, random_state=0)

# Models

In [26]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()
    
discriminative.fit(X_train_nan, y_train_nan)
generative.fit(X_train_nan, y_train_nan)
    
dis_nan = discriminative.score(X_test_nan, y_test_nan)
gen_nan = generative.score(X_test_nan, y_test_nan)

In [27]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()
    
discriminative.fit(X_train_mod, y_train_mod)
generative.fit(X_train_mod, y_train_mod)
    
dis_mod = discriminative.score(X_train_mod, y_train_mod)
gen_mod = generative.score(X_train_mod, y_train_mod)

In [28]:
print (f'Discriminativo con NaN: {dis_nan}\nDiscriminativo modificado: {dis_mod}\ndiff: {dis_mod-dis_nan}\n')
print (f'Generativo con NaN: {gen_nan}\nGenerativo modificado: {gen_mod}\ndiff: {gen_mod-gen_nan}')

Discriminativo con NaN: 0.43243243243243246
Discriminativo modificado: 0.5865671641791045
diff: 0.15413473174667203

Generativo con NaN: 0.32432432432432434
Generativo modificado: 0.5477611940298508
diff: 0.22343686970552645
