In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
import category_encoders as ce

np.random.seed(1)

In [2]:
#https://www.kaggle.com/klmsathishkumar/predict-your-bmi-here
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Import Data

In [3]:
data_wine = pd.read_csv('Datasets/wine/wine.csv')
data_wine = data_wine.sample(n=1000, random_state=1)
data_wine = data_wine.dropna()
data_wine.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
75,8.8,0.41,0.64,2.2,0.093,9.0,42.0,0.9986,3.54,0.66,10.5,5


In [4]:
y = data_wine['quality']
y.head(1)

75    5
Name: quality, dtype: int64

In [5]:
data_wine = data_wine.drop(columns = ['quality'])
data_wine.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
75,8.8,0.41,0.64,2.2,0.093,9.0,42.0,0.9986,3.54,0.66,10.5


# Preprocess

In [6]:
scaler = StandardScaler()

In [7]:
columns = data_wine.columns

In [8]:
data_wine = scaler.fit_transform(data_wine)

In [9]:
data_wine = pd.DataFrame(data_wine, columns = columns)
data_wine.head(1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.26152,-0.651975,1.878216,-0.240903,0.068525,-0.658969,-0.140598,0.968183,1.563748,-0.002718,0.103766


In [10]:
data_wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

# Split Data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_wine, y, test_size=0.33, random_state=0)

In [12]:
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
863,0.548842,-0.935282,0.593843,-0.744961,-0.174670,2.393802,0.263553,-0.140963,1.099373,0.402142,0.480698
748,-0.542980,0.764561,-1.409779,-0.528936,-0.268206,-0.277373,-0.202775,0.541588,1.364730,-0.581091,-0.932797
64,0.721234,2.011113,0.028719,-0.096887,-0.062426,1.344412,3.123698,0.594913,-0.426428,-0.754602,-0.932797
798,-0.428052,2.407743,-1.255654,-0.384920,0.012403,-0.849767,-0.824546,-0.471573,-0.691785,-0.581091,-0.744331
52,1.927985,-1.388573,1.004842,-0.600944,-0.661059,0.104224,-0.451483,0.040340,-1.023481,0.055119,0.197999
...,...,...,...,...,...,...,...,...,...,...,...
835,-0.255659,0.254608,0.182843,-0.384920,-0.193377,-0.563570,1.009678,-0.007652,-0.094732,0.170793,-0.838564
192,-0.428052,-0.935282,0.182843,-0.600944,-0.193377,0.962815,0.574438,0.200313,1.033034,-0.234067,-0.838564
629,-0.255659,2.917696,-1.152905,-0.240903,-0.099841,-0.277373,-0.544749,-0.471573,-0.559107,-0.638928,-0.461632
559,-0.830302,-0.198683,-0.433656,-0.600944,-0.193377,-0.277373,-0.638014,-0.700868,0.502320,-0.118393,-0.555865


# Models

In [13]:
discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()

In [14]:
discriminative.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [15]:
first_dis = discriminative.score(X_test, y_test)
first_dis

0.5787878787878787

In [16]:
generative.fit(X_train, y_train)

GaussianNB()

In [17]:
first_gen = generative.score(X_test, y_test)
first_gen

0.5363636363636364

In [18]:
new_wine = data_wine.mask(np.random.random(data_wine.shape) < .2)
new_wine = new_wine.fillna(new_wine.mean())

In [19]:
X_train, X_test, y_train, y_test = train_test_split(new_wine, y, test_size=0.33, random_state=0)

In [20]:
discriminative.fit(X_train, y_train)
generative.fit(X_train, y_train)

GaussianNB()

In [21]:
second_dis = discriminative.score(X_test, y_test)
second_dis

0.5515151515151515

In [22]:
second_gen = generative.score(X_test, y_test)
second_gen

0.503030303030303

In [23]:
print("Disminucion entre discriminativo: " + str(second_dis-first_dis) + "\n")
print("Disminucion entre generativos: " + str(second_gen-first_gen) + "\n")

Disminucion entre discriminativo: -0.027272727272727226

Disminucion entre generativos: -0.033333333333333326

