In [228]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
import category_encoders as ce

# Seed NumPy's global RNG; the random missing-value mask built later with
# np.random.random draws from this global state.
np.random.seed(43)

In [229]:
#https://www.kaggle.com/klmsathishkumar/predict-your-bmi-here
def ordinal_encoding(df,col,mapping):
    """Encode column ``col`` of ``df`` using an explicit value-to-code mapping.

    A ``category_encoders.OrdinalEncoder`` is configured for the single
    column ``col`` with the supplied ``mapping`` and is fitted and applied
    to ``df`` in one step.

    Parameters
    ----------
    df : data handed to ``OrdinalEncoder.fit_transform`` (the call sites in
        this notebook pass one column at a time).
    col : name of the column the encoder should act on.
    mapping : dict from original category value to integer code.

    Returns
    -------
    Whatever ``fit_transform`` returns; the encoder is created with
    ``return_df=True``.
    """
    column_spec = {'col': col, 'mapping': mapping}
    encoder = ce.OrdinalEncoder(
        cols=[col],
        return_df=True,
        mapping=[column_spec],
    )
    return encoder.fit_transform(df)

# Food

In [230]:
# Read the food-preference CSV and discard rows that have no Gender value.
data_food = (
    pd.read_csv('Datasets/food/Food_Preference.csv')
    .dropna(subset=['Gender'])
)
data_food

Unnamed: 0,Timestamp,Participant_ID,Gender,Nationality,Age,Food,Juice,Dessert
0,2019/05/07 2:59:13 PM GMT+8,FPS001,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,2019/05/07 2:59:45 PM GMT+8,FPS002,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,2019/05/07 3:00:05 PM GMT+8,FPS003,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,2019/05/07 3:00:11 PM GMT+8,FPS004,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,2019/05/07 3:02:50 PM GMT+8,FPS005,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...,...,...
283,2019/05/10 9:24:00 AM GMT+8,FPS284,Male,Indian,27,Western Food,Fresh Juice,Yes
284,2019/05/10 9:32:54 AM GMT+8,FPS285,Male,Indian,24,Traditional food,Fresh Juice,Yes
285,2019/05/10 12:09:17 PM GMT+8,FPS286,Male,Indian,25,Traditional food,Fresh Juice,Yes
286,2019/05/10 12:52:17 PM GMT+8,FPS287,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [231]:
# Prediction target: the Age column, captured here before it is dropped
# from the feature frame in the next cell.
y = data_food['Age']

In [232]:
# Remove the columns that are not used as features. Age is the target (already
# stored in y); Timestamp, Participant_ID and Nationality are discarded.
data_food = data_food.drop(
    ['Timestamp', 'Participant_ID', 'Age', 'Nationality'],
    axis='columns',
)

In [233]:
# Integer code assigned to each answer, one lookup table per feature column.
mapping_gender = {
    "Male": 0,
    "Female": 1,
}
mapping_food = {
    "Traditional food": 0,
    "Western Food": 1,
}
mapping_juice = {
    "Fresh Juice": 0,
    "Carbonated drinks": 1,
}
mapping_dessert = {
    "Maybe": 0,
    "Yes": 1,
    "No": 2,
}

In [234]:
# Replace each categorical column with its integer codes, one column at a
# time and in the same order as before: Gender, Food, Juice, Dessert.
column_mappings = {
    "Gender": mapping_gender,
    "Food": mapping_food,
    "Juice": mapping_juice,
    "Dessert": mapping_dessert,
}
for column_name, value_codes in column_mappings.items():
    data_food[column_name] = ordinal_encoding(
        data_food[column_name], column_name, value_codes
    )

In [235]:
# Inspect the feature frame after the categorical columns were integer-encoded.
data_food

Unnamed: 0,Gender,Food,Juice,Dessert
0,0,0,0,0
1,1,1,1,1
2,0,1,0,0
3,1,0,0,0
4,0,0,0,0
...,...,...,...,...
283,0,1,0,1
284,0,0,0,1
285,0,0,0,1
286,0,0,0,1


In [236]:
# NOTE(review): the standardisation step below is commented out, so the cells
# that follow work with the unscaled integer codes. Confirm whether this is
# intentional; if so, consider deleting the dead block.
#scaler = StandardScaler()
#columns = data_food.columns
#data_food = scaler.fit_transform(data_food)
#data_food = pd.DataFrame(data_food, columns = columns)

In [237]:
# Display the feature frame again; with the scaling cell commented out it is
# unchanged from the previous display.
data_food

Unnamed: 0,Gender,Food,Juice,Dessert
0,0,0,0,0
1,1,1,1,1
2,0,1,0,0
3,1,0,0,0
4,0,0,0,0
...,...,...,...,...
283,0,1,0,1
284,0,0,0,1
285,0,0,0,1
286,0,0,0,1


In [238]:
# NOTE(review): this scaler is instantiated but never fitted or applied in the
# cells visible here — confirm whether it is still needed.
scaler = StandardScaler()

In [239]:
# The two classifiers being compared: logistic regression as the
# discriminative model and Gaussian naive Bayes as the generative one.
discriminative = LogisticRegression(random_state=0)
generative = GaussianNB()

# Hold out 33% of the rows for evaluation; the split is pinned by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data_food,
    y,
    test_size=0.33,
    random_state=0,
)

discriminative.fit(X_train, y_train)
# Kept as the final expression so the notebook echoes the fitted estimator.
generative.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [240]:
# Evaluate the logistic-regression model on the held-out split of the original features.
discriminative.score(X_test, y_test)

0.09574468085106383

In [241]:
# Evaluate the Gaussian naive Bayes model on the same held-out split.
generative.score(X_test, y_test)

0.02127659574468085

In [242]:
# Simulate missing data: every cell is blanked independently with probability
# 0.2. The draw comes from NumPy's global RNG, so the mask depends on the RNG
# state at the moment this cell runs.
new_food = data_food.mask(np.random.random(data_food.shape) < .2)

# Impute with the per-column mode instead of the median. The columns hold
# category codes, so a median can land between two codes on a tie (e.g. 0.5
# for a binary column) or select a code purely by its arbitrary numeric
# position (Dessert: Maybe=0, Yes=1, No=2). The mode is always an observed code.
# DataFrame.mode() can return several rows when values tie; row 0 holds the
# smallest most-frequent value of each column.
new_food = new_food.fillna(new_food.mode().iloc[0])

In [243]:
# Repeat the same split (same test_size and random_state) on the masked and
# imputed features. This overwrites X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(new_food, y, test_size=0.33, random_state=0)

# Refit the two existing estimator objects on the new training data; their
# previous fits on the original features are replaced.
discriminative.fit(X_train, y_train)
generative.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [244]:
# Evaluate the refitted logistic-regression model on the held-out split of the imputed features.
discriminative.score(X_test, y_test)

0.07446808510638298

In [245]:
# Evaluate the refitted Gaussian naive Bayes model on the same held-out split.
generative.score(X_test, y_test)

0.0