In [209]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
import category_encoders as ce

np.random.seed(1)

In [210]:
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Students

In [211]:
data_students = pd.read_csv('Datasets/students/StudentsPerformance.csv')

In [212]:
y = data_students['gender']

In [213]:
data_students = data_students.drop(columns=['gender', 'race/ethnicity'])

In [214]:
mapping_education = {"some college":0,"associate's degree":1,"high school":2,"some high school":3,"bachelor's degree":4, "master's degree":5}
mapping_lunch = {"standard":0,"free/reduced":1}
mapping_preparation = {"none":0,"completed":1}

In [215]:
data_students["lunch"] = ordinal_encoding(data_students['lunch'],"lunch",mapping_lunch)
data_students["test preparation course"] = ordinal_encoding(data_students['test preparation course'],"test preparation course",mapping_preparation)
data_students["parental level of education"] = ordinal_encoding(data_students['parental level of education'],"parental level of education",mapping_education)

In [216]:
scaler = StandardScaler()
columns = data_students.columns
data_students = scaler.fit_transform(data_students)
data_students = pd.DataFrame(data_students, columns = columns)

In [217]:
nan_students = data_students.mask(np.random.random(data_students.shape) < .1)
mod_students = nan_students.fillna(nan_students.median())
nan_students = pd.concat([y, nan_students], axis = 1)
nan_students = nan_students.dropna()
y_nan = nan_students['gender']
nan_students = nan_students.drop(columns=['gender'])

In [218]:
X_train, X_test, y_train, y_test = train_test_split(data_students, y, test_size=0.33, random_state=0)
X_train_nan, X_test_nan, y_train_nan, y_test_nan = train_test_split(nan_students, y_nan, test_size=0.33, random_state=0)
X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(mod_students, y, test_size=0.33, random_state=0)

discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()

discriminative.fit(X_train, y_train)
generative.fit(X_train, y_train)

dis_data = discriminative.score(X_test, y_test)
gen_data = generative.score(X_test, y_test)

discriminative.fit(X_train_nan, y_train_nan)
generative.fit(X_train_nan, y_train_nan)

dis_nan = discriminative.score(X_test_nan, y_test_nan)
gen_nan = generative.score(X_test_nan, y_test_nan)

discriminative.fit(X_train_mod, y_train_mod)
generative.fit(X_train_mod, y_train_mod)

dis_mod = discriminative.score(X_test_mod, y_test_mod)
gen_mod = generative.score(X_test_mod, y_test_mod)

In [219]:
print ("Discriminativos Original/ Sacando filas con nan/ Rellenando filas con nan\n")
print (str(dis_data) + "\n" + str(dis_nan) + "\n" + str(dis_mod)+ "\n")
print (str(gen_data) + "\n" + str(gen_nan) + "\n" + str(gen_mod))

Discriminativos Original/ Sacando filas con nan/ Rellenando filas con nan

0.8878787878787879
0.8722222222222222
0.8363636363636363

0.6848484848484848
0.7944444444444444
0.696969696969697
