In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

import category_encoders as ce

np.random.seed(1)

In [2]:
def ordinal_encoding(df,col,mapping):
    ordinal_encoder = ce.OrdinalEncoder(cols = [col],return_df = True,mapping = [{'col':col,'mapping':mapping}])
    df_final = ordinal_encoder.fit_transform(df)
    return df_final

# Students

In [3]:
data_students = pd.read_csv('Datasets/students/StudentsPerformance.csv')
data_students

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [4]:
y = data_students['gender']

In [5]:
data_students = data_students.drop(columns=['gender', 'race/ethnicity'])

In [6]:
mapping_education = {"some college":0,"associate's degree":1,"high school":2,"some high school":3,"bachelor's degree":4, "master's degree":5}
mapping_lunch = {"standard":0,"free/reduced":1}
mapping_preparation = {"none":0,"completed":1}

In [7]:
data_students["lunch"] = ordinal_encoding(data_students['lunch'],"lunch",mapping_lunch)
data_students["test preparation course"] = ordinal_encoding(data_students['test preparation course'],"test preparation course",mapping_preparation)
data_students["parental level of education"] = ordinal_encoding(data_students['parental level of education'],"parental level of education",mapping_education)

In [8]:
data_students.columns

Index(['parental level of education', 'lunch', 'test preparation course',
       'math score', 'reading score', 'writing score'],
      dtype='object')

In [9]:
scaler = StandardScaler()
columns = data_students.columns
data_students = scaler.fit_transform(data_students)
data_students = pd.DataFrame(data_students, columns = columns)

In [10]:
data_students

Unnamed: 0,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,1.372449,-0.741881,-0.746748,0.390024,0.193999,0.391492
1,-1.264341,-0.741881,1.339140,0.192076,1.427476,1.313269
2,2.031647,-0.741881,-0.746748,1.577711,1.770109,1.642475
3,-0.605143,1.347925,-0.746748,-1.259543,-0.833899,-1.583744
4,-1.264341,-0.741881,-0.746748,0.653954,0.605158,0.457333
...,...,...,...,...,...,...
995,2.031647,-0.741881,1.339140,1.445746,2.044215,1.774157
996,0.054054,1.347925,-0.746748,-0.269803,-0.970952,-0.859491
997,0.054054,1.347925,1.339140,-0.467751,0.125472,-0.201079
998,-1.264341,-0.741881,1.339140,0.126093,0.605158,0.589015


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_students, y, test_size=0.33, random_state=0)

discriminative = LogisticRegression(random_state = 0)
generative = GaussianNB()

discriminative.fit(X_train, y_train)
generative.fit(X_train, y_train)

GaussianNB()

In [12]:
discriminative.score(X_test, y_test)

0.8878787878787879

In [13]:
generative.score(X_test, y_test)

0.6848484848484848

In [14]:
new_students = data_students.mask(np.random.random(data_students.shape) < .2)
new_students = new_students.fillna(new_students.median())

In [15]:
X_train, X_test, y_train, y_test = train_test_split(new_students, y, test_size=0.33, random_state=0)

discriminative.fit(X_train, y_train)
generative.fit(X_train, y_train)

GaussianNB()

In [16]:
discriminative.score(X_test, y_test)

0.7727272727272727

In [17]:
generative.score(X_test, y_test)

0.6636363636363637