# Pre-requisites

In [23]:
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
from sklearn import preprocessing

# Ingestion

In [25]:
df = pd.read_csv('data/exams.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       100 non-null    object
 1   race/ethnicity               100 non-null    object
 2   parental level of education  100 non-null    object
 3   lunch                        100 non-null    object
 4   test preparation course      100 non-null    object
 5   math score                   100 non-null    int64 
 6   reading score                100 non-null    int64 
 7   writing score                100 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 6.4+ KB


In [26]:
df.sample(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
88,female,group B,some high school,free/reduced,completed,53,69,65
41,female,group D,some high school,free/reduced,none,32,53,45
60,female,group C,bachelor's degree,free/reduced,none,47,71,62
66,female,group D,high school,standard,none,62,73,69
4,female,group B,bachelor's degree,standard,completed,67,76,80


# Preprocessing

In [27]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,66.73,69.98,69.14
std,15.631395,13.732642,14.886792
min,18.0,25.0,20.0
25%,58.0,61.0,62.0
50%,69.0,71.5,69.0
75%,78.25,80.0,81.0
max,96.0,94.0,93.0


## Standardizing numerical features

In [28]:
# standardize variables to have 0 mean and unit variance
df['math score'] = preprocessing.scale(df['math score'])
df['reading score'] = preprocessing.scale(df['reading score'])
df['writing score'] = preprocessing.scale(df['writing score'])

In [29]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,100.0,100.0,100.0
mean,-2.753353e-16,-3.004541e-16,-4.912737e-17
std,1.005038,1.005038,1.005038
min,-3.133149,-3.291909,-3.317542
25%,-0.561305,-0.6572107,-0.482036
50%,0.1459522,0.1112428,-0.009451687
75%,0.7406911,0.7333242,0.8006929
max,1.881947,1.757929,1.610838


In [30]:
df.sample(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
24,female,group D,high school,standard,none,-0.046936,-0.071722,-0.144476
96,male,group C,associate's degree,standard,none,-0.497009,-0.949955,-1.427205
91,male,group D,some high school,standard,none,0.210248,-0.876769,-0.819596
73,female,group E,associate's degree,free/reduced,completed,-0.304121,-0.510839,0.05806
4,female,group B,bachelor's degree,standard,completed,0.01736,0.44058,0.733181


## Encoding categorical features

### ordinal values that have an intrisic order

In [31]:
df['parental level of education'].value_counts()

parental level of education
some college          25
associate's degree    21
some high school      18
high school           16
bachelor's degree     13
master's degree        7
Name: count, dtype: int64

In [32]:
df['parental level of education'].unique()

array(["associate's degree", 'some college', 'high school',
       "bachelor's degree", 'some high school', "master's degree"],
      dtype=object)

In [33]:
listEducationLevels = ['some high school','high school',
                       'some college', "associate's degree",
                       "bachelor's degree", "master's degree"]

In [34]:
# label encoding is best for ordinal variables with multiple categories
encoder = preprocessing.LabelEncoder()

In [35]:
# fitting the encoder with an exhaustive array of values from 'parental level of education'
encoder = encoder.fit(listEducationLevels)

In [36]:
encoder.classes_

array(["associate's degree", "bachelor's degree", 'high school',
       "master's degree", 'some college', 'some high school'],
      dtype='<U18')

In [37]:
# transforming the feature
df['parental level of education'] = encoder.transform(df['parental level of education'].astype('str'))

In [38]:
df['parental level of education'].value_counts()

parental level of education
4    25
0    21
5    18
2    16
1    13
3     7
Name: count, dtype: int64

In [39]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group E,0,standard,none,0.210248,0.44058,-0.009452
1,female,group C,4,standard,none,-1.268562,-1.315885,-1.292181
2,male,group E,2,standard,none,0.531729,0.147836,-0.076964
3,female,group B,4,free/reduced,completed,-1.46145,-1.315885,-1.022132
4,female,group B,1,standard,completed,0.01736,0.44058,0.733181


### categorical values that have NO order

In [None]:
# default dtype is bool
df = pd.get_dummies(df, columns=['gender', 'race/ethnicity', 'lunch', 'test preparation course'], dtype='int')

In [41]:
df.columns

Index(['parental level of education', 'math score', 'reading score',
       'writing score', 'gender_female', 'gender_male',
       'race/ethnicity_group A', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E', 'lunch_free/reduced', 'lunch_standard',
       'test preparation course_completed', 'test preparation course_none'],
      dtype='object')

In [43]:
df.sample(5)

Unnamed: 0,parental level of education,math score,reading score,writing score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
15,3,1.17469,0.660138,1.070741,0,1,0,0,1,0,0,0,1,1,0
5,4,-1.075674,-1.315885,-1.022132,1,0,0,1,0,0,0,1,0,0,1
7,4,-0.239824,0.513766,0.598157,1,0,0,0,1,0,0,0,1,0,1
62,0,-0.175528,-0.657211,-0.482036,0,1,0,1,0,0,0,1,0,1,0
31,2,-0.239824,-0.071722,-0.144476,1,0,0,0,1,0,0,1,0,0,1


# export

In [44]:
df.to_csv('data/exams-treated.csv')