In [1]:
import pandas as pd

In [2]:
pd.__version__

'0.23.4'

In [68]:
# Reading exam data
exam_data = pd.read_csv('./data/exams.csv')
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,associate's degree,standard,completed,79,75,81
1,female,group C,associate's degree,free/reduced,none,56,65,64
2,male,group D,bachelor's degree,standard,none,86,68,74
3,female,group A,bachelor's degree,standard,none,68,78,76
4,female,group D,high school,free/reduced,none,49,68,61
5,male,group D,some high school,free/reduced,none,49,47,41
6,male,group C,some high school,standard,none,87,78,70
7,male,group C,associate's degree,standard,none,74,80,76
8,female,group B,some high school,standard,none,50,44,48
9,male,group A,some college,standard,none,40,23,23


In [69]:
math_average = exam_data['math score'].mean()
reading_average = exam_data['reading score'].mean()
writing_average = exam_data['writing score'].mean()

print("Math avg: ", math_average)
print('Reading avg: ', reading_average)
print("Writing avg: ", writing_average)

Math avg:  65.06
Reading avg:  67.28
Writing avg:  66.47


In [70]:
# sklearn has a module called preprocessing that helps with standardization
from sklearn import preprocessing

exam_data[['math score']] = preprocessing.scale(exam_data[['math score']])
exam_data[['reading score']] = preprocessing.scale(exam_data[['reading score']])
exam_data[['writing score']] = preprocessing.scale(exam_data[['writing score']])

exam_data


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,associate's degree,standard,completed,0.994557,0.574138,1.049901
1,female,group C,associate's degree,free/reduced,none,-0.646391,-0.169564,-0.178476
2,male,group D,bachelor's degree,standard,none,1.493976,0.053547,0.544099
3,female,group A,bachelor's degree,standard,none,0.209756,0.797248,0.688613
4,female,group D,high school,free/reduced,none,-1.145810,0.053547,-0.395248
5,male,group D,some high school,free/reduced,none,-1.145810,-1.508227,-1.840397
6,male,group C,some high school,standard,none,1.565321,0.797248,0.255069
7,male,group C,associate's degree,standard,none,0.637829,0.945989,0.688613
8,female,group B,some high school,standard,none,-1.074464,-1.731338,-1.334595
9,male,group A,some college,standard,none,-1.787920,-3.293112,-3.141031


In [71]:
# Need to convert text values into numbers
# Can use label encoding or one-hot encoding (see other notebook for explanation of these)
le = preprocessing.LabelEncoder()
exam_data['gender'] = le.fit_transform(exam_data['gender'].astype(str))
exam_data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,1,group E,associate's degree,standard,completed,0.994557,0.574138,1.049901
1,0,group C,associate's degree,free/reduced,none,-0.646391,-0.169564,-0.178476
2,1,group D,bachelor's degree,standard,none,1.493976,0.053547,0.544099
3,0,group A,bachelor's degree,standard,none,0.209756,0.797248,0.688613
4,0,group D,high school,free/reduced,none,-1.14581,0.053547,-0.395248


In [72]:
# gives unique values of gender that the column holds (it's supposed to give 'male', 'female'. 
# If it doesn't, rerun the cell for reading the data)
le.classes_

array(['female', 'male'], dtype=object)

In [73]:
# One-hot representation
pd.get_dummies(exam_data['race/ethnicity'])

Unnamed: 0,group A,group B,group C,group D,group E
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,0,1,0
3,1,0,0,0,0
4,0,0,0,1,0
5,0,0,0,1,0
6,0,0,1,0,0
7,0,0,1,0,0
8,0,1,0,0,0
9,1,0,0,0,0


In [74]:
exam_data = pd.get_dummies(exam_data, columns=['race/ethnicity'])
exam_data

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,1,associate's degree,standard,completed,0.994557,0.574138,1.049901,0,0,0,0,1
1,0,associate's degree,free/reduced,none,-0.646391,-0.169564,-0.178476,0,0,1,0,0
2,1,bachelor's degree,standard,none,1.493976,0.053547,0.544099,0,0,0,1,0
3,0,bachelor's degree,standard,none,0.209756,0.797248,0.688613,1,0,0,0,0
4,0,high school,free/reduced,none,-1.145810,0.053547,-0.395248,0,0,0,1,0
5,1,some high school,free/reduced,none,-1.145810,-1.508227,-1.840397,0,0,0,1,0
6,1,some high school,standard,none,1.565321,0.797248,0.255069,0,0,1,0,0
7,1,associate's degree,standard,none,0.637829,0.945989,0.688613,0,0,1,0,0
8,0,some high school,standard,none,-1.074464,-1.731338,-1.334595,0,1,0,0,0
9,1,some college,standard,none,-1.787920,-3.293112,-3.141031,1,0,0,0,0


In [76]:
# Changing multiple columns to one-hot representation
exam_data = pd.get_dummies(exam_data, columns=['parental level of education',
                                               'lunch',
                                               'test preparation course'])

KeyError: "['parental level of education' 'lunch' 'test preparation course'] not in index"

In [78]:
exam_data.head()

Unnamed: 0,gender,math score,reading score,writing score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,1,0.994557,0.574138,1.049901,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0
1,0,-0.646391,-0.169564,-0.178476,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1
2,1,1.493976,0.053547,0.544099,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1
3,0,0.209756,0.797248,0.688613,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1
4,0,-1.14581,0.053547,-0.395248,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1
