The input and output paths are declared here. The input is the high-school data file stored in the `Data HS` directory; the output is the pickle file written at the end of this notebook.

In [1]:
# Input: the raw high-school score workbook.
in_path = '../data/raw/Data HS/hs_data.xls'
# Output: where the pivoted table is pickled in the last cell.
out_path = '../data/interim/HS_master.pkl'

In [2]:
from os.path import dirname
import os, sys, inspect

currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Put the parent of the working directory first on the module search path so
# that the `src` package imported below can be found (presumably `src` lives
# one level above this notebook -- confirm against the repository layout).
sys.path.insert(0, parentdir)

# Libs
The libraries needed by this notebook are imported here.

In [3]:
import warnings
# Silence every warning for the rest of the session, including deprecation
# warnings raised by the libraries below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 

# Project-local helper; used in the last cell to save the result.
from src.utils import dump_to_pickle

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Import Table

In [4]:
# Only four columns of the sheet are loaded, selected by position; they come
# out as EMPLID, TEST_ID, TEST_COMPONENT and SCORE.
kept_columns = [0, 6, 7, 8]

# EMPLID is converted with `str` so the student ID is held as text rather
# than being parsed as a number.
hs_raw = pd.read_excel(
    in_path,
    usecols=kept_columns,
    converters={'EMPLID': str},
)

In [5]:
# (rows, columns) of the raw extract.
hs_raw.shape

(60976, 4)

In [6]:
# Column dtypes and non-null counts of the raw extract.
hs_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60976 entries, 0 to 60975
Data columns (total 4 columns):
EMPLID            60976 non-null object
TEST_ID           60062 non-null object
TEST_COMPONENT    60062 non-null object
SCORE             60062 non-null float64
dtypes: float64(1), object(3)
memory usage: 1.9+ MB


In [7]:
# Number of missing values in each column.
hs_raw.isnull().sum()

EMPLID              0
TEST_ID           914
TEST_COMPONENT    914
SCORE             914
dtype: int64

# Cleaning

The `EMPLID` column is renamed to `NIM` so that it matches the column name used in the other datasets, which makes the later merge straightforward.

In [8]:
# Start the working table from the raw extract, with the student-ID column
# EMPLID renamed to NIM. `rename` returns a new frame, so `hs_raw` is unchanged.
hs_master = hs_raw.rename(columns={'EMPLID': 'NIM'})

All rows with missing values are dropped, because there are too few of them (914 of 60,976 rows) to be significant.

In [9]:
# Drop every row that has a missing value in any column.
# The result is reassigned instead of using `inplace=True`, so the cell reads
# as an explicit "new frame from old frame" step and stays chainable.
hs_master = hs_master.dropna()

Duplicated rows are also dropped.

In [10]:
# Drop rows that repeat the same (NIM, TEST_ID, TEST_COMPONENT, SCORE)
# combination, keeping the first occurrence of each.
# The result is reassigned instead of using `inplace=True`.
hs_master = hs_master.drop_duplicates(
    subset=['NIM', 'TEST_ID', 'TEST_COMPONENT', 'SCORE'],
)

USM data is dropped because it can only be collected once students have already applied to UPH.

In [11]:
# Keep every row whose TEST_ID is anything other than 'USM'.
not_usm = hs_master['TEST_ID'].ne('USM')
hs_master = hs_master[not_usm]

International curriculums are filtered out since they don't provide enough information about the test components. Therefore, only Indonesian curriculums are used.

In [12]:
# TEST_ID values to keep; rows with any other TEST_ID are removed by the
# `isin` filter in the next cell.
soc_ids = ['DA101SOC', 'DA102SOC', 'DA111SOC', 'DA112SOC']
sci_ids = ['DA101SCI', 'DA102SCI', 'DA111SCI', 'DA112SCI']
filtered_curr = soc_ids + sci_ids + ['JHS FINAL']

In [13]:
# Keep only the rows whose TEST_ID is one of the IDs listed in `filtered_curr`.
keep_rows = hs_master['TEST_ID'].isin(filtered_curr)
hs_master = hs_master[keep_rows]

# Pivot
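The table is pivoted so that each row represents one student and each test component becomes a column. When a student has more than one score for the same component, `pivot_table` averages them (its default aggregation is the mean).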

In [14]:
# One row per NIM, one column per TEST_COMPONENT, cells filled from SCORE.
# `aggfunc='mean'` is pivot_table's default and is written out here so it is
# visible that several scores for the same (NIM, TEST_COMPONENT) are averaged.
pvt = hs_master.pivot_table(
    index='NIM',
    columns='TEST_COMPONENT',
    values='SCORE',
    aggfunc='mean',
).reset_index()  # turn NIM back into an ordinary column

In [15]:
# (rows, columns) after pivoting: one row per NIM.
pvt.shape

(2666, 10)

In [16]:
# Column order for the final table: the student ID first, then the score
# columns, with FINAL last.
rearr_cols = (
    ['NIM']
    + ['ENG', 'MATH']
    + ['BIO', 'CHEM', 'PHY']
    + ['ECON', 'GEO', 'SOC']
    + ['FINAL']
)

In [17]:
# Reorder the columns to the order given in `rearr_cols`; this raises a
# KeyError if any listed column is absent from the pivot.
pvt = pvt[rearr_cols]

In [18]:
# Preview the first five rows of the reordered table.
pvt.head()

TEST_COMPONENT,NIM,ENG,MATH,BIO,CHEM,PHY,ECON,GEO,SOC,FINAL
0,1011180001,73.25,70.75,,,,86.5,73.75,79.25,30.8
1,1011180002,77.75,64.75,,,,79.25,80.0,76.25,25.95
2,1011180003,70.25,66.75,,,,79.5,77.5,82.25,27.4
3,1011180004,82.25,85.0,,,,71.75,77.75,72.75,28.4
4,1011180005,85.25,78.0,80.25,75.5,78.5,,,,33.9


# Dump to Pickle

In [19]:
# Save the pivoted table via the project helper from `src.utils`
# (presumably it pickles `pvt` to `out_path` -- see the helper for details).
dump_to_pickle(pvt, out_path)