The input file of this notebook is the integrated data produced by the previous notebook.

In [1]:
# Pickled DataFrame that this notebook reads as its input.
in_file_path = "../data/processed/data_master.pkl"
# Destination path for the transformed data written by the last cell.
out_file_path = "../data/processed/data_model_noss.pkl"

In [2]:
from os.path import dirname
import os, sys, inspect

# Put the notebook's parent directory on the import path so that
# project-local packages (e.g. `src`, imported further down) can be resolved.
currentdir = os.getcwd()
parentdir = dirname(currentdir)

# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries at the front of sys.path.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

# Libs
Importing libraries.

In [3]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): blanket suppression also hides deprecation warnings from
# pandas / scikit-learn — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

from feature_engine.categorical_encoders import MeanCategoricalEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

from src.utils import dump_to_pickle

# Show up to 100 rows and 100 columns when a frame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

# Load Data

In [4]:
# Load the input DataFrame from the configured pickle path.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(in_file_path)

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(2101, 25)

In [6]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2101 entries, 0 to 2354
Data columns (total 25 columns):
NIM                  2101 non-null object
form_number          2101 non-null object
major_code_opcs      2101 non-null object
major_name_opcs      2101 non-null object
Last_CGPA            2101 non-null float64
ENG                  2101 non-null float64
MATH                 2101 non-null float64
BIO                  2101 non-null float64
CHEM                 2101 non-null float64
PHY                  2101 non-null float64
ECON                 2101 non-null float64
GEO                  2101 non-null float64
SOC                  2101 non-null float64
FINAL                2101 non-null float64
major_code_oa        2101 non-null object
major_name_oa        2101 non-null object
gender               2101 non-null object
school_prop          2101 non-null object
school_name          2002 non-null object
school_state         2101 non-null object
school_geo_unit      2101 non-null object
c

In [7]:
# Count the missing values in each column.
df.isna().sum()

NIM                   0
form_number           0
major_code_opcs       0
major_name_opcs       0
Last_CGPA             0
ENG                   0
MATH                  0
BIO                   0
CHEM                  0
PHY                   0
ECON                  0
GEO                   0
SOC                   0
FINAL                 0
major_code_oa         0
major_name_oa         0
gender                0
school_prop           0
school_name          99
school_state          0
school_geo_unit       0
curriculum_name       0
fail                  0
has_changed_major     0
faculty               0
dtype: int64

In [8]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,NIM,form_number,major_code_opcs,major_name_opcs,Last_CGPA,ENG,MATH,BIO,CHEM,PHY,ECON,GEO,SOC,FINAL,major_code_oa,major_name_oa,gender,school_prop,school_name,school_state,school_geo_unit,curriculum_name,fail,has_changed_major,faculty
0,1011180001,1831100382,1011,Manajemen,3.25,73.25,70.75,0.0,0.0,0.0,86.5,73.75,79.25,30.8,1011,Manajemen,Female,SMA,SMAN 1 SIDIKALANG,SUMUT,SUM,Social,0,0,BS
1,1011180002,1811100709,1011,Manajemen,1.06,77.75,64.75,0.0,0.0,0.0,79.25,80.0,76.25,25.95,1011,Manajemen,Male,SMA,SMA Islam Al-Azhar 8 Bekasi,JABAR,JAV,Social,1,0,BS
2,1011180003,1811100388,1011,Manajemen,2.07,70.25,66.75,0.0,0.0,0.0,79.5,77.5,82.25,27.4,1011,Manajemen,Male,SMA,SMA Katolik St. Peter,NTT,SUN,Social,0,0,BS
3,1011180004,1831100016,1011,Manajemen,2.91,82.25,85.0,0.0,0.0,0.0,71.75,77.75,72.75,28.4,1011,Manajemen,Male,SMA,SMA El Shadai Magelang,JATENG,JAV,Social,0,0,BS
4,1011180005,1811100684,1011,Manajemen,3.28,85.25,78.0,80.25,75.5,78.5,0.0,0.0,0.0,33.9,1011,Manajemen,Male,SMK,SMK Eran Batu 2,SULSEL,SUL,Science,0,0,BS


# Feature Engineering

#### Drop Unused Features
ID features and redundant features are dropped here to optimize the model performance. Features with high cardinality are also dropped. 

In [9]:
# Columns removed before modelling, kept in the same two groups as before.
# NOTE(review): 'gender' is not an identifier, yet it is grouped with the ID
# columns — confirm that this grouping (and dropping it) is intentional.
id_cols = ['NIM', 'form_number', 'gender']

# Remaining columns that are excluded from the model data.
cols_to_drop = [
    'Last_CGPA', 'major_code_opcs', 'major_code_oa', 'major_name_oa',
    'school_name', 'curriculum_name', 'has_changed_major',
]

In [10]:
# Build the working frame: everything in `df` except the ID and unused columns.
data = df.drop(columns=id_cols + cols_to_drop)

#### Rename Columns

In [11]:
# Old -> new column names: `major_name_opcs` becomes `major_name`, and the
# upper-case subject/score columns get lower-case names with an `hs_` prefix.
column_renames = {
    'major_name_opcs': 'major_name',
    'ENG': 'hs_eng',
    'MATH': 'hs_math',
    'BIO': 'hs_bio',
    'CHEM': 'hs_chem',
    'PHY': 'hs_phy',
    'ECON': 'hs_econ',
    'GEO': 'hs_geo',
    'SOC': 'hs_soc',
    'FINAL': 'hs_final',
}

# Rebind the result instead of mutating with inplace=True: the cell stays
# safe to re-run (labels that are already renamed are simply ignored) and no
# frame is modified behind the reader's back.
data = data.rename(columns=column_renames)

## Transform Categorical Features
Categorical features need to be transformed into numeric values before they are fed to the machine learning model.

In [12]:
# Target column, kept as a one-element list so it can be concatenated with
# the other column lists.
target_var = ['fail']

# Columns handled as categorical features.
cat_vars = ['major_name', 'school_prop', 'school_geo_unit', 'school_state', 'faculty']

# Every remaining non-target column is treated as a numerical feature.
feature_columns = data.drop(columns=target_var).columns
num_vars = [name for name in feature_columns if name not in cat_vars]

### Encode Categorical Features

In [13]:
# Categorical columns plus the target, selected from the working frame.
categorical_and_target = cat_vars + target_var
cat_df = data[categorical_and_target]

In [14]:
# Work on a copy so that `cat_df` keeps the original string values.
encoded_data = cat_df.copy()

###### Label Encoder
Here we use `LabelEncoder` for simplicity.

In [15]:
# One encoder instance; it is re-fitted for each categorical column in the loop below.
le = LabelEncoder()

In [16]:
# Replace each categorical column with its integer label codes.
# The shared encoder is re-fitted on every pass, so once the loop finishes
# `le` only holds the classes of the last column in `cat_vars`.
for column_name in cat_vars:
    column_codes = le.fit_transform(encoded_data[column_name])
    encoded_data[column_name] = column_codes

In [17]:
#save encoded dict
le.fit(cat_df['major_name'])
major_le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
dump_to_pickle(major_le_dict, "../app/data/major_le_dict.pkl")

The encoded data looks like this: every categorical column now holds integer codes instead of strings.

In [18]:
# Preview the label-encoded categorical columns together with the target.
encoded_data.head()

Unnamed: 0,major_name,school_prop,school_geo_unit,school_state,faculty,fail
0,14,4,6,33,0,0
1,14,4,0,7,0,1
2,14,4,7,21,0,0
3,14,4,0,9,0,0
4,14,5,5,27,0,0


Here we select the numerical data, which will be concatenated with the encoded data later.

In [19]:
# Numerical feature columns only (the names collected in `num_vars`).
num_data = data[num_vars]

In [20]:
# Preview the numerical feature columns.
num_data.head()

Unnamed: 0,hs_eng,hs_math,hs_bio,hs_chem,hs_phy,hs_econ,hs_geo,hs_soc,hs_final
0,73.25,70.75,0.0,0.0,0.0,86.5,73.75,79.25,30.8
1,77.75,64.75,0.0,0.0,0.0,79.25,80.0,76.25,25.95
2,70.25,66.75,0.0,0.0,0.0,79.5,77.5,82.25,27.4
3,82.25,85.0,0.0,0.0,0.0,71.75,77.75,72.75,28.4
4,85.25,78.0,80.25,75.5,78.5,0.0,0.0,0.0,33.9


#### Concat
The numerical data and the encoded categorical data are then concatenated column-wise to form the complete model dataset.

In [21]:
# Place the numerical and the encoded categorical/target columns side by side,
# aligned on the row index.
transformed_data = pd.concat(objs=[num_data, encoded_data], axis='columns')

In [22]:
# Preview the combined model data.
transformed_data.head()

Unnamed: 0,hs_eng,hs_math,hs_bio,hs_chem,hs_phy,hs_econ,hs_geo,hs_soc,hs_final,major_name,school_prop,school_geo_unit,school_state,faculty,fail
0,73.25,70.75,0.0,0.0,0.0,86.5,73.75,79.25,30.8,14,4,6,33,0,0
1,77.75,64.75,0.0,0.0,0.0,79.25,80.0,76.25,25.95,14,4,0,7,0,1
2,70.25,66.75,0.0,0.0,0.0,79.5,77.5,82.25,27.4,14,4,7,21,0,0
3,82.25,85.0,0.0,0.0,0.0,71.75,77.75,72.75,28.4,14,4,0,9,0,0
4,85.25,78.0,80.25,75.5,78.5,0.0,0.0,0.0,33.9,14,5,5,27,0,0


# Dump to Pickle

In [23]:
# Hand the combined frame to the project helper with the configured output path
# (presumably pickles it to that file — helper implementation is not shown here).
dump_to_pickle(transformed_data, out_file_path)