# Data Processing

## Setup Notebook

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Library

In [55]:
import numpy as np
import pandas as pd
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output

In [6]:
warnings.filterwarnings('ignore')

## Load Data

In [9]:
# change directory to preprocessing

os.chdir("./drive/MyDrive/preprocessing")

In [10]:
! pwd

/content/drive/MyDrive/preprocessing


In [12]:
# read data

df = pd.read_csv("data/data_kebotakan.csv")

In [13]:
df.head(10)

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob
0,27.0,Perempuan,PNS,Bengkulu,7957453.0,1.0,0.0,54.315053,170.428542,Pantone,1.0,S1,5.0,0.605974
1,53.0,Perempuan,PNS,Bandung,7633003.0,1.0,0.0,72.873404,165.530097,Pantone,0.0,S1,7.0,0.53286
2,37.0,Perempuan,Pegawai swasta,Bandung,6637625.0,1.0,0.0,46.321533,154.599388,Moonsilk,0.0,S1,4.0,0.418442
3,36.0,Perempuan,Pengangguran,Palu,3624871.0,1.0,0.0,51.539781,167.340481,Deadbuoy,1.0,SD,9.0,0.80405
4,38.0,Laki-laki,Freelance,Palangkaraya,6031808.0,1.0,0.0,60.726909,165.514773,Merpati,1.0,S2,1.0,0.368371
5,55.0,Perempuan,PNS,Palangkaraya,9213032.0,1.0,1.0,54.287045,179.235145,Pantone,0.0,S1,1.0,0.732562
6,40.0,Perempuan,Freelance,Serang,10682560.0,1.0,0.0,54.824881,177.431122,Shoulder & Head,0.0,S1,2.0,0.192604
7,47.0,Laki-laki,Pegawai swasta,Banda Aceh,4508321.0,1.0,0.0,74.795152,170.540938,Shoulder & Head,1.0,S2,3.0,0.540582
8,41.0,Laki-laki,Freelance,Palu,9846426.0,1.0,0.0,55.049547,157.192078,Pantone,0.0,SMA,6.0,0.36796
9,46.0,Perempuan,Freelance,Palembang,9257426.0,1.0,0.0,52.764656,174.39717,Shoulder & Head,0.0,S1,4.0,0.403653


## Data Preprocessing

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7917 entries, 0 to 7916
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   umur           7832 non-null   float64
 1   jenis_kelamin  7840 non-null   object 
 2   pekerjaan      7760 non-null   object 
 3   provinsi       7831 non-null   object 
 4   gaji           7843 non-null   float64
 5   is_menikah     7845 non-null   float64
 6   is_keturunan   7828 non-null   float64
 7   berat          7861 non-null   float64
 8   tinggi         7843 non-null   float64
 9   sampo          7858 non-null   object 
 10  is_merokok     7847 non-null   float64
 11  pendidikan     7847 non-null   object 
 12  stress         7853 non-null   float64
 13  botak_prob     7838 non-null   float64
dtypes: float64(9), object(5)
memory usage: 866.0+ KB


### 1. Convert Column To Correct Type

In [32]:
# def convert_col(df:pd.DataFrame):
#   df['umur'] = df['umur'].astype('Int64', errors = 'ignore')
#   df['is_menikah'] = df['is_menikah'].astype('Int64', errors = 'ignore')
#   df['is_keturunan'] = df['is_keturunan'].astype('Int64', errors = 'ignore')
#   df['is_merokok'] = df['is_merokok'].astype('Int64', errors = 'ignore')
#   return df

In [28]:
# df_cl1 = convert_col(df)

In [29]:
# df_cl1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7917 entries, 0 to 7916
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   umur           7832 non-null   Int64  
 1   jenis_kelamin  7840 non-null   object 
 2   pekerjaan      7760 non-null   object 
 3   provinsi       7831 non-null   object 
 4   gaji           7843 non-null   float64
 5   is_menikah     7845 non-null   Int64  
 6   is_keturunan   7828 non-null   Int64  
 7   berat          7861 non-null   float64
 8   tinggi         7843 non-null   float64
 9   sampo          7858 non-null   object 
 10  is_merokok     7847 non-null   Int64  
 11  pendidikan     7847 non-null   object 
 12  stress         7853 non-null   float64
 13  botak_prob     7838 non-null   float64
dtypes: Int64(4), float64(5), object(5)
memory usage: 897.0+ KB


In [30]:
df_cl1.head()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob
0,27,Perempuan,PNS,Bengkulu,7957453.0,1,0,54.315053,170.428542,Pantone,1,S1,5.0,0.605974
1,53,Perempuan,PNS,Bandung,7633003.0,1,0,72.873404,165.530097,Pantone,0,S1,7.0,0.53286
2,37,Perempuan,Pegawai swasta,Bandung,6637625.0,1,0,46.321533,154.599388,Moonsilk,0,S1,4.0,0.418442
3,36,Perempuan,Pengangguran,Palu,3624871.0,1,0,51.539781,167.340481,Deadbuoy,1,SD,9.0,0.80405
4,38,Laki-laki,Freelance,Palangkaraya,6031808.0,1,0,60.726909,165.514773,Merpati,1,S2,1.0,0.368371


In [36]:
def convert_col(df:pd.DataFrame):
  type_dict = {
      'umur' : int,
      'is_menikah' : int,
      'is_keturunan' : int,
      'is_merokok' : int
  }
  df = df.astype(type_dict, errors = 'ignore')
  return df

In [37]:
df_cl2 = convert_col(df)

In [38]:
df_cl2

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob
0,27,Perempuan,PNS,Bengkulu,7.957453e+06,1,0,54.315053,170.428542,Pantone,1,S1,5.0,0.605974
1,53,Perempuan,PNS,Bandung,7.633003e+06,1,0,72.873404,165.530097,Pantone,0,S1,7.0,0.532860
2,37,Perempuan,Pegawai swasta,Bandung,6.637625e+06,1,0,46.321533,154.599388,Moonsilk,0,S1,4.0,0.418442
3,36,Perempuan,Pengangguran,Palu,3.624871e+06,1,0,51.539781,167.340481,Deadbuoy,1,SD,9.0,0.804050
4,38,Laki-laki,Freelance,Palangkaraya,6.031808e+06,1,0,60.726909,165.514773,Merpati,1,S2,1.0,0.368371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7912,32,Laki-laki,Pegawai swasta,Yogyakarta,6.024409e+06,1,0,44.432438,154.578859,Deadbuoy,0,SMA,9.0,0.471229
7913,34,Laki-laki,Pegawai swasta,Manado,1.007043e+07,1,0,49.389914,158.782726,Shoulder & Head,1,S1,9.0,0.543821
7914,58,Laki-laki,Freelance,Mamuju,8.010815e+06,1,0,54.351968,154.478087,Deadbuoy,1,S2,6.0,0.643453
7915,30,,PNS,Palu,9.059906e+06,1,0,57.646930,163.377717,Deadbuoy,0,S1,10.0,0.540056


In [39]:
df_cl2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7917 entries, 0 to 7916
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   umur           7832 non-null   Int64  
 1   jenis_kelamin  7840 non-null   object 
 2   pekerjaan      7760 non-null   object 
 3   provinsi       7831 non-null   object 
 4   gaji           7843 non-null   float64
 5   is_menikah     7845 non-null   Int64  
 6   is_keturunan   7828 non-null   Int64  
 7   berat          7861 non-null   float64
 8   tinggi         7843 non-null   float64
 9   sampo          7858 non-null   object 
 10  is_merokok     7847 non-null   Int64  
 11  pendidikan     7847 non-null   object 
 12  stress         7853 non-null   float64
 13  botak_prob     7838 non-null   float64
dtypes: Int64(4), float64(5), object(5)
memory usage: 897.0+ KB


In [46]:
# widget
# create dropdown that give us flexibility to choose the the column
dropdown = widgets.Dropdown(
    options = df.columns,
    value = df.columns[0],
    description = 'Select Column:',
    disabled = False
)
display(dropdown)

Dropdown(description='Select Column:', options=('umur', 'jenis_kelamin', 'pekerjaan', 'provinsi', 'gaji', 'is_…

In [47]:
# display data where selected column is chosen

def missing_diagnosed(dropdown):
  return df[df[dropdown].isna()]

In [58]:
def show_nan(df):
  dropdown = widgets.Dropdown(
    options = df.columns,
    value = df.columns[0],
    description = 'Select Column:',
    )
  output = widgets.Output()
  display(dropdown, output)

  # show df where the selected column is missing
  def on_change(change): # change is directory that contains information whether there is change made during widget/dropdown selection
    if change['type'] == 'change' and change['name'] == 'value':
      column = change['new']
      with output:
        clear_output(wait = True)
        display(df[df[column].isna()])

  dropdown.observe(on_change)

In [59]:
show_nan(df_cl2)

Dropdown(description='Select Column:', options=('umur', 'jenis_kelamin', 'pekerjaan', 'provinsi', 'gaji', 'is_…

Output()

In [None]:
df

Missing Value Handling:
-