In [1]:
import re

import pandas as pd
import numpy as np

from datetime import datetime

Transform

In [2]:
def pick_date(datetime: str):
    return datetime[:10]

def filter_date(date: str, minyear: int, maxyear: int):
    year, month, day = date.split('-')

    if minyear <= int(year) <= maxyear:
        return date
    else:
        return np.nan

def get_age(date):
    return (datetime.now() - date) // np.timedelta64(1, 'Y')

In [3]:
df_applicant = pd.read_csv('data/df_applicant.csv')

  df_applicant = pd.read_csv('data/df_applicant.csv')


In [4]:
# 1. Manage duplicates
# 2. Fill null
# 3. ObjectID as index to overcome redundant
# 4. Replace bad values in every columns

In [5]:
df_applicant.drop(columns=['Unnamed: 0'], inplace=True)
df_applicant.set_index(['ApplicantID'], inplace=True)

df_applicant = df_applicant.fillna(0)
df_applicant.ExpectedSalary = df_applicant.ExpectedSalary.apply(lambda x: 0 if x < 1_000_000 else x)

df_app_obj = df_applicant.select_dtypes(object).astype(str).applymap(str.lower)
df_app_int = df_applicant.select_dtypes(int)
df_app_flt = df_applicant.select_dtypes(float).astype(int)

df_applicant = pd.concat([df_app_obj, df_app_int, df_app_flt], axis=1)
df_applicant.DiseaseHistory = df_applicant.DiseaseHistory.apply(lambda x: str(0) if not (all(chr.isalpha() or chr.isspace() for chr in x) and len(x) != 1) else x.strip())

In [6]:
# Step cleansing DiseaseHistory
# 1. Menghapus kata yang mengandung "tidak"
# 2. Mengambil kata yang terkait penyakit

- `dict_disease` diambil dari internet menggunakan *dict_disease.ipynb*
- `dict_disease` akan menjadi wadah kata-kata tentang jenis penyakit
- `diy_disease` diambil dari pemilihan manual berdasarkan pengurangan `diy_stopwords`

In [7]:
# dict_disease = [i for i in dict_disease if len(i) > 1]

# # cp: cerebral palsy
# # dm: diabetes melitus
# # tb: tuberkulosis
# # db: demam berdarah

# dict_disease = [i for i in dict_disease if (len(i) == 2 and i in ['cp', 'dm', 'tb', 'db']) or len(i) > 2]
# pd.DataFrame(dict_disease).to_csv('dict_disease_edited.csv', header=False, index=False)

In [8]:
# load dict_disease yang telah dibuat semi-manual
dict_disease = [i[0] for i in pd.read_csv('dict_disease_edited.csv', names=['diy_disease'], na_filter=False).values]

# menggabungkan disease & no disease dan melakukan assignment langsung ke dalam kolom DiseaseHistory
df_applicant.DiseaseHistory = pd.concat([
    df_applicant.DiseaseHistory[df_applicant.DiseaseHistory.str.contains('|'.join(dict_disease)) == True],
    df_applicant.DiseaseHistory[df_applicant.DiseaseHistory.str.contains('|'.join(dict_disease)) == False].apply(lambda x: str(0) if x else str(0)),
])

In [9]:
df_applicant.head(2)

Unnamed: 0_level_0,DiseaseHistory,Dob,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,2018-04-10 09:55:35.1320927,0,0,False,0,0,0,0,0,0,0,0,0
2,0,2018-04-10 00:00:00.0000000,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1


In [10]:
# batas umur pekerja yang optimal diasumsikan umur 17-65 tahun
df_applicant.Dob = pd.to_datetime(
    df_applicant.Dob.map(pick_date).apply(lambda x: filter_date(x, 1958, 2006) if len(x) > 1 else np.nan)
).map(get_age).fillna(0).astype(int)

In [11]:
df_applicant.rename(columns={'Dob': 'Age'}, inplace=True)

In [12]:
df_applicant.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1
3,0,29,c,male,False,single,wni,"pekerja keras,teliti, ambisius","pelupa,tidak cepat tanggap",sepeda motor,178,4000000,1,1


In [13]:
dict_indonesia = ['인도네시아', 'nesia','wni','jawa','batak','indo','idn','minang','jakarta','betawi','ina','bugis','java','sunda','papua','sumatera','nkri']
dict_chinese = ['chinese', 'tionghoa', 'chinesse']

df_applicant.Nationality = pd.concat([
    # bad values
    df_applicant.Nationality[
        ~df_applicant.Nationality.map(str.lower).str.contains('|'.join(dict_indonesia + dict_chinese + ['melayu']))
].apply(lambda x: str(0) if x else str(0)),
    # indonesia
    df_applicant.Nationality[
        (df_applicant.Nationality.map(str.lower).str.contains('|'.join(dict_indonesia))) & ~(df_applicant.Nationality.map(str.lower).str.contains('chine'))
].apply(lambda x: 'indonesia' if x else 'indonesia'),
    # chinese
    df_applicant.Nationality[
        df_applicant.Nationality.map(str.lower).str.contains('|'.join(dict_chinese))
].apply(lambda x: 'chinese' if x else 'chinese'),
    # melayu
    df_applicant.Nationality[
        df_applicant.Nationality.map(str.lower).str.contains('melayu')
].apply(lambda x: 'melayu' if x else 'melayu'),
], verify_integrity=True)

In [14]:
df_applicant.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1
3,0,29,c,male,False,single,indonesia,"pekerja keras,teliti, ambisius","pelupa,tidak cepat tanggap",sepeda motor,178,4000000,1,1


In [15]:
df_applicant.Strengthness.replace(['-', ''], '0', inplace=True)

In [16]:
words_strengthness = ' '.join(list(set(df_applicant.Strengthness.str.replace('[^a-zA-Z]', ' ').apply(lambda x: re.sub('\s+', ' ', x)).values)))

  words_strengthness = ' '.join(list(set(df_applicant.Strengthness.str.replace('[^a-zA-Z]', ' ').apply(lambda x: re.sub('\s+', ' ', x)).values)))


In [17]:
dict_strengthness = ['dan','and','dengan','yang','to','dalam','dapat','hal','saya', ' ', 'diri', 'in', 'memiliki', 'suka', 'mau', 'individu', 'maupun', 'can', 'secara', 'orang', 'tidak', 'di', 'ms', 'terhadap', 'untuk', 'have', 'selalu', 'on', 'sama', 'with', 'dibawah', 'the', 'of', 'my', 'ingin', 'or', 'serta', 'am', 'mempunyai', 'as', 'self', 'pada', 'be', 'sesuatu', 'atau', 'adalah', 'yg', 'at', 'for']

tes = pd.Series(words_strengthness.split(' ')).apply(lambda x: '' if len(x) == 1 else x)[~pd.Series(words_strengthness.split(' ')).str.contains('|'.join(dict_strengthness))]

In [18]:
# # pending dulu dan kita lanjut ke kolom berikutnya
# tes[tes != ''].value_counts()

In [19]:
df_applicant.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1
3,0,29,c,male,False,single,indonesia,"pekerja keras,teliti, ambisius","pelupa,tidak cepat tanggap",sepeda motor,178,4000000,1,1


In [20]:
dict_motor = ['motor', 'roda dua', 'roda 2', 'nmax', 'beat', 'peda', 'vega', 'vario', 'supra']
dict_mobil = ['mobil', 'jazz', 'ertiga', 'car', 'lmpv']

df_applicant.TypeOfVehicle = pd.concat([
df_applicant.TypeOfVehicle[df_applicant.TypeOfVehicle.str.contains('|'.join(dict_motor))].apply(lambda x: 'motor' if x else 'motor'),
df_applicant.TypeOfVehicle[df_applicant.TypeOfVehicle.str.contains('|'.join(dict_mobil))].apply(lambda x: 'mobil' if x else 'mobil'),
df_applicant.TypeOfVehicle[
        ~(df_applicant.TypeOfVehicle.str.contains('|'.join(dict_mobil))) & ~(df_applicant.TypeOfVehicle.str.contains('|'.join(dict_motor)))
    ].apply(lambda x: str(0) if x else str(0)),
])

In [21]:
df_applicant.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1
3,0,29,c,male,False,single,indonesia,"pekerja keras,teliti, ambisius","pelupa,tidak cepat tanggap",motor,178,4000000,1,1


In [22]:
# memberikan batas ketinggian normal adalah 140-200
df_applicant.Height = df_applicant.Height.apply(lambda x: 0 if x > 200 or x < 140 else x)

In [23]:
df_applicant.head(3)

Unnamed: 0_level_0,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
ApplicantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1
3,0,29,c,male,False,single,indonesia,"pekerja keras,teliti, ambisius","pelupa,tidak cepat tanggap",motor,178,4000000,1,1


In [24]:
df_applicant.to_csv('data/cdf_applicant.csv')