In [46]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [47]:
# Display settings: cap the rendered width of each column at 10 characters
# and never hide columns when a wide frame is shown.
pd.options.display.max_colwidth = 10
pd.options.display.max_columns = None

In [48]:
# Load every source table; files are read in the order the names are listed.
(
    cdf_applicant,
    cdf_applicant_experience,
    cdf_pipeline,
    cdf_stage,
    cdf_job,
    df_function_position,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/cdf_applicant.csv',
        'data/cdf_applicant_experience.csv',
        'data/cdf_pipeline.csv',
        'data/cdf_stage.csv',
        'data/cdf_job.csv',
        'data/df_function_position.csv',
    )
)

In [49]:
# Preview the first two rows of the applicant table.
cdf_applicant.head(n=2)

Unnamed: 0,ApplicantID,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID
0,1,0,0,0,0,False,0,0,0,0,0,0,0,0,0
1,2,0,0,a,male,False,0,indonesia,strength,weak,mobil,0,0,1,1


In [50]:
# Preview the first two rows of the applicant work-experience table.
cdf_applicant_experience.head(n=2)

Unnamed: 0,ApplicantID,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience
0,1,jasa k...,pt wir...,pembua...,staff ...,3500000,4
1,3,teleko...,pt ico...,mainte...,sales ...,3666666,1


In [51]:
# Preview the pipeline table, which links ApplicantID, JobID and StageID.
cdf_pipeline.head(n=2)

Unnamed: 0,PipelineID,ApplicantID,JobID,StageID
0,1,3,6,10
1,2,8,6,10


In [52]:
# Preview the stage lookup table (StageID -> Label).
cdf_stage.head(n=2)

Unnamed: 0,StageID,Label
0,10,Rejected
1,11,Hired


In [53]:
# Preview the first two rows of the job table.
cdf_job.head(n=2)

Unnamed: 0,JobID,Description,EducationLevelID,FunctionPositionID,JobStatus,JobTitle,Requirement,SalaryMax,SalaryMin,HiredQuota,DepartmentID,CityID,CompanyID,ProvinceID,SkipTest,MajorID,DbName,UsiaMax,UsingGlasses,IQMin,IsSpecificEducationLevel,DriverLicenseType,IsSpesificDriverLicense,Gender,IsSpecificAge,IsSpecificGender,IsSpecificIQ,IsSpecificMarital,MaritalStatus,AdsStatisticID
0,1,under ...,4,1,Close,night ...,with y...,3600000,3300000,1,1,1,7,1,True,1,WarnaW...,0.0,False,0,False,0,False,0,False,False,False,False,0,0
1,2,becomi...,2,2,Close,book k...,bachel...,4250000,3500000,1,2,1,7,1,True,2,WarnaW...,0.0,False,0,False,0,False,0,False,False,False,False,0,0


In [54]:
# Keep only the job columns used later: the two join keys and the job title.
job_columns = ['JobID', 'FunctionPositionID', 'JobTitle']
cdf_job = cdf_job.loc[:, job_columns]

In [69]:
# Build one wide frame by joining (default inner joins) applicant -> experience
# -> pipeline -> stage -> job -> function position, each on its shared key.
df_merged = (
    cdf_applicant
    .merge(cdf_applicant_experience, on=['ApplicantID'])
    .merge(cdf_pipeline, on=['ApplicantID'])
    .merge(cdf_stage, on=['StageID'])
    .merge(cdf_job, on=['JobID'])
    .merge(df_function_position, on=['FunctionPositionID'])
)

In [70]:
# Count missing values per column of the merged frame.
df_merged.isna().sum(axis=0)

ApplicantID               0
DiseaseHistory            0
Age                       0
DriverLicenseType         0
Gender                    0
IsUsingGlasses            0
MaritalStatus             0
Nationality               0
Strengthness              0
Weaknesses                0
TypeOfVehicle             0
Height                    0
ExpectedSalary            0
CityID                    0
ProvinceID                0
Industry                  4
CompanyName               0
JobDescription          297
Position                  0
Salary                    0
YearsOfExperience         0
PipelineID                0
JobID                     0
StageID                   0
Label                     0
FunctionPositionID        0
JobTitle                  0
Unnamed: 0                0
FunctionPositionName      0
dtype: int64

In [71]:
# Replace every missing value with an empty string (the null count above
# reports gaps only in Industry and JobDescription).
df_merged = df_merged.fillna(value='')

In [72]:
# Inspect the merged frame after filling missing values.
df_merged.head(n=2)

Unnamed: 0.1,ApplicantID,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,PipelineID,JobID,StageID,Label,FunctionPositionID,JobTitle,Unnamed: 0,FunctionPositionName
0,3,0,29,c,male,False,single,indonesia,pekerj...,pelupa...,motor,178,4000000,1,1,teleko...,pt ico...,mainte...,sales ...,3666666,1,1,6,10,Rejected,6,market...,40,MARKET...
1,268,0,34,c,male,False,single,indonesia,"jujur,...",lupa w...,motor,167,3000000,1,1,agriku...,cv pat...,pengad...,staff ...,1600000,1,86,6,10,Rejected,6,market...,40,MARKET...


In [73]:
# Remove the pipeline/job identifiers and the stray 'Unnamed: 0' column that
# came in through the merge.
df_merged = df_merged.drop(columns=['PipelineID', 'JobID', 'Unnamed: 0'])
# Lower-case the position names with the vectorised string accessor: unlike
# map(str.lower), .str.lower() does not raise on missing or non-string entries.
df_merged['FunctionPositionName'] = df_merged['FunctionPositionName'].str.lower()

In [74]:
# Inspect the frame after dropping columns and lower-casing FunctionPositionName.
df_merged.head(n=2)

Unnamed: 0,ApplicantID,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,StageID,Label,FunctionPositionID,JobTitle,FunctionPositionName
0,3,0,29,c,male,False,single,indonesia,pekerj...,pelupa...,motor,178,4000000,1,1,teleko...,pt ico...,mainte...,sales ...,3666666,1,10,Rejected,6,market...,market...
1,268,0,34,c,male,False,single,indonesia,"jujur,...",lupa w...,motor,167,3000000,1,1,agriku...,cv pat...,pengad...,staff ...,1600000,1,10,Rejected,6,market...,market...


In [75]:
# Binary target: 0 for 'Rejected', 1 for every other stage label.
# Matching the already-encoded 0 as well keeps this cell idempotent: re-running
# it on an encoded column would otherwise turn every label into 1.
df_merged['Label'] = np.where(df_merged['Label'].isin(['Rejected', 0]), 0, 1)

In [76]:
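# Left disabled: later cells read the target column under the name 'Label'.
# (As written, rename() without `columns=` would relabel the index, not the column.)
# df_merged.rename({'Label': 'target'}, inplace=True)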

In [77]:
# Inspect the frame after encoding Label as 0/1.
df_merged.head(n=2)

Unnamed: 0,ApplicantID,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,StageID,Label,FunctionPositionID,JobTitle,FunctionPositionName
0,3,0,29,c,male,False,single,indonesia,pekerj...,pelupa...,motor,178,4000000,1,1,teleko...,pt ico...,mainte...,sales ...,3666666,1,10,0,6,market...,market...
1,268,0,34,c,male,False,single,indonesia,"jujur,...",lupa w...,motor,167,3000000,1,1,agriku...,cv pat...,pengad...,staff ...,1600000,1,10,0,6,market...,market...


In [78]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# Both splits are stratified on Label so every subset keeps the same class
# balance, and a fixed seed makes the split reproducible across kernel restarts.
# NOTE(review): if one applicant can appear in several merged rows, a
# group-aware split may be needed to avoid leakage between subsets; confirm.
train, test = train_test_split(
    df_merged, test_size=0.2, stratify=df_merged['Label'], random_state=42
)
train, val = train_test_split(
    train, test_size=0.2, stratify=train['Label'], random_state=42
)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

4096 train examples
1025 validation examples
1281 test examples


In [79]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_col='Label'):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` from a DataFrame.

    Args:
        dataframe: Source frame; it is copied, so the caller's frame is not modified.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per emitted batch.
        label_col: Name of the column split off as the label (default ``'Label'``).

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features`` is
        keyed by the remaining column names.

    Raises:
        KeyError: If ``label_col`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    # pop() removes the label column from the copy and returns it.
    labels = features.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [80]:
# Re-check the merged frame before building the tf.data datasets.
df_merged.head(n=2)

Unnamed: 0,ApplicantID,DiseaseHistory,Age,DriverLicenseType,Gender,IsUsingGlasses,MaritalStatus,Nationality,Strengthness,Weaknesses,TypeOfVehicle,Height,ExpectedSalary,CityID,ProvinceID,Industry,CompanyName,JobDescription,Position,Salary,YearsOfExperience,StageID,Label,FunctionPositionID,JobTitle,FunctionPositionName
0,3,0,29,c,male,False,single,indonesia,pekerj...,pelupa...,motor,178,4000000,1,1,teleko...,pt ico...,mainte...,sales ...,3666666,1,10,0,6,market...,market...
1,268,0,34,c,male,False,single,indonesia,"jujur,...",lupa w...,motor,167,3000000,1,1,agriku...,cv pat...,pengad...,staff ...,1600000,1,10,0,6,market...,market...


In [81]:
# A small batch size is used for demonstration purposes.
# NOTE(review): every column other than Label becomes a feature here, including
# StageID, the key Label was joined on; confirm it is excluded before training.
bs = 5
train_ds = df_to_dataset(dataframe=train, shuffle=True, batch_size=bs)
val_ds = df_to_dataset(dataframe=val, shuffle=False, batch_size=bs)
test_ds = df_to_dataset(dataframe=test, shuffle=False, batch_size=bs)

In [83]:
# Peek at one training batch: feature names, one example feature, and the labels.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    # The feature printed is 'Position', so the message names it as such
    # (it was previously announced as a batch of ages).
    print('A batch of positions:', feature_batch['Position'])
    print('A batch of targets:', label_batch)

Every feature: ['ApplicantID', 'DiseaseHistory', 'Age', 'DriverLicenseType', 'Gender', 'IsUsingGlasses', 'MaritalStatus', 'Nationality', 'Strengthness', 'Weaknesses', 'TypeOfVehicle', 'Height', 'ExpectedSalary', 'CityID', 'ProvinceID', 'Industry', 'CompanyName', 'JobDescription', 'Position', 'Salary', 'YearsOfExperience', 'StageID', 'FunctionPositionID', 'JobTitle', 'FunctionPositionName']
A batch of ages: tf.Tensor(
[b'it specialist it specialist project engineer it senior staff'
 b'tax staff ga staff hrga staff hrga staff general affair'
 b'staf purchasing promotion' b'legal officer'
 b'internship engineering architect freelancer furniture designer'], shape=(5,), dtype=string)
A batch of targets: tf.Tensor([1 1 0 0 0], shape=(5,), dtype=int32)


In [84]:
# Grab the feature dict of one training batch; the feature-column demos
# below are all applied to it.
first_batch = next(iter(train_ds))
example_batch = first_batch[0]

In [85]:
# Helper that runs one feature column over the shared example batch.
def demo(feature_column):
    """Print the dense representation of ``example_batch`` for one feature column.

    Args:
        feature_column: The feature column passed to ``layers.DenseFeatures``.
            NOTE: this parameter name shadows the imported ``feature_column``
            module inside the function body; it is kept so existing calls
            still work.
    """
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [86]:
# Numeric column demo.
# NOTE(review): despite its name, `photo_count` wraps the 'ExpectedSalary'
# column; consider renaming it once no other cell depends on the name.
photo_count = feature_column.numeric_column(key='ExpectedSalary')
demo(photo_count)

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
[[5000000.]
 [5000000.]
 [5000000.]
 [4000000.]
 [6000000.]]


In [90]:
# List the distinct Gender values to use as the vocabulary below.
df_merged['Gender'].unique()

array(['male', 'female', '0'], dtype=object)

In [91]:
# Categorical column over the three observed Gender values, one-hot encoded
# through an indicator column.
gender = feature_column.categorical_column_with_vocabulary_list(
    key='Gender',
    vocabulary_list=['male', 'female', '0'],
)
gender_one_hot = feature_column.indicator_column(categorical_column=gender)
demo(gender_one_hot)

[[0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [97]:
# An embedding column takes a categorical column as input; here that
# categorical column is built from the distinct FunctionPositionID values.
# NOTE(review): `breed1` / `breed1_embedding` hold FunctionPositionID columns;
# consider renaming them once no other cell depends on the names.
function_position_ids = df_merged['FunctionPositionID'].unique()
breed1 = feature_column.categorical_column_with_vocabulary_list(
    'FunctionPositionID', function_position_ids)
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
demo(breed1_embedding)

[[-0.29726297  0.05939105 -0.22620197 -0.42720336 -0.2497986   0.2511663
   0.6022742   0.11829188]
 [ 0.07246079 -0.0653692  -0.3529021   0.23984692  0.4450512  -0.05414472
   0.28347608  0.21550731]
 [ 0.07246079 -0.0653692  -0.3529021   0.23984692  0.4450512  -0.05414472
   0.28347608  0.21550731]
 [ 0.01209226 -0.2985672  -0.14645423  0.04204448  0.2513114  -0.53816956
  -0.2013514  -0.27199522]
 [-0.06898602 -0.3995801   0.16116111  0.05360485  0.43102828  0.3337322
  -0.5062982  -0.19090657]]
