In [1]:
#conda install -c conda-forge category_encoders

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer

import category_encoders as ce

In [3]:
# Import train and test set

df_train = pd.read_csv('../raw_data/aug_train.csv')
df_test = pd.read_csv('../raw_data/aug_test.csv')
print(df_train.shape, df_test.shape)

(19158, 14) (2129, 13)


# I. Data Wrangling

In [4]:
df_train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [5]:
df_train.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [6]:
# Create list of ids that belong to each df

test_id = df_test.enrollee_id.values.tolist()
train_id = df_test.enrollee_id.values.tolist()

In [7]:
# Merge train and test for cleaning
df = df_train.merge(df_test, how='outer', on=['enrollee_id', 'city', 'city_development_index', 'gender', 
                                              'relevent_experience', 'enrolled_university', 'education_level',
                                              'major_discipline', 'experience', 'company_size', 'company_type',
                                              'last_new_job', 'training_hours'])
df.shape

(21287, 14)

In [8]:
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [9]:
# Number of NAs in each column

perc_na = round(df.isna().sum() * 100 / len(df), 2)

pd.DataFrame({'Total NA':df.isna().sum(), '% NA':perc_na})

Unnamed: 0,Total NA,% NA
enrollee_id,0,0.0
city,0,0.0
city_development_index,0,0.0
gender,5016,23.56
relevent_experience,0,0.0
enrolled_university,417,1.96
education_level,512,2.41
major_discipline,3125,14.68
experience,70,0.33
company_size,6560,30.82


In [10]:
df.dtypes

enrollee_id                 int64
city                       object
city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
company_size               object
company_type               object
last_new_job               object
training_hours              int64
target                    float64
dtype: object

In [11]:
cat_cols = []
ord_cols = []

#### 1) Enrollee_id

In [12]:
# Convert enrollee_id to string
df['enrollee_id'] = df['enrollee_id'].astype(str)

#### 2) City

In [13]:
df['city'].value_counts()

city_103    4828
city_21     3020
city_16     1701
city_114    1491
city_160     958
            ... 
city_8         4
city_111       3
city_129       3
city_171       2
city_140       1
Name: city, Length: 123, dtype: int64

There are 123 different cities total, so encoding might not be possible.

#### 3) City development index

In [14]:
df.city_development_index.describe()

count    21287.000000
mean         0.828462
std          0.123537
min          0.448000
25%          0.739000
50%          0.903000
75%          0.920000
max          0.949000
Name: city_development_index, dtype: float64

#### 4) Gender

In [15]:
len(df.gender), df.gender.notna().sum()

(21287, 16271)

In [16]:
# Gender distribution (of not null values)
pd.DataFrame({'Total':df['gender'].value_counts(), '%':round(df['gender'].value_counts() / df['gender'].notna().sum() * 100, 2)})

Unnamed: 0,Total,%
Male,14681,90.23
Female,1375,8.45
Other,215,1.32


In [17]:
cat_cols.append('gender')

#### 5) Relevent experience

In [18]:
df['relevent_experience'].value_counts()

Has relevent experience    15316
No relevent experience      5971
Name: relevent_experience, dtype: int64

In [19]:
# Change relevant experience to binary values
df['relevent_experience'] = np.where(df['relevent_experience']=='Has relevent experience', 1, 0)

#### 6) Enrolled university

In [20]:
df['enrolled_university'].value_counts()

no_enrollment       15336
Full time course     4192
Part time course     1342
Name: enrolled_university, dtype: int64

In [21]:
# Rename values
df['enrolled_university'] = np.where(df['enrolled_university']=='Full time course', 'full_time', df['enrolled_university'])
df['enrolled_university'] = np.where(df['enrolled_university']=='Part time course', 'part_time', df['enrolled_university'])

df['enrolled_university'].value_counts()

no_enrollment    15336
full_time         4192
part_time         1342
Name: enrolled_university, dtype: int64

In [22]:
cat_cols.append('enrolled_university')

#### 7) Education level

In [23]:
df['education_level'].value_counts()

Graduate          12867
Masters            4857
High School        2239
Phd                 468
Primary School      344
Name: education_level, dtype: int64

In [24]:
cat_cols.append('education_level')

#### 8) Major discipline

In [25]:
# Rename column
df.rename(columns={'major_discipline':'major'}, inplace=True)

In [26]:
df['major'].value_counts()

STEM               16113
Humanities           749
Other                421
Business Degree      364
Arts                 270
No Major             245
Name: major, dtype: int64

In [27]:
cat_cols.append('major')

#### 9) Experience

In [28]:
df['experience'].value_counts()

>20    3669
5      1593
4      1548
3      1508
6      1346
2      1255
7      1144
9      1093
10     1081
8       884
11      750
15      745
14      641
1       605
<1      596
16      576
12      546
13      453
17      378
19      333
18      306
20      167
Name: experience, dtype: int64

In [29]:
ord_cols.append('experience')

#### 10) Company size

In [30]:
df['company_size'].value_counts()

50-99        3421
100-500      2889
10000+       2236
10/49        1643
<10          1471
1000-4999    1471
500-999       965
5000-9999     631
Name: company_size, dtype: int64

In [31]:
# Fix value 10/49 so that it's 10-49
df['company_size'] = np.where(df['company_size']=='10/49', '10-49', df['company_size'])

# Change 100-500 to 100-499
df['company_size'] = np.where(df['company_size']=='100-500', '100-499', df['company_size'])

df[['company_size']][15:25]

Unnamed: 0,company_size
15,10-49
16,50-99
17,5000-9999
18,10000+
19,
20,100-499
21,100-499
22,
23,1000-4999
24,1000-4999


In [32]:
ord_cols.append('company_size')

#### 11) Company type

In [33]:
df['company_type'].value_counts()

Pvt Ltd                10958
Funded Startup          1098
Public Sector           1082
Early Stage Startup      668
NGO                      574
Other                    133
Name: company_type, dtype: int64

In [34]:
#Rename some values

df['company_type'] = np.where(df['company_type']=='Pvt Ltd', 'private_limited', df['company_type'])
df['company_type'] = np.where(df['company_type']=='Funded Startup', 'startup_funded', df['company_type'])
df['company_type'] = np.where(df['company_type']=='Early Stage Startup', 'startup_earlystage', df['company_type'])
df['company_type'] = np.where(df['company_type']=='Public Sector', 'public_sector', df['company_type'])

In [35]:
cat_cols.append('company_type')

#### 12) Last new job

In [36]:
df['last_new_job'].value_counts()

1        8924
>4       3643
2        3242
never    2710
3        1157
4        1148
Name: last_new_job, dtype: int64

In [37]:
ord_cols.append('last_new_job')

#### 13) Training hours

In [38]:
df['training_hours'].value_counts()

28     354
18     332
12     332
22     307
50     307
      ... 
294      6
234      6
272      6
286      5
238      4
Name: training_hours, Length: 241, dtype: int64

#### 14) Target

In [39]:
df['target'].value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

#### 15) Lists for encoding

In [40]:
# Affirm column names are in appropriate lists
cat_cols, ord_cols

(['gender', 'enrolled_university', 'education_level', 'major', 'company_type'],
 ['experience', 'company_size', 'last_new_job'])

# II. Preprocessing

### a) Encode Categorical Columns

In [41]:
df.dtypes[df.dtypes=='object']

enrollee_id            object
city                   object
gender                 object
enrolled_university    object
education_level        object
major                  object
experience             object
company_size           object
company_type           object
last_new_job           object
dtype: object

#### 1) Ordinal Columns

In [42]:
ord_cols

['experience', 'company_size', 'last_new_job']

In [43]:
df[ord_cols].head()

Unnamed: 0,experience,company_size,last_new_job
0,>20,,1
1,15,50-99,>4
2,5,,never
3,<1,,never
4,>20,50-99,4


In [44]:
ord_map = [
    {'col': 'experience', 
     'mapping': {'<1':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, '10':10, '11':11,
                 '12':12,'13':13,'14':14,'15':15,'16':16,'17':17,'18':18,'19':19,'20':20,'>20':21}},
    {'col': 'company_size',
     'mapping': {'<10': 0, '10-49': 1, '50-99': 2, '100-499':3, '500-999':4, '1000-4999':5, '5000-9999':6, '10000+':7}},
    {'col': 'last_new_job',
     'mapping': {'never': 0, '1': 1, '2': 2, '3':3, '4':4, '>4':5}}
]

In [45]:
encoder = ce.OrdinalEncoder(cols=ord_cols, mapping=ord_map, return_df=True, handle_unknown='return_nan', handle_missing='return_nan')

df = encoder.fit_transform(df)

In [46]:
df[ord_cols].head()

Unnamed: 0,experience,company_size,last_new_job
0,21.0,,1.0
1,15.0,2.0,5.0
2,5.0,,0.0
3,0.0,,0.0
4,21.0,2.0,4.0


In [47]:
df[ord_cols].dtypes

experience      float64
company_size    float64
last_new_job    float64
dtype: object

After imputing missing values, these columns will be converted to int

#### 2) One-Hot Encoding Columns

In [50]:
cat_cols

['gender', 'enrolled_university', 'education_level', 'major', 'company_type']

In [51]:
encoder = ce.OneHotEncoder(cols=cat_cols, return_df=True, use_cat_names=True,
                           handle_unknown='return_nan', handle_missing='return_nan')

df = encoder.fit_transform(df)

In [52]:
cat_cols2 = []
for i in cat_cols:
    for col_name in df.columns:
        if i in col_name:
            cat_cols2.append(col_name)
df[cat_cols2].head()

Unnamed: 0,gender_Male,gender_nan,gender_Female,gender_Other,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_nan,enrolled_university_part_time,education_level_Graduate,education_level_Masters,...,major_Humanities,major_No Major,major_Other,company_type_nan,company_type_private_limited,company_type_startup_funded,company_type_startup_earlystage,company_type_Other,company_type_public_sector,company_type_NGO
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,,,,,,,
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,,,,,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,,,,,,,
3,,,,,,,,,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


#### 3) Ensure no more object dtypes (other than identifier)

In [53]:
df.dtypes[df.dtypes=='object']

enrollee_id    object
city           object
dtype: object

### b) Impute Missing Values

#### 1) Ordinal Columns

In [54]:
# Number of NAs in each column

perc_na = round(df[ord_cols].isna().sum() * 100 / len(df[ord_cols]), 2)

pd.DataFrame({'Total NA':df[ord_cols].isna().sum(), '% NA':perc_na})

Unnamed: 0,Total NA,% NA
experience,70,0.33
company_size,6560,30.82
last_new_job,463,2.18


In [55]:
KNN_imputer = KNNImputer()

df[ord_cols] = KNN_imputer.fit_transform(df[ord_cols])
df[ord_cols].head()

Unnamed: 0,experience,company_size,last_new_job
0,21.0,2.6,1.0
1,15.0,2.0,5.0
2,5.0,3.0,0.0
3,0.0,1.0,0.0
4,21.0,2.0,4.0


In [56]:
# Convert ord cols to int
df[ord_cols] = round(df[ord_cols], 0)
df[ord_cols].head()

Unnamed: 0,experience,company_size,last_new_job
0,21.0,3.0,1.0
1,15.0,2.0,5.0
2,5.0,3.0,0.0
3,0.0,1.0,0.0
4,21.0,2.0,4.0


In [57]:
len(df['experience'].value_counts())

22

In [58]:
df['company_size'].value_counts()

3.0    6348
2.0    4988
7.0    2236
1.0    1970
4.0    1819
5.0    1757
0.0    1471
6.0     698
Name: company_size, dtype: int64

In [59]:
df['last_new_job'].value_counts()

1.0    9098
5.0    3643
2.0    3385
0.0    2789
3.0    1198
4.0    1174
Name: last_new_job, dtype: int64

Looks good!

#### 2) One-Hot Columns

In [60]:
# Number of NAs in each column

perc_na = round(df[cat_cols2].isna().sum() * 100 / len(df[cat_cols2]), 2)

pd.DataFrame({'Total NA':df[cat_cols2].isna().sum(), '% NA':perc_na})

Unnamed: 0,Total NA,% NA
gender_Male,5016,23.56
gender_nan,5016,23.56
gender_Female,5016,23.56
gender_Other,5016,23.56
enrolled_university_no_enrollment,417,1.96
enrolled_university_full_time,417,1.96
enrolled_university_nan,417,1.96
enrolled_university_part_time,417,1.96
education_level_Graduate,512,2.41
education_level_Masters,512,2.41


In [61]:
nan_cols = ['gender_nan', 'enrolled_university_nan', 'education_level_nan', 'major_nan', 'company_type_nan']

# Remove NaN cols from df
df.drop(columns=nan_cols, inplace=True)
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender_Male,gender_Female,gender_Other,relevent_experience,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_part_time,...,company_size,company_type_private_limited,company_type_startup_funded,company_type_startup_earlystage,company_type_Other,company_type_public_sector,company_type_NGO,last_new_job,training_hours,target
0,8949,city_103,0.92,1.0,0.0,0.0,1,1.0,0.0,0.0,...,3.0,,,,,,,1.0,36,1.0
1,29725,city_40,0.776,1.0,0.0,0.0,0,1.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,47,0.0
2,11561,city_21,0.624,,,,0,0.0,1.0,0.0,...,3.0,,,,,,,0.0,83,0.0
3,33241,city_115,0.789,,,,0,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,52,1.0
4,666,city_162,0.767,1.0,0.0,0.0,1,1.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,8,0.0


In [62]:
# Remove NaN columns from catcols2
[cat_cols2.remove(i) for i in nan_cols]
cat_cols2

['gender_Male',
 'gender_Female',
 'gender_Other',
 'enrolled_university_no_enrollment',
 'enrolled_university_full_time',
 'enrolled_university_part_time',
 'education_level_Graduate',
 'education_level_Masters',
 'education_level_High School',
 'education_level_Phd',
 'education_level_Primary School',
 'major_STEM',
 'major_Business Degree',
 'major_Arts',
 'major_Humanities',
 'major_No Major',
 'major_Other',
 'company_type_private_limited',
 'company_type_startup_funded',
 'company_type_startup_earlystage',
 'company_type_Other',
 'company_type_public_sector',
 'company_type_NGO']

In [63]:
gender_cols = ['gender_Male', 'gender_Female', 'gender_Other']
enrolled_cols = ['enrolled_university_no_enrollment', 'enrolled_university_full_time', 'enrolled_university_part_time']
ed_level_cols = ['education_level_Graduate', 'education_level_Masters', 'education_level_High School',
                 'education_level_Phd', 'education_level_Primary School']
major_cols = ['major_STEM', 'major_Business Degree', 'major_Arts', 'major_Humanities', 'major_No Major', 'major_Other']
company = ['company_type_Pvt Ltd', 'company_type_Funded Startup', 'company_type_Early Stage Startup', 'company_type_Other',
           'company_type_Public Sector', 'company_type_NGO']

In [64]:
KNN_imputer = KNNImputer(n_neighbors=1)

df[cat_cols2] = KNN_imputer.fit_transform(df[cat_cols2])
df[cat_cols2].head()

Unnamed: 0,gender_Male,gender_Female,gender_Other,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_part_time,education_level_Graduate,education_level_Masters,education_level_High School,education_level_Phd,...,major_Arts,major_Humanities,major_No Major,major_Other,company_type_private_limited,company_type_startup_funded,company_type_startup_earlystage,company_type_Other,company_type_public_sector,company_type_NGO
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [65]:
# Convert ord cols to int
df[cat_cols2] = round(df[cat_cols2], 0)
df[cat_cols2].head()

Unnamed: 0,gender_Male,gender_Female,gender_Other,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_part_time,education_level_Graduate,education_level_Masters,education_level_High School,education_level_Phd,...,major_Arts,major_Humanities,major_No Major,major_Other,company_type_private_limited,company_type_startup_funded,company_type_startup_earlystage,company_type_Other,company_type_public_sector,company_type_NGO
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [66]:
df[gender_cols].value_counts()

gender_Male  gender_Female  gender_Other
1.0          0.0            0.0             18635
0.0          1.0            0.0              2418
             0.0            1.0               234
dtype: int64

In [67]:
df[major_cols].value_counts()

major_STEM  major_Business Degree  major_Arts  major_Humanities  major_No Major  major_Other
1.0         0.0                    0.0         0.0               0.0             0.0            19185
0.0         0.0                    0.0         1.0               0.0             0.0              758
                                               0.0               0.0             1.0              421
            1.0                    0.0         0.0               0.0             0.0              379
            0.0                    1.0         0.0               0.0             0.0              286
                                   0.0         0.0               1.0             0.0              258
dtype: int64

Also looks okay for now.

In [70]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
city_development_index,21287.0,0.828462,0.123537,0.448,0.739,0.903,0.92,0.949
gender_Male,21287.0,0.875417,0.330253,0.0,1.0,1.0,1.0,1.0
gender_Female,21287.0,0.11359,0.317321,0.0,0.0,0.0,0.0,1.0
gender_Other,21287.0,0.010993,0.10427,0.0,0.0,0.0,0.0,1.0
relevent_experience,21287.0,0.7195,0.449254,0.0,0.0,1.0,1.0,1.0
enrolled_university_no_enrollment,21287.0,0.738526,0.439448,0.0,0.0,1.0,1.0,1.0
enrolled_university_full_time,21287.0,0.198431,0.398828,0.0,0.0,0.0,0.0,1.0
enrolled_university_part_time,21287.0,0.063043,0.243046,0.0,0.0,0.0,0.0,1.0
education_level_Graduate,21287.0,0.628412,0.483241,0.0,0.0,1.0,1.0,1.0
education_level_Masters,21287.0,0.228261,0.419722,0.0,0.0,0.0,0.0,1.0


# III. Save Clean df

In [68]:
df.to_csv('../data/HRAnalytics_clean.csv')