In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce
from sklearn.impute import KNNImputer

In [6]:
# Load the HR analytics table, using the first CSV column as the row index.
# NOTE(review): the preview saved with this cell already contains one-hot
# columns (gender_Male, company_type_NGO, ...), while the encoding cells below
# expect raw columns such as 'gender' — confirm this is the intended input file.
data_path = '../data/HRAnalytics_clean.csv'
df = pd.read_csv(data_path, index_col=0)
df.head(5)

Unnamed: 0,enrollee_id,city,city_development_index,gender_Male,gender_Female,gender_Other,relevent_experience,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_part_time,...,company_size,company_type_private_limited,company_type_startup_funded,company_type_startup_earlystage,company_type_Other,company_type_public_sector,company_type_NGO,last_new_job,training_hours,target
0,8949,city_103,0.92,1.0,0.0,0.0,1,1.0,0.0,0.0,...,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,36,1.0
1,29725,city_40,0.776,1.0,0.0,0.0,0,1.0,0.0,0.0,...,2.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,47,0.0
2,11561,city_21,0.624,0.0,1.0,0.0,0,0.0,1.0,0.0,...,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,83,0.0
3,33241,city_115,0.789,1.0,0.0,0.0,0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,52,1.0
4,666,city_162,0.767,1.0,0.0,0.0,1,1.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,8,0.0


In [7]:
# Row and column counts of the loaded frame.
df.shape

(21287, 32)

# I. Encoding Categorical Columns

In [None]:
# List the columns still stored as generic Python objects (i.e. not yet numeric).
df.dtypes.loc[lambda dtypes: dtypes == 'object']

### a) Ordinal Columns

In [None]:
# Categorical columns treated as ordinal in this section.
ord_cols = [
    'experience',
    'company_size',
    'last_new_job',
]

In [None]:
# Peek at the ordinal columns before they are encoded.
df[ord_cols].head(5)

In [None]:
# Explicit label -> rank mappings for the ordinal columns, in the
# [{'col': ..., 'mapping': {...}}, ...] layout passed to the ordinal encoder.
# Each label list is written in ascending order, so a label's rank is simply
# its position in the list.
# NOTE(review): labels must match the values in the data exactly; the encoder
# cell asks for unknown labels to be returned as NaN, so a mismatch would not
# raise — verify against the actual category values.
ord_map = [
    {'col': column, 'mapping': {level: rank for rank, level in enumerate(levels)}}
    for column, levels in (
        ('experience',
         ['<1', '1-5', '6-10', '11-15', '16-20', '>20']),
        ('company_size',
         ['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+']),
        ('last_new_job',
         ['never', '1', '2', '3', '4', '>4']),
    )
]

In [None]:
# Encode the ordinal columns with the explicit rank mappings from `ord_map`.
# Missing values and labels absent from the mapping are requested back as NaN
# so they can be imputed later instead of receiving an arbitrary code.
ord_encoder = ce.OrdinalEncoder(
    cols=ord_cols,
    mapping=ord_map,
    handle_missing='return_nan',
    handle_unknown='return_nan',
    return_df=True,
)

# NOTE(review): `df` is overwritten with its encoded version, so running this
# cell twice would feed already-encoded values back through the mapping.
df = ord_encoder.fit_transform(df)

In [None]:
# Same preview as before, now showing the encoded ranks.
df.loc[:, ord_cols].head()

In [None]:
# Storage type of each encoded ordinal column.
df.dtypes.loc[ord_cols]

After the missing values are imputed (Section II), these columns will be converted to int.

### b) One-Hot Encode Columns

In [None]:
# Categorical columns to one-hot encode.
cat_cols = [
    'gender',
    'enrolled_university',
    'education_level',
    'major',
    'company_type',
]

In [None]:
# One-hot encode the categorical columns. `use_cat_names=True` asks for dummy
# columns named after the category label rather than a numeric suffix; missing
# and unknown values are requested back as NaN so they can be imputed later.
oh_encoder = ce.OneHotEncoder(
    cols=cat_cols,
    use_cat_names=True,
    handle_missing='return_nan',
    handle_unknown='return_nan',
    return_df=True,
)

# NOTE(review): `df` is overwritten; after this cell the source columns listed
# in `cat_cols` are presumably replaced by their dummy columns, so the cell is
# not safe to run twice — confirm.
df = oh_encoder.fit_transform(df)

In [None]:
# Collect the dummy columns produced for each source column in `cat_cols`,
# grouped by source column and otherwise in frame order.
# Match on the "<source>_" prefix instead of a substring test, so that an
# unrelated column which merely contains one of the source names somewhere in
# its own name is not picked up by accident.
cat_cols2 = [
    col_name
    for source in cat_cols
    for col_name in df.columns
    if col_name.startswith(f'{source}_')
]
df[cat_cols2].head()

### c) Ensure all column dtypes are correct

In [None]:
# Review the dtype of every column after both encoding steps.
df.dtypes

# II. Imputing Missing Values

In [None]:
# Missing-value count for every column before imputation.
df.isnull().sum()

### a) Ordinal Columns

In [None]:
# Missing-value count and percentage for each ordinal column
na_counts = df[ord_cols].isna().sum()
perc_na = (na_counts * 100 / len(df[ord_cols])).round(2)

pd.DataFrame({'Total NA': na_counts, '% NA': perc_na})

In [None]:
# Fill the gaps in the ordinal columns with a KNN imputer (default settings).
# Only the ordinal columns themselves are passed in, so they are the only
# features available for finding neighbours.
KNN_imputer = KNNImputer()

ord_imputed = KNN_imputer.fit_transform(df[ord_cols])
df[ord_cols] = ord_imputed
df[ord_cols].head()

In [None]:
# Convert ord cols to int.
# Imputed values can be fractional, so round to the nearest rank first and then
# actually cast to int (rounding alone leaves the columns as floats).
# astype(int) raises if any NaN is left, which surfaces an incomplete
# imputation instead of hiding it.
df[ord_cols] = df[ord_cols].round(0).astype(int)
df[ord_cols].head()

In [None]:
# Distribution of experience ranks after imputation.
df.loc[:, 'experience'].value_counts()

In [None]:
# Distribution of company-size ranks after imputation.
df.loc[:, 'company_size'].value_counts()

In [1]:
# Distribution of last_new_job ranks after imputation.
# NOTE(review): the saved execution count for this cell is 1 and its saved
# output is a NameError for `df`, i.e. it was last run on a fresh kernel before
# the load cell — re-run the notebook top to bottom before saving.
df.loc[:, 'last_new_job'].value_counts()

NameError: name 'df' is not defined

### b) One-Hot Columns

In [None]:
# Missing-value count and percentage for each one-hot column
onehot_na = df[cat_cols2].isna()
perc_na = round(onehot_na.sum() * 100 / len(onehot_na), 2)

pd.DataFrame({'Total NA': onehot_na.sum(), '% NA': perc_na})

In [None]:
# Indicator columns for missing categories, one per one-hot encoded source column.
nan_cols = ['gender_nan', 'enrolled_university_nan', 'education_level_nan', 'major_nan', 'company_type_nan']

# Remove NaN cols from df.
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so the cell can be re-run without a KeyError.
# NOTE(review): whether the encoder emits these *_nan columns at all may depend
# on the installed category_encoders version — confirm.
df = df.drop(columns=nan_cols, errors='ignore')
df.head()

In [None]:
# Remove NaN columns from cat_cols2.
# Filter the list in place rather than calling .remove() inside a throwaway
# comprehension: the filter does not raise when an entry is already absent,
# so the cell is safe to re-run.
cat_cols2[:] = [col for col in cat_cols2 if col not in nan_cols]
cat_cols2

In [None]:
# Dummy-column names grouped by the categorical column they came from.
# NOTE(review): these names are hard-coded; the preview saved with the load
# cell shows different company_type names (e.g. company_type_private_limited),
# so verify each list against df.columns before indexing with it.
gender_cols = [
    'gender_Male',
    'gender_Female',
    'gender_Other',
]
enrolled_cols = [
    'enrolled_university_no_enrollment',
    'enrolled_university_full_time',
    'enrolled_university_part_time',
]
ed_level_cols = [
    'education_level_Graduate',
    'education_level_Masters',
    'education_level_High School',
    'education_level_Phd',
    'education_level_Primary School',
]
major_cols = [
    'major_STEM',
    'major_Business Degree',
    'major_Arts',
    'major_Humanities',
    'major_No Major',
    'major_Other',
]
company = [
    'company_type_Pvt Ltd',
    'company_type_Funded Startup',
    'company_type_Early Stage Startup',
    'company_type_Other',
    'company_type_Public Sector',
    'company_type_NGO',
]

In [None]:
# Impute the one-hot columns from the single nearest neighbour
# (n_neighbors=1) — presumably so that filled-in values stay 0/1 rather than
# becoming an average of several neighbours.
KNN_imputer = KNNImputer(n_neighbors=1)

onehot_imputed = KNN_imputer.fit_transform(df[cat_cols2])
df[cat_cols2] = onehot_imputed
df[cat_cols2].head()

In [None]:
# Convert one-hot cols to int.
# Round first to guard against any non-integral imputed value, then actually
# cast to int (rounding alone leaves the columns as floats). astype(int) raises
# if any NaN is left, which surfaces an incomplete imputation.
df[cat_cols2] = df[cat_cols2].round(0).astype(int)
df[cat_cols2].head()

In [None]:
# Count each combination of gender dummies; this also shows whether any row
# ended up with an unexpected combination after imputation.
df.value_counts(subset=gender_cols)

In [None]:
# Count each combination of major dummies after imputation.
df.value_counts(subset=major_cols)

In [None]:
# Final check: missing-value count for every column after imputation.
df.isnull().sum()