In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer

import category_encoders as ce

In [2]:
df = pd.read_csv('../data/df_clean_nulls2.csv', index_col=0).drop(columns='city')
df.head()

Unnamed: 0,enrollee_id,city_development_index,gender,relevent_experience,enrolled_university,education_level,major,company_size,company_type,last_new_job,training_hours,target,exp_range,city_group
0,8949,0.92,Male,1,no_enrollment,Graduate,STEM,,,1,36,1.0,>20,0
1,29725,0.776,Male,0,no_enrollment,Graduate,STEM,50-99,private_limited,>4,47,0.0,11-15,3
2,11561,0.624,,0,full_time,Graduate,STEM,,,never,83,0.0,2-5,1
3,33241,0.789,,0,,Graduate,Business Degree,,private_limited,never,52,1.0,<=1,3
4,666,0.767,Male,1,no_enrollment,Masters,STEM,50-99,startup_funded,4,8,0.0,>20,3


In [3]:
df.shape

(21037, 14)

# I. Separate Train and Test Sets

First, I'll separate the pre-defined train and test sets. This test set does not include the target feature, and therefore cannot be used for training or validation.

In [4]:
train_df = df.loc[df['target'].notna()]
test_df = df.loc[df['target'].isna()]

train_df.shape, test_df.shape

((18908, 14), (2129, 14))

The train_df will now be split into its own train/test set for encoding/imputing.

In [5]:
# Split into test/train

#X = train_df.drop(columns='target')
#y = train_df.target

train, test = train_test_split(train_df, random_state=0)

#X_train.shape, y_train.shape, X_test.shape, y_test.shape
train.shape, test.shape

((14181, 14), (4727, 14))

# I. Encoding Categorical Columns

In [6]:
#preprocess_ord = make_pipeline(
#    ce.OrdinalEncoder(
#        cols=ord_cols, mapping=ord_map, return_df=True, handle_unknown='return_nan', handle_missing='return_nan'),
#    KNNImputer()
#)

#preprocess_cat = make_pipeline(
#    ce.OneHotEncoder(
#        cols=cat_cols, return_df=True, use_cat_names=True, handle_unknown='return_nan', handle_missing='return_nan'),
#    KNNImputer(n_neighbors=1)
#)

In [7]:
#preprocessor = ColumnTransformer(
#    transformers = [
#        ('ord', preprocess_ord, ord_cols),
#        ('cat', preprocess_cat, cat_cols)
#    ],
#    remainder='passthrough', n_jobs=-1, 
#)

In [8]:
train.dtypes[train.dtypes=='object']

gender                 object
enrolled_university    object
education_level        object
major                  object
company_size           object
company_type           object
last_new_job           object
exp_range              object
dtype: object

### a) Ordinal Columns

In [9]:
ord_cols = ['education_level', 'company_size', 'last_new_job', 'exp_range']

In [10]:
train['education_level'].value_counts()

Graduate          8731
Masters           3267
High School       1458
Phd                297
Primary School     228
Name: education_level, dtype: int64

In [11]:
ord_map = [
    {'col': 'education_level',
    'mapping': {'Primary School':0, 'High School':1, 'Graduate':2, 'Masters':3, 'Phd':4}},
    {'col': 'company_size',
     'mapping': {'<10': 0, '10-49': 1, '50-99': 2, '100-499':3, '500-999':4, '1000-4999':5, '5000-9999':6, '10000+':7}},
    {'col': 'last_new_job',
     'mapping': {'never': 0, '1': 1, '2': 2, '3':3, '4':4, '>4':5}},
    {'col': 'exp_range', 
     'mapping': {'<=1':0, '2-5':1, '6-10':2, '11-15':3, '16-20':4,'>20':5}}
]

In [12]:
ord_encoder = ce.OrdinalEncoder(
    cols=ord_cols, mapping=ord_map, return_df=True,
    handle_unknown='return_nan', handle_missing='return_nan'
)

ord_encoder.fit(train)

train = ord_encoder.transform(train)
test = ord_encoder.transform(test)

In [13]:
train.shape, test.shape

((14181, 14), (4727, 14))

In [14]:
# Original df

df[ord_cols].head()

Unnamed: 0,education_level,company_size,last_new_job,exp_range
0,Graduate,,1,>20
1,Graduate,50-99,>4,11-15
2,Graduate,,never,2-5
3,Graduate,,never,<=1
4,Masters,50-99,4,>20


In [15]:
# Encoded df

train[ord_cols].head()

Unnamed: 0,education_level,company_size,last_new_job,exp_range
2913,3.0,1.0,1.0,1.0
13038,3.0,7.0,2.0,2.0
13226,2.0,1.0,1.0,1.0
10167,0.0,,0.0,0.0
16867,2.0,2.0,4.0,5.0


In [16]:
train[ord_cols].dtypes

education_level    float64
company_size       float64
last_new_job       float64
exp_range          float64
dtype: object

Looks good. After imputing missing values, these columns will be converted to int.

### b) One-Hot Encode Columns

In [17]:
cat_cols = ['gender', 'enrolled_university', 'major', 'company_type']

In [18]:
oh_encoder = ce.OneHotEncoder(cols=cat_cols, return_df=True, use_cat_names=True,
                           handle_unknown='return_nan', handle_missing='return_nan')

oh_encoder.fit(train)

train = oh_encoder.transform(train)
test = oh_encoder.transform(test)

In [19]:
train_df.shape, test_df.shape

((18908, 14), (2129, 14))

In [20]:
# Create new list of cat_cols

cat_cols2 = []
for i in cat_cols:
    for col_name in train.columns:
        if i in col_name:
            cat_cols2.append(col_name)
cat_cols2

['gender_nan',
 'gender_Male',
 'gender_Female',
 'gender_Other',
 'enrolled_university_no_enrollment',
 'enrolled_university_full_time',
 'enrolled_university_part_time',
 'enrolled_university_nan',
 'major_STEM',
 'major_Other',
 'major_nan',
 'major_Humanities',
 'major_Arts',
 'major_No Major',
 'major_Business Degree',
 'company_type_private_limited',
 'company_type_startup_earlystage',
 'company_type_nan',
 'company_type_public_sector',
 'company_type_startup_funded',
 'company_type_NGO',
 'company_type_Other']

In [21]:
# Original df

df[cat_cols].head()

Unnamed: 0,gender,enrolled_university,major,company_type
0,Male,no_enrollment,STEM,
1,Male,no_enrollment,STEM,private_limited
2,,full_time,STEM,
3,,,Business Degree,private_limited
4,Male,no_enrollment,STEM,startup_funded


In [22]:
# After encoding

train[cat_cols2].head()

Unnamed: 0,gender_nan,gender_Male,gender_Female,gender_Other,enrolled_university_no_enrollment,enrolled_university_full_time,enrolled_university_part_time,enrolled_university_nan,major_STEM,major_Other,...,major_Arts,major_No Major,major_Business Degree,company_type_private_limited,company_type_startup_earlystage,company_type_nan,company_type_public_sector,company_type_startup_funded,company_type_NGO,company_type_Other
2913,,,,,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13038,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13226,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10167,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,,...,,,,,,,,,,
16867,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### c) Ensure all column dtypes are correct

In [23]:
train.dtypes

enrollee_id                            int64
city_development_index               float64
gender_nan                           float64
gender_Male                          float64
gender_Female                        float64
gender_Other                         float64
relevent_experience                    int64
enrolled_university_no_enrollment    float64
enrolled_university_full_time        float64
enrolled_university_part_time        float64
enrolled_university_nan              float64
education_level                      float64
major_STEM                           float64
major_Other                          float64
major_nan                            float64
major_Humanities                     float64
major_Arts                           float64
major_No Major                       float64
major_Business Degree                float64
company_size                         float64
company_type_private_limited         float64
company_type_startup_earlystage      float64
company_ty

# II. Imputing Missing Values

### a) Ordinal Columns

In [24]:
# Number of NAs in each column

perc_na = round(train_df[ord_cols].isna().sum() * 100 / len(train_df[ord_cols]), 2)

pd.DataFrame({'Total NA':train_df[ord_cols].isna().sum(), '% NA':perc_na})

Unnamed: 0,Total NA,% NA
education_level,284,1.5
company_size,5695,30.12
last_new_job,335,1.77
exp_range,46,0.24


In [25]:
# Number of NAs in each column

perc_na = round(test_df[ord_cols].isna().sum() * 100 / len(test_df[ord_cols]), 2)

pd.DataFrame({'Total NA':test_df[ord_cols].isna().sum(), '% NA':perc_na})

Unnamed: 0,Total NA,% NA
education_level,52,2.44
company_size,622,29.22
last_new_job,40,1.88
exp_range,5,0.23


In [26]:
KNN_imputer = KNNImputer()

KNN_imputer.fit(train_df[ord_cols])

train_df[ord_cols] = KNN_imputer.transform(train_df[ord_cols])
test_df[ord_cols] = KNN_imputer.transform(test_df[ord_cols])

ValueError: could not convert string to float: 'Graduate'

In [None]:
train_df[ord_cols].isna().sum()

In [None]:
# Convert ord cols to int

train_df[ord_cols] = round(train_df[ord_cols], 0)
test_df[ord_cols] = round(test_df[ord_cols], 0)

train_df[ord_cols].head()

In [None]:
train_df['education_level'].value_counts(), test_df['education_level'].value_counts()

In [None]:
train_df['company_size'].value_counts()

In [None]:
train_df['last_new_job'].value_counts()

In [None]:
train_df['exp_range'].value_counts()

The distributions don't seem to have changed.

In [None]:
# Convert ord cols to int

train_df[ord_cols] = train_df[ord_cols].astype('int64')
test_df[ord_cols] = test_df[ord_cols].astype('int64')

In [None]:
train_df[ord_cols].dtypes, test_df[ord_cols].dtypes

### b) One-Hot Columns

In [None]:
# Number of NAs in each column

perc_na = round(train_df[cat_cols2].isna().sum() * 100 / len(train_df[cat_cols2]), 2)

pd.DataFrame({'Total NA':train_df[cat_cols2].isna().sum(), '% NA':perc_na})

In [None]:
gender_cols = ['gender_Male', 'gender_Female', 'gender_Other']
enrolled_cols = ['enrolled_university_no_enrollment', 'enrolled_university_full_time', 'enrolled_university_part_time']
major_cols = ['major_STEM', 'major_Business Degree', 'major_Arts', 'major_Humanities', 'major_No Major', 'major_Other']
company = ['company_type_private_limited', 'company_type_startup_funded', 'company_type_startup_earlystage', 'company_type_Other',
           'company_type_public_sector', 'company_type_NGO']

In [None]:
nan_cols = ['gender_nan', 'enrolled_university_nan', 'major_nan', 'company_type_nan']

# Remove NaN columns from catcols2
[cat_cols2.remove(i) for i in nan_cols]

# Remove NaN cols from df
train_df.drop(columns=nan_cols, inplace=True)
test_df.drop(columns=nan_cols, inplace=True)

cat_cols2

In [None]:
KNN_imputer = KNNImputer(n_neighbors=1)

KNN_imputer.fit(train_df[cat_cols2])

train_df[cat_cols2] = KNN_imputer.transform(train_df[cat_cols2])
test_df[cat_cols2] = KNN_imputer.transform(test_df[cat_cols2])

In [None]:
train_df[cat_cols2].isna().sum()

In [None]:
# Convert cat cols to int

train_df[cat_cols2] = round(train_df[cat_cols2], 0)
test_df[cat_cols2] = round(test_df[cat_cols2], 0)

train_df[cat_cols2].head()

In [None]:
train_df[cat_cols2] = train_df[cat_cols2].astype('int64')
test_df[cat_cols2] = test_df[cat_cols2].astype('int64')

train_df[cat_cols2].dtypes

In [None]:
train_df[gender_cols].value_counts()

In [None]:
df['major'].value_counts()

In [None]:
train_df[major_cols].value_counts()

In [None]:
df['company_type'].value_counts()

In [None]:
pd.DataFrame(train_df[company].value_counts())

This distribution doesn't seem to follow the original exactly after KNN imputation.

In [None]:
train_df.isna().sum(), test_df.isna().sum()

In [None]:
# Check for multicollinearity again, now that all columns are numerical and imputed

plt.figure(figsize=(30,30))
sns.heatmap(train_df.corr(), annot=True, cmap='YlGnBu', mask=np.triu(train_df.corr()));

There seems to be some multicollinearity between one-hot encoded columns. For correlation coefficients above |.60|, one column will be dropped.

These are:
- gender_female * gender_male
- enrolled_full_time * no_enrollment

In [None]:
train_df.drop(columns=['gender_Male', 'enrolled_university_no_enrollment'], inplace=True)
test_df.drop(columns=['gender_Male', 'enrolled_university_no_enrollment'], inplace=True)

# III. Save Train/Test Data

In [None]:
# Merge train datasets for model training

#df_train = X_train.copy()
#df_train['target'] = y_train
#train_df.shape

In [None]:
train_df.shape, test_df.shape

In [None]:
train_df.to_csv('../data/Train_Final.csv')
test_df.to_csv('../data/Test_Final.csv')