In [30]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn import svm


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate

In [2]:
train_data_temp = pd.read_csv('./data/archive/aug_train.csv')
train_data = train_data_temp.dropna()
train_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
7,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18,1.0
8,27107,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46,1.0
11,23853,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,5000-9999,Pvt Ltd,1,108,0.0


# Data pre-processing

In [3]:
train_data.isnull().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

In [4]:
num_col = train_data.copy()
num_col.loc[train_data['relevent_experience'] == 'No relevent experience', 'relevent_experience'] = 0
num_col.loc[train_data['relevent_experience'] == 'Has relevent experience', 'relevent_experience'] = 1
num_col['relevent_experience']= num_col['relevent_experience'].astype(int)

In [5]:
num_col.loc[train_data['experience'] == '>20', 'experience']= 20
num_col.loc[train_data['experience'] == '<1', 'experience']= 1
num_col['experience']= num_col['experience'].astype(int)

In [6]:
num_col.loc[train_data['last_new_job'] == '>4', 'last_new_job']= 4
num_col.loc[train_data['last_new_job']== 'never', 'last_new_job']= 1000
num_col['last_new_job']= num_col['last_new_job'].astype(int)

In [7]:
enc = OrdinalEncoder()
num_col['gender'] = enc.fit_transform(
                    num_col['gender'].values.reshape(-1,1), 
                    ['Male', 'Female', 'Other'])
num_col['education_level'] = enc.fit_transform(
                            num_col['education_level'].values.reshape(-1,1),
                            ['Graduate', 'Masters', 'Phd'])
num_col['enrolled_university'] = enc.fit_transform(
                                num_col['enrolled_university'].values.reshape(-1,1), 
                                ['no_enrollment', 'Part time course', 'Full time course'])
num_col['company_size'] = enc.fit_transform(
                    num_col['gender'].values.reshape(-1,1), 
                    ['<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+'])

In [8]:
set(train_data['company_type'].values)

{'Early Stage Startup',
 'Funded Startup',
 'NGO',
 'Other',
 'Public Sector',
 'Pvt Ltd'}

In [9]:
set(train_data['major_discipline'].values)

{'Arts', 'Business Degree', 'Humanities', 'No Major', 'Other', 'STEM'}

In [10]:
sca = StandardScaler()
num_col.loc[:, ["city_development_index", "training_hours"]] = sca.fit_transform(num_col.loc[:, ["city_development_index", "training_hours"]])

In [11]:
X= num_col.drop(['city','enrollee_id', 'target'], axis= 1)
y= num_col['target'].astype(int)

In [12]:
def ohe_features(cat_cols, X):
    ohe_transform= ColumnTransformer(transformers= [('cat', OneHotEncoder(), cat_cols)], sparse_threshold= 0)
    ohe_df= pd.DataFrame(ohe_transform.fit_transform(X), columns= ohe_transform.get_feature_names())
    X[ohe_transform.get_feature_names()]= ohe_transform.fit_transform(X)
    X= X.drop(cat_cols, axis= 1)
    return X 

In [13]:
cat_cols= ['major_discipline', 'company_type']
X= ohe_features(cat_cols, X)

In [14]:
X.dtypes

city_development_index         float64
gender                         float64
relevent_experience              int64
enrolled_university            float64
education_level                float64
experience                       int64
company_size                   float64
last_new_job                     int64
training_hours                 float64
cat__x0_Arts                   float64
cat__x0_Business Degree        float64
cat__x0_Humanities             float64
cat__x0_No Major               float64
cat__x0_Other                  float64
cat__x0_STEM                   float64
cat__x1_Early Stage Startup    float64
cat__x1_Funded Startup         float64
cat__x1_NGO                    float64
cat__x1_Other                  float64
cat__x1_Public Sector          float64
cat__x1_Pvt Ltd                float64
dtype: object

# Models

### DecisionTreeClassifier

In [15]:
dt= DecisionTreeClassifier()
dt_grid= {'max_depth': [None, 2, 5, 10, 15], 'min_samples_split': [2, 10, 100, 500], 'random_state': [50]}
search= GridSearchCV(dt, dt_grid, scoring= 'roc_auc', n_jobs= -1, cv= 10)
search.fit(X, y)
print('Best hyperparameter settings:', search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', search.best_score_)

Best hyperparameter settings: {'max_depth': 10, 'min_samples_split': 500, 'random_state': 50}
Corresponding ROC-AUC score (avg of 10-fold CV): 0.7636794772730393


### XGBClassifier

In [28]:
xgb= XGBClassifier(objective= 'binary:logistic', random_state= 24, n_estimators= 100, max_depth= 3, learning_rate= 0.1, alpha= 10)
xgb_cv= cross_validate(xgb, X, y, n_jobs= -1, scoring= 'roc_auc', cv= 10)
print('XGBoost model ROC-AUC score:', np.mean(xgb_cv['test_score']))

XGBoost model ROC-AUC score: 0.7681965313656063


### LogisticRegression

In [26]:
logreg=LogisticRegression(solver="lbfgs", multi_class="auto",max_iter=5000)
log_grid={"C":np.logspace(-3,3,7), 'penalty': ['l2']}
search=GridSearchCV(logreg, log_grid, cv=10)
search.fit(X,y)
print("tuned hpyerparameters :(best parameters) ",search.best_params_)
print("accuracy :",search.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 10.0, 'penalty': 'l2'}
accuracy : 0.8469006883479648


### SVC

In [37]:
svc_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]
svc = svm.SVC()
search= GridSearchCV(svc, svc_grid, scoring= 'precision', n_jobs= -1, cv= 10)
search.fit(X, y)
print('Best hyperparameter settings:', search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', search.best_score_)

Best hyperparameter settings: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Corresponding ROC-AUC score (avg of 10-fold CV): 0.5731549429567722
