In [21]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn import svm


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate

import joblib

In [22]:
data_temp = pd.read_csv('./data/archive/aug_train.csv')
data = data_temp.dropna()
data.head()

y = data['target'].astype(int)
data = data.drop(['city','enrollee_id', 'target'], axis= 1)

# Data pre-processing

In [23]:
data.isnull().sum()

city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64

In [24]:
import marshal
from types import FunctionType
from sklearn.base import BaseEstimator, TransformerMixin

class MyFunctionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, f):
        self.func = f
    def __call__(self, X):
        return self.func(X)
    def __getstate__(self):
        self.func_name = self.func.__name__
        self.func_code = marshal.dumps(self.func.__code__)
        del self.func
        return self.__dict__
    def __setstate__(self, d):
        d["func"] = FunctionType(marshal.loads(d["func_code"]), globals(), d["func_name"])
        del d["func_name"]
        del d["func_code"]
        self.__dict__ = d
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return self.func(X)

In [25]:
def trans_func(data):
    num_col = data.copy()
    num_col.loc[data['relevent_experience'] == 'No relevent experience', 'relevent_experience'] = 0
    num_col.loc[data['relevent_experience'] == 'Has relevent experience', 'relevent_experience'] = 1
    num_col['relevent_experience']= num_col['relevent_experience'].astype(int)

    num_col.loc[data['experience'] == '>20', 'experience']= 20
    num_col.loc[data['experience'] == '<1', 'experience']= 1
    num_col['experience']= num_col['experience'].astype(int)

    num_col.loc[data['last_new_job'] == '>4', 'last_new_job']= 4
    num_col.loc[data['last_new_job']== 'never', 'last_new_job']= 1000
    num_col['last_new_job']= num_col['last_new_job'].astype(int)

    num_col.loc[data['gender'] == 'Male', 'gender'] = 0
    num_col.loc[data['gender'] == 'Female', 'gender'] = 1
    num_col.loc[data['gender'] == 'Other', 'gender'] = 2
    num_col['gender']= num_col['gender'].astype(int)

    num_col.loc[data['education_level'] == 'Graduate', 'education_level'] = 0
    num_col.loc[data['education_level'] == 'Masters', 'education_level'] = 1
    num_col.loc[data['education_level'] == 'Phd', 'education_level'] = 2
    num_col['education_level']= num_col['education_level'].astype(int)

    num_col.loc[data['enrolled_university'] == 'no_enrollment', 'enrolled_university'] = 0
    num_col.loc[data['enrolled_university'] == 'Part time course', 'enrolled_university'] = 1
    num_col.loc[data['enrolled_university'] == 'Full time course', 'enrolled_university'] = 2
    num_col['enrolled_university']= num_col['enrolled_university'].astype(int)


    num_col.loc[data['company_size'] == '<10', 'company_size'] = 0
    num_col.loc[data['company_size'] == '10/49', 'company_size'] = 1
    num_col.loc[data['company_size'] == '50-99', 'company_size'] = 2
    num_col.loc[data['company_size'] == '100-500', 'company_size'] = 3
    num_col.loc[data['company_size'] == '500-999', 'company_size'] = 4
    num_col.loc[data['company_size'] == '1000-4999', 'company_size'] = 5
    num_col.loc[data['company_size'] == '5000-9999', 'company_size'] = 6
    num_col.loc[data['company_size'] == '10000+', 'company_size'] = 7
    num_col['company_size']= num_col['company_size'].astype(int)


#     enc = OrdinalEncoder()
#     num_col['education_level'] = enc.fit_transform(
#                             num_col['education_level'].values.reshape(-1,1),
#                             ['Graduate', 'Masters', 'Phd'])
#     num_col['enrolled_university'] = enc.fit_transform(
#                                 num_col['enrolled_university'].values.reshape(-1,1), 
#                                 ['no_enrollment', 'Part time course', 'Full time course'])
#     num_col['company_size'] = enc.fit_transform(
#                     num_col['company_size'].values.reshape(-1,1), 
#                     ['<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+'])

#     print(num_col)

    num_col.loc[data['major_discipline'] == 'Arts', 'major_discipline'] = 0
    num_col.loc[data['major_discipline'] == 'Business Degree', 'major_discipline'] = 1
    num_col.loc[data['major_discipline'] == 'Humanities', 'major_discipline'] = 2
    num_col.loc[data['major_discipline'] == 'No Major', 'major_discipline'] = 3
    num_col.loc[data['major_discipline'] == 'Other', 'major_discipline'] = 4
    num_col.loc[data['major_discipline'] == 'STEM', 'major_discipline'] = 5
    num_col['major_discipline']= num_col['major_discipline'].astype(int)


    num_col.loc[data['company_type'] == 'Early Stage Startup', 'company_type'] = 0
    num_col.loc[data['company_type'] == 'Funded Startup', 'company_type'] = 1
    num_col.loc[data['company_type'] == 'NGO', 'company_type'] = 2
    num_col.loc[data['company_type'] == 'Other', 'company_type'] = 3
    num_col.loc[data['company_type'] == 'Public Sector', 'company_type'] = 4
    num_col.loc[data['company_type'] == 'Pvt Ltd', 'company_type'] = 5
    num_col['company_type']= num_col['company_type'].astype(int)

    X = num_col


#     cat_cols= ['major_discipline', 'company_type']
#     ohe_transform= ColumnTransformer(transformers= [('cat', OneHotEncoder(), cat_cols)], sparse_threshold= 0)
#     ohe_df= pd.DataFrame(ohe_transform.fit_transform(X), columns= ohe_transform.get_feature_names())
#     X[ohe_transform.get_feature_names()]= ohe_transform.fit_transform(X)
#     X= X.drop(cat_cols, axis= 1)


#     sca = StandardScaler()
#     X.loc[:, ["city_development_index", "training_hours"]] = sca.fit_transform(num_col.loc[:, ["city_development_index", "training_hours"]])
    return X



In [26]:
set(data['company_type'].values)

{'Early Stage Startup',
 'Funded Startup',
 'NGO',
 'Other',
 'Public Sector',
 'Pvt Ltd'}

In [27]:
set(data['major_discipline'].values)

{'Arts', 'Business Degree', 'Humanities', 'No Major', 'Other', 'STEM'}

# Models

### DecisionTreeClassifier

In [28]:
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(trans_func)

In [29]:
X = trans_func(data)

In [40]:
type(X)

pandas.core.frame.DataFrame

In [30]:
dt= DecisionTreeClassifier()
dt_grid= {'max_depth': [None, 2, 5, 10, 15], 'min_samples_split': [2, 10, 100, 500], 'random_state': [50]}

dt_search= GridSearchCV(dt, dt_grid, scoring= 'roc_auc', n_jobs= -1, cv= 10)
dt_search.fit(X, y)
print('Best hyperparameter settings:', dt_search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', dt_search.best_score_)

Best hyperparameter settings: {'max_depth': 10, 'min_samples_split': 500, 'random_state': 50}
Corresponding ROC-AUC score (avg of 10-fold CV): 0.767355061030622


In [31]:
dt.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

### XGBClassifier

In [32]:
xgb= XGBClassifier(objective= 'binary:logistic', random_state= 24, n_estimators= 100, max_depth= 3, learning_rate= 0.1, alpha= 10)
xgb_cv= cross_validate(xgb, X, y, n_jobs= -1, scoring= 'roc_auc', cv= 10)
print('XGBoost model ROC-AUC score:', np.mean(xgb_cv['test_score']))

XGBoost model ROC-AUC score: 0.7696893906010527


### LogisticRegression

In [33]:
logreg=LogisticRegression(solver="lbfgs", multi_class="auto",max_iter=5000)
log_grid={"C":np.logspace(-3,3,7), 'penalty': ['l2']}
search=GridSearchCV(logreg, log_grid, cv=10)
search.fit(X,y)
print("tuned hpyerparameters :(best parameters) ",search.best_params_)
print("accuracy :",search.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
accuracy : 0.8454482990822028


### SVC

In [20]:
svc_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]
svc = svm.SVC()
svc_search= GridSearchCV(svc, svc_grid, scoring= 'precision', n_jobs= -1, cv= 10)
svc_search.fit(X, y)
print('Best hyperparameter settings:', search.best_params_)
print('Corresponding ROC-AUC score (avg of 10-fold CV):', search.best_score_)

Best hyperparameter settings: {'C': 1000.0, 'penalty': 'l2'}
Corresponding ROC-AUC score (avg of 10-fold CV): 0.8454482990822028


In [44]:
# import marshal
# from types import FunctionType
# from sklearn.base import BaseEstimator, TransformerMixin

# class MyFunctionTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, f):
#         self.func = f
#     def __call__(self, X):
#         return self.func(X)
#     def __getstate__(self):
#         self.func_name = self.func.__name__
#         self.func_code = marshal.dumps(self.func.__code__)
#         del self.func
#         return self.__dict__
#     def __setstate__(self, d):
#         d["func"] = FunctionType(marshal.loads(d["func_code"]), globals(), d["func_name"])
#         del d["func_name"]
#         del d["func_code"]
#         self.__dict__ = d
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return self.func(X)

In [36]:
X = trans_func(data)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)

log_reg=LogisticRegression(solver="lbfgs", multi_class="auto",max_iter=5000)
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

0.8365341670388566

In [37]:
print('Exporting the trained model')
# joblib.dump(log_reg_pipe, 'log_reg_pip.pkl', compress=1)
joblib.dump(log_reg, 'log_reg.joblib')


Exporting the trained model


['log_reg.joblib']