In [1]:
import pandas as pd
import numpy as np

In [2]:
from pathlib import Path
import sys
# put the resolved parent directory of the working directory on the module search path;
# presumably this is what makes the `scripts` package imported below resolvable
sys.path.insert(1, str(Path("../").resolve()))

# importing data pipelines
from scripts.pipelines.a_feature_selection import FeatureSelectionPipeline
from scripts.pipelines.b_data_preparation import DataPreparationPipeline
from scripts.pipelines.d_data_preprocessing import DataPreprocessingPipeline
from scripts.pipelines.e_feature_selection_2 import FeatureSelectionPipeline2
from scripts.utils import load_csv

<br>
<br>
<br>

Now that we have our model ready, let's check its performance on the test data.

First we'll transform our training and test data through data pipelines.<br>
Then we'll train the model using train data.<br>
And later we'll test our model on test data.

<br>
<br>
<br>

In [3]:
# training split: read it from disk, then preview the first three rows
train_data = load_csv("../pipeline_data/1_train_data.csv")
train_data.head(n=3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N28,N29,N30,N31,N32,N33,N34,N35,Unique_ID.1,Dependent_Variable
0,Candidate_16310,1,1,31,1,1,False,4,True,35.0,...,0.0,0.0,1000.0,442.56,32.0,40.0,90.47,16.0,Candidate_16310,0
1,Candidate_1629,2,3,17,18,1,False,7,True,20.39,...,0.0,0.0,3000.0,0.0,-30.0,200.0,267.11,5.0,Candidate_1629,1
2,Candidate_52001,1,0,0,2,0,False,0,True,28.0,...,,,,,,42.0,86.86,15.0,Candidate_52001,1


In [4]:
# held-out test split: read it from disk, then preview the first three rows
test_data = load_csv("../pipeline_data/1_test_data.csv")
test_data.head(n=3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N28,N29,N30,N31,N32,N33,N34,N35,Unique_ID.1,Dependent_Variable
0,Candidate_10005,1,0,5,0,0,True,0,True,23.0,...,,,,,,300.0,580.65,27.0,Candidate_10005,1
1,Candidate_17632,1,0,2,1,0,False,0,True,10.5,...,,,,,,130.0,208.53,33.0,Candidate_17632,0
2,Candidate_29026,1,0,3,41,2,True,4,True,13.74,...,,,,,,100.0,170.26,20.0,Candidate_29026,0


In [5]:
# (rows, columns) of the raw training and test frames, one tuple per line
print(train_data.shape, test_data.shape, sep="\n")

(28050, 46)
(5000, 46)


In [6]:
# test features: remove the duplicated id column and the target *by name*
# instead of by position, so a change in column order raises a KeyError
# rather than silently leaving the target inside the feature frame
x_test = test_data.drop(columns=['Unique_ID.1', 'Dependent_Variable'])
x_test.head(3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
0,Candidate_10005,1,0,5,0,0,True,0,True,23.0,...,,,,,,,,300.0,580.65,27.0
1,Candidate_17632,1,0,2,1,0,False,0,True,10.5,...,,,,,,,,130.0,208.53,33.0
2,Candidate_29026,1,0,3,41,2,True,4,True,13.74,...,,,,,,,,100.0,170.26,20.0


In [7]:
# test target: selected by column name so the cell does not rely on the
# target happening to be the last column of the frame
y_test = test_data['Dependent_Variable']
y_test.head(3)

0    1
1    0
2    0
Name: Dependent_Variable, dtype: int64

In [8]:
# shapes of the test feature frame and the test target, one per line
print(x_test.shape, y_test.shape, sep="\n")

(5000, 44)
(5000,)


<br>
<br>
<br>

### Pipeline

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# chain the four project transformers into a single scikit-learn Pipeline;
# the steps are applied in the order they are listed here
full_pipeline = Pipeline(steps=[
    ('feature_selection', FeatureSelectionPipeline()),
    ('data_preparation', DataPreparationPipeline(imputation='aggr')),
    ('data_preprocessing', DataPreprocessingPipeline()),
    ('feature_selection_2', FeatureSelectionPipeline2()),
])

<br>
<br>

In [11]:
# fit every pipeline step on the training frame and transform it in the same call
train_transformed = full_pipeline.fit_transform(X=train_data)
train_transformed.head(n=3)

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N15,N17,N18,N19,N22,N23,N24,N33,N35,Dependent_Variable
0,0,0,0,0,0,0,1,0,1,0,...,0.0,0.026433,0.735849,-1.076459,-1.0,0.6875,-0.317073,-0.471698,-0.333333,0
1,0,1,0,0,1,0,0,0,1,0,...,0.0,-4.699768,-1.056604,-0.473977,-1.0,-0.25,-0.731707,1.037736,-1.555556,1
2,0,0,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.4375,-0.780488,-0.45283,-0.444444,1


In [12]:
# shape of the transformed training frame, then True/False: is any value still missing?
print(train_transformed.shape)
train_transformed.isna().any(axis=None)

(28050, 30)


False

<br>
<br>

In [13]:
# push the test features through the already-fitted pipeline (transform only, no re-fitting)
x_test_transformed = full_pipeline.transform(X=x_test)
x_test_transformed.head(n=3)

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N12,N15,N17,N18,N19,N22,N23,N24,N33,N35
0,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.6875,0.853659,1.981132,0.888889
1,0,0,0,1,0,0,1,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.0625,1.170732,0.377358,1.555556
2,1,0,0,1,0,0,0,1,0,0,...,0.0,0.0,-1.823559,-0.09434,-1.112701,-1.0,-0.6875,-0.04878,0.09434,0.111111


In [14]:
# shape of the transformed test features, then True/False: is any value still missing?
print(x_test_transformed.shape)
x_test_transformed.isna().any(axis=None)

(5000, 29)


False

In [15]:
# verifying columns: the transformed test features must line up, name for name
# and in the same order, with the training features (every column except the
# trailing target). Fail fast here so a mismatch cannot go unnoticed on Run All.
train_feature_columns = train_transformed.columns[:-1]
assert list(train_feature_columns) == list(x_test_transformed.columns), \
    "transformed train and test feature columns differ"
# keep the original visual summary of the element-wise comparison
pd.Series(train_feature_columns == x_test_transformed.columns).value_counts()

True    29
dtype: int64

<br>
<br>
<br>

In [16]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [18]:
# split the transformed training frame into the feature matrix and a flat target array
x_train = train_transformed.drop(columns=['Dependent_Variable'])
y_train = np.ravel(train_transformed['Dependent_Variable'])  # 1-D array of labels
print(x_train.shape, y_train.shape, sep="\n")

(28050, 29)
(28050,)


In [19]:
# hyper-parameters unpacked into LGBMClassifier in the cell that builds `clf`
# (presumably the winning configuration of an earlier tuning run — TODO link to it)
best_params = dict(
    n_estimators=700,
    learning_rate=0.1921286133667085,
    num_leaves=2360,
    max_depth=9,
    min_data_in_leaf=1000,
    lambda_l1=30,
    lambda_l2=20,
    min_gain_to_split=2.993720055467331,
    bagging_fraction=0.9,
    bagging_freq=1,
    feature_fraction=0.5,
)

In [20]:
# classifier built from the chosen hyper-parameters; the seed is fixed for repeatable runs
clf = LGBMClassifier(random_state=42, **best_params)

In [21]:
# fit the classifier on the transformed training features and labels
clf.fit(X=x_train, y=y_train)



In [22]:
# ROC AUC on the data the model was trained on, using the second
# predict_proba column as the score
predictions = clf.predict_proba(X=x_train)[:, 1]
roc_auc_score(y_true=y_train, y_score=predictions)

0.766015054907841

In [23]:
# mean ROC AUC over 10 cross-validation folds, computed on all available cores
# NOTE(review): x_train comes from a pipeline that was fitted on the whole training
# frame before this split, so anything the pipeline learns from the data is shared
# across folds — the score may be slightly optimistic; TODO confirm
scores = cross_val_score(
    estimator=clf,
    X=x_train,
    y=y_train,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
scores.mean()

0.7532942629529488

<br>
<br>

Making predictions on test data

In [24]:
# ROC AUC on the held-out test set, using the second predict_proba column
# as the score
predictions = clf.predict_proba(X=x_test_transformed)[:, 1]
roc_auc_score(y_true=np.ravel(y_test), y_score=predictions)

0.7617964289783631

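Our model performed well on the test data: the test ROC AUC (≈0.762) is close to both the training score (≈0.766) and the cross-validation score (≈0.753).<br>
<br>
This indicates that the model is able to generalize well and<br>
that it is not overfitted to the training data.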