In [1]:
import pandas as pd
import numpy as np

In [2]:
from pathlib import Path
import sys
sys.path.insert(1, str(Path("../").resolve()))

# importing data pipelines
from scripts.pipelines.a_feature_selection import FeatureSelectionPipeline
from scripts.pipelines.b_data_preparation import DataPreparationPipeline
from scripts.pipelines.d_data_preprocessing import DataPreprocessingPipeline
from scripts.pipelines.e_feature_selection_2 import FeatureSelectionPipeline2
from scripts.utils import load_csv

<br>
<br>
<br>

### Training on full data

Now that we have our model ready, let's train it on complete dataset one last time.

In [3]:
# loading raw x_train
x_train = pd.read_csv("../Training/X_train.csv")
x_train.head(3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
0,Candidate_5926,1,0,11,31,0,False,0,True,23.75,...,,,,,,,,58.0,113.39,12.0
1,Candidate_48134,1,4,2,66,2,False,1,True,11.05,...,,,,,,,,160.0,262.1,17.0
2,Candidate_51717,1,0,19,2,0,False,0,True,29.0,...,,,,,,,,24.0,50.29,18.0


In [4]:
# loading raw y_train
y_train = pd.read_csv("../Training/Y_train.csv")
y_train.head(3)

Unnamed: 0,Unique_ID,Dependent_Variable
0,Candidate_5926,1
1,Candidate_48134,0
2,Candidate_51717,1


In [5]:
# combining training data
total_train = pd.concat([x_train, y_train], axis=1)
total_train.head(3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N28,N29,N30,N31,N32,N33,N34,N35,Unique_ID.1,Dependent_Variable
0,Candidate_5926,1,0,11,31,0,False,0,True,23.75,...,,,,,,58.0,113.39,12.0,Candidate_5926,1
1,Candidate_48134,1,4,2,66,2,False,1,True,11.05,...,,,,,,160.0,262.1,17.0,Candidate_48134,0
2,Candidate_51717,1,0,19,2,0,False,0,True,29.0,...,,,,,,24.0,50.29,18.0,Candidate_51717,1


In [6]:
# loading raw x_test
x_test = pd.read_csv("../Test/X_test.csv")
x_test.head(3)

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
0,Candidate_1602,1,0,0,23,0,True,0,True,18.0,...,,,,,,,,50.0,90.38,23.0
1,Candidate_29650,1,0,2,4,2,True,2,True,16.75,...,,,,,,,,300.0,532.93,16.0
2,Candidate_31061,1,2,3,38,1,False,4,True,29.99,...,6.0,6.0,0.0,0.0,5000.0,4334.59,-82.0,80.0,169.78,22.0


In [7]:
print(total_train.shape)
print(x_test.shape)

(33050, 46)
(11017, 44)


<br>
<br>
<br>

### Data Pipeline

In [8]:
from sklearn.pipeline import Pipeline

In [9]:
# full data pipeline
full_pipeline = Pipeline([
    ('feature_selection', FeatureSelectionPipeline()),
    ('data_preparation', DataPreparationPipeline(imputation='aggr')),
    ('data_preprocessing', DataPreprocessingPipeline()),
    ('feature_selection_2', FeatureSelectionPipeline2()),
])

In [10]:
# transforming training data
train_transformed = full_pipeline.fit_transform(total_train)
train_transformed.head()

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N15,N17,N18,N19,N22,N23,N24,N33,N35,Dependent_Variable
0,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.38075,-0.707317,-0.301887,-0.777778,1
1,0,0,0,0,0,0,0,0,0,0,...,0.0,-0.617004,-0.943396,0.802356,0.0,0.0,1.95122,0.660377,-0.222222,0
2,0,0,0,1,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,-0.625,0.609756,-0.622642,-0.111111,1
3,0,0,0,0,0,0,0,0,1,0,...,0.0,0.165575,0.622642,-0.472476,-1.0,-0.4375,-0.513073,-0.188679,0.888889,0
4,1,0,0,0,0,0,1,0,1,0,...,0.0,1.218203,0.773585,0.005352,-1.0,0.125,1.02439,0.09434,0.222222,0


In [11]:
print(train_transformed.shape)
train_transformed.isna().any().any()

(33050, 30)


False

<br>
<br>

In [12]:
# transforming test data
x_test_transformed = full_pipeline.transform(x_test)
x_test_transformed.head(3)

Unnamed: 0,C6,C1_2,C1_3,C2_0,C2_3,C3_0,C4_1,C4_41,C5_1,C5_4,...,N12,N15,N17,N18,N19,N22,N23,N24,N33,N35
0,1,0,0,1,0,1,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.8125,-0.243902,-0.377358,0.444444
1,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,1.235835,0.509434,0.50622,1.0,-0.125,0.472146,1.981132,-0.333333
2,0,0,0,0,0,0,0,0,1,0,...,0.0,1.0,-0.602708,0.433962,-0.743567,1.0,0.25,-0.317073,-0.09434,0.333333


In [13]:
print(x_test_transformed.shape)
x_test_transformed.isna().any().any()

(11017, 29)


False

In [14]:
# verifying columns
pd.Series(train_transformed.columns[:-1] == x_test_transformed.columns).value_counts()

True    29
dtype: int64

<br>
<br>
<br>

In [15]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

Training LGBM model

In [16]:
# x_train & y_train
x_train = train_transformed.drop(['Dependent_Variable'], axis=1)
y_train = train_transformed[['Dependent_Variable']]
y_train = np.ravel(y_train)
print(x_train.shape)
print(y_train.shape)

(33050, 29)
(33050,)


In [17]:
# best set of parameters
best_params = {
    'n_estimators': 700,
    'learning_rate': 0.1921286133667085,
    'num_leaves': 2360,
    'max_depth': 9,
    'min_data_in_leaf': 1000,
    'lambda_l1': 30,
    'lambda_l2': 20,
    'min_gain_to_split': 2.993720055467331,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'feature_fraction': 0.5
}

In [18]:
# model
clf = LGBMClassifier(**best_params, random_state=42)

In [19]:
# training our model
clf.fit(x_train, y_train)



In [20]:
# score on training data
predictions = clf.predict_proba(x_train)[:, 1]
roc_auc_score(y_train, predictions)

0.7675201778402554

In [21]:
# cross validation score
scores = cross_val_score(clf, X=x_train, y=y_train, scoring='roc_auc', cv=10, n_jobs=-1)
scores.mean()

0.7555115241131726

<br>
<br>

Making predictions on test data

In [22]:
predictions = clf.predict_proba(x_test_transformed)[:, 1]

In [23]:
predictions

array([0.37156964, 0.37375949, 0.44793456, ..., 0.50902132, 0.20251667,
       0.18362535])

<br>
<br>
<br>

### Submission

In [24]:
# loading sample submission
submission = pd.read_csv("../sample_submission_file.csv")
submission.head()

Unnamed: 0,Unique_ID,Class_1_Probability
0,Candidate_17537,0.5
1,Candidate_21230,0.5
2,Candidate_41937,0.5
3,Candidate_19223,0.5
4,Candidate_34995,0.5


In [25]:
# replacing sample values with actual predictions
submission['Class_1_Probability'] = predictions
submission.head()

Unnamed: 0,Unique_ID,Class_1_Probability
0,Candidate_17537,0.37157
1,Candidate_21230,0.373759
2,Candidate_41937,0.447935
3,Candidate_19223,0.182268
4,Candidate_34995,0.389754


In [26]:
# saving as a csv file
submission.to_csv("../final_submission.csv", index=None)