In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import class_weight

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Project directory path (SET THIS before running the notebook).
# The data paths below are built by plain string concatenation
# (folder + 'data/processed_data/...'), so a non-empty value has to end with
# a path separator.
folder = ""

## Loading Training - Test Sets

In [None]:
# Test set: `test_df` keeps every column, `test_df_final` is the version that
# is fed to the model, so its SK_ID_CURR column is removed.
test_df = pd.read_csv(folder+'data/processed_data/test_df.csv', sep=',')
test_df_final = pd.read_csv(folder+'data/processed_data/test_df_final.csv', sep=',')
test_df_final = test_df_final.drop(columns=['SK_ID_CURR'], errors='ignore')

# Training split and the validation split carved out of the training data.
train_df_train = pd.read_csv(folder+'data/processed_data/train_df_train.csv', sep=',')
train_df_test = pd.read_csv(folder+'data/processed_data/train_df_test.csv', sep=',')

# Features: everything except the label (TARGET) and the row id (SK_ID_CURR).
# Labels: kept as a single-column DataFrame holding only TARGET.
X_train_original = train_df_train.drop(columns=['TARGET', 'SK_ID_CURR'], errors='ignore')
y_train_original = train_df_train.loc[:, train_df_train.columns.isin(['TARGET'])]
X_test_original = train_df_test.drop(columns=['TARGET', 'SK_ID_CURR'], errors='ignore')
y_test_original = train_df_test.loc[:, train_df_test.columns.isin(['TARGET'])]

## Assigning class weights

In [None]:
# One weight per training row, derived from the training labels with the
# 'balanced' scheme; these are passed to the model as sample weights.
classes_weights = class_weight.compute_sample_weight('balanced', y_train_original)

# Show the resulting weight array as the cell output.
classes_weights

## Model Training (XGBoost)

In [None]:
# XGBoost classifier with a fixed seed so repeated runs use the same
# random state.
model = XGBClassifier(
    eta=0.3,
    random_state=15,
)

# Fit on the training split, weighting each row by its class weight.
model.fit(
    X_train_original,
    y_train_original,
    sample_weight=classes_weights,
)

## Model Results

Evaluation on the training set

In [None]:
# Predict on the very rows the model was fitted on.
y_pred = model.predict(X_train_original)
predictions = list(map(round, y_pred))

# Share of training rows whose predicted label matches TARGET.
accuracy = accuracy_score(y_true=y_train_original, y_pred=predictions)
print("Training Set Accuracy: %.2f%%" % (accuracy * 100.0))

Evaluation on the validation set (train_df_test)

In [None]:
# Predict on the validation split (train_df_test), which was not used for
# fitting the model.
y_pred = model.predict(X_test_original)
predictions = [round(value) for value in y_pred]

# Share of validation rows whose predicted label matches TARGET.
# The label says "Validation" because this split is train_df_test, not the
# separate test set stored in test_df.
accuracy = accuracy_score(y_test_original, predictions)
print("Validation Set Accuracy: %.2f%%" % (accuracy * 100.0))

### Predict labels for test set (test_df) instances
The following fields are added to the test set (test_df) that contains the data shown to participants:
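- Predicted_Result: the predicted label, 'Rejected' (class 0) or 'Accepted' (class 1)
- Prediction_Confidence_Accepted: predicted probability of class 1 ('Accepted'), as a percentage
- Prediction_Confidence_Rejected: predicted probability of class 0 ('Rejected'), as a percentage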

In [None]:
# Predicted labels for the test set, using the model-input frame
# (test_df_final), stored as a plain list of rounded values.
y_pred_test = model.predict(test_df_final)
predictions_test = list(map(round, y_pred_test))

In [None]:
# predictions_test

In [None]:
# test_df.drop(columns = ["Predicted_Result","Prediction_Confidence_Accepted","Prediction_Confidence_Rejected"], inplace=True)

In [None]:
# Columns this cell adds to test_df. Any copies left over from an earlier run
# of the cell are dropped first: DataFrame.insert raises ValueError when the
# column already exists, which made the cell impossible to re-run.
prediction_columns = [
    "Predicted_Result",
    "Prediction_Confidence_Accepted",
    "Prediction_Confidence_Rejected",
]
test_df.drop(columns=prediction_columns, errors="ignore", inplace=True)

test_df.insert(loc=1, column="Predicted_Result", value=predictions_test)

# Labels: turn the numeric classes into readable names.
test_df.replace({'Predicted_Result' : { 0 : 'Rejected', 1:  'Accepted'}},inplace=True)

# Prediction Confidence
# predict_proba is evaluated ONCE for the whole frame. Calling it inside a
# per-row loop re-ran inference on every row for every row (twice), i.e.
# 2 * n full passes for n rows.
class_probabilities = model.predict_proba(test_df_final)

# Column 1 is the probability of class 1 ('Accepted'), column 0 of class 0
# ('Rejected'); both are expressed as percentages rounded to 3 decimals.
# list(...) keeps `accepted` / `rejected` as lists of scalars.
accepted = list(np.round(100 * class_probabilities[:, 1], 3))
rejected = list(np.round(100 * class_probabilities[:, 0], 3))

test_df.insert(loc=2, column="Prediction_Confidence_Accepted", value=accepted)
test_df.insert(loc=3, column="Prediction_Confidence_Rejected", value=rejected)

test_df

In [None]:
# Write the test set, including the prediction columns, to disk without the
# DataFrame index.
test_df.to_csv(
    path_or_buf=folder+"data/processed_data/Model_Predictions.csv",
    index=False,
)

### Retrieval of features' names and importance (weight)

The feature names and their importance scores are collected in a separate table (`ft_df`) and saved to `FeatureImportance.csv`.


In [None]:
# Start with one row per feature: its name as known to the fitted booster and
# the importance reported by the model.
ft_df = pd.DataFrame(
    {
        "Feature_Name": model.get_booster().feature_names,
        'Importance': model.feature_importances_,
    }
)

# Disabled extra per-feature columns (they would be added here, before the
# transpose):
# ft_df["DP_Difference"] = dp_diff_list
# ft_df["DP_Ratio"] = dp_ratio_list
# ft_df["SR"] = sr_list

# Flip the table so that every feature becomes a column, then label those
# columns with the feature names held in the first row.
ft_df = ft_df.transpose()
ft_df = ft_df.rename(columns=ft_df.iloc[0])

# Strip the `_LE` suffix from these columns so they carry the plain
# feature names.
ft_df = ft_df.rename(
    columns={
        'CODE_GENDER_LE': 'CODE_GENDER',
        'NAME_CONTRACT_TYPE_LE': 'NAME_CONTRACT_TYPE',
        'FLAG_OWN_CAR_LE': "FLAG_OWN_CAR",
        'FLAG_OWN_REALTY_LE': 'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE_LE': "NAME_TYPE_SUITE",
        'NAME_INCOME_TYPE_LE': "NAME_INCOME_TYPE",
        'NAME_EDUCATION_TYPE_LE': "NAME_EDUCATION_TYPE",
        'NAME_FAMILY_STATUS_LE': "NAME_FAMILY_STATUS",
        'NAME_HOUSING_TYPE_LE': "NAME_HOUSING_TYPE",
        'OCCUPATION_TYPE_LE': "OCCUPATION_TYPE",
        'WEEKDAY_APPR_PROCESS_START_LE': "WEEKDAY_APPR_PROCESS_START",
        'ORGANIZATION_TYPE_LE': "ORGANIZATION_TYPE",
    }
)



In [None]:
# Write the feature-importance table to disk, keeping the row labels as the
# first CSV column.
ft_df.to_csv(
    path_or_buf=folder+"data/processed_data/FeatureImportance.csv",
    index=True,
)