In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [58]:
#load data
train_df = pd.read_csv('Assignment_Train.csv')
test_df = pd.read_csv('Assignment_Test.csv')

In [59]:
#to convert cibil score into float
train_df['Cibil Score'] = pd.to_numeric(train_df['Cibil Score'], errors='coerce')
test_df['Cibil Score'] = pd.to_numeric(test_df['Cibil Score'], errors='coerce')

In [63]:
def preprocess_data(df):
    # Convert to datetime
    df['APPLICATION LOGIN DATE'] = pd.to_datetime(df['APPLICATION LOGIN DATE'], format='%m/%d/%Y', errors='coerce')
    
    # Extract features from date after converting to datetime
    df['APPLICATION_MONTH'] = df['APPLICATION LOGIN DATE'].dt.month
    df['APPLICATION_DAY'] = df['APPLICATION LOGIN DATE'].dt.day
    df['APPLICATION_DAYOFWEEK'] = df['APPLICATION LOGIN DATE'].dt.dayofweek
    
    #categorical variables missing value
    categorical_features = df.select_dtypes(include=['object']).columns
    df[categorical_features] = df[categorical_features].fillna('Unknown')
    
    
    #boolean variables missing
    boolean_features = [col for col in df.columns if col.startswith('Phone Social Premium.')]
    for feature in boolean_features:
        df[feature] = df[feature].fillna(0).astype(int)


    #numerical variable missing
    numerical_features_int = df.select_dtypes(include=['int64']).columns
    numerical_features_float = df.select_dtypes(include=['float64']).columns    
    
    num_imputer_for_int = SimpleImputer(strategy='mean')
    for feature in numerical_features_int:
        if feature in df.columns:
            df[[feature]] = num_imputer_for_int.fit_transform(df[[feature]])
    
    num_imputer_for_float = SimpleImputer(strategy='mean')
    for feature in numerical_features_float:
        if feature in df.columns:
            df[[feature]] = num_imputer_for_float.fit_transform(df[[feature]])

In [64]:
preprocess_data(train_df)
preprocess_data(test_df)

print(train_df.isna().sum())  #to check for null values presnt

DEALER ID                                 0
APPLICATION LOGIN DATE                 3540
HDB BRANCH NAME                           0
HDB BRANCH STATE                          0
FIRST NAME                                0
MIDDLE NAME                               0
LAST NAME                                 0
mobile                                    0
AADHAR VERIFIED                           0
Cibil Score                               0
MOBILE VERIFICATION                       0
DEALER NAME                               0
TOTAL ASSET COST                          0
ASSET CTG                                 0
ASSET MODEL NO                            0
APPLIED AMOUNT                            0
PRIMARY ASSET MAKE                        0
Primary Asset Model No                    0
Personal Email Address                    0
MARITAL STATUS                            0
GENDER                                    0
DOB                                       0
AGE                             

In [65]:
train_df.dtypes   #to check type of variables

DEALER ID                                     float64
APPLICATION LOGIN DATE                 datetime64[ns]
HDB BRANCH NAME                                object
HDB BRANCH STATE                               object
FIRST NAME                                     object
MIDDLE NAME                                    object
LAST NAME                                      object
mobile                                        float64
AADHAR VERIFIED                                object
Cibil Score                                   float64
MOBILE VERIFICATION                              bool
DEALER NAME                                    object
TOTAL ASSET COST                              float64
ASSET CTG                                      object
ASSET MODEL NO                                float64
APPLIED AMOUNT                                float64
PRIMARY ASSET MAKE                             object
Primary Asset Model No                         object
Personal Email Address      

In [66]:
#to convert categorical values to integer values for logistic regression
label_encoder = LabelEncoder()
categorical_features = train_df.select_dtypes(include=['object']).columns
for i in categorical_features:
   train_df[i] = label_encoder.fit_transform(train_df[i])

categorical_features = test_df.select_dtypes(include=['object']).columns
for i in categorical_features:
   test_df[i] = label_encoder.fit_transform(test_df[i])

In [67]:
train_df   

Unnamed: 0,DEALER ID,APPLICATION LOGIN DATE,HDB BRANCH NAME,HDB BRANCH STATE,FIRST NAME,MIDDLE NAME,LAST NAME,mobile,AADHAR VERIFIED,Cibil Score,...,Phone Social Premium.whatsapp,Phone Social Premium.yatra,Phone Social Premium.zoho,phone_digitalage,phone_nameMatchScore,phone_phoneFootprintStrengthOverall,Application Status,APPLICATION_MONTH,APPLICATION_DAY,APPLICATION_DAYOFWEEK
0,106989.0,2022-07-20,140,4,4023,1204,490,9.210574e+09,0,726.000000,...,0,0,0,5324.0,67.222222,0,0,7.0,20.00000,2.000000
1,108975.0,2022-07-28,397,2,207,1204,1409,8.877987e+09,0,706.402118,...,0,0,0,1998.0,100.000000,0,0,7.0,28.00000,3.000000
2,111004.0,2022-07-15,130,24,239,1204,2898,8.910862e+09,0,737.000000,...,0,0,0,-1.0,-1.000000,1,0,7.0,15.00000,4.000000
3,192020.0,NaT,446,21,76,1204,2684,9.758428e+09,0,713.000000,...,0,0,0,1998.0,72.777778,0,0,7.0,21.58808,2.935759
4,55095.0,2022-07-15,340,5,2689,404,127,9.687028e+09,0,669.000000,...,0,0,0,1998.0,68.095238,0,1,7.0,15.00000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,105101.0,NaT,163,21,105,1204,1916,8.400645e+09,0,706.402118,...,0,0,0,-1.0,60.576923,2,0,7.0,21.58808,2.935759
9996,85054.0,NaT,72,2,4042,594,2102,9.708884e+09,0,706.402118,...,0,0,0,1998.0,71.078431,0,0,7.0,21.58808,2.935759
9997,53710.0,NaT,317,15,3425,1204,2,9.888532e+09,0,706.402118,...,1,0,0,1988.0,100.000000,2,0,7.0,21.58808,2.935759
9998,89240.0,2022-07-29,337,21,3425,1204,2684,8.923338e+09,0,706.402118,...,0,0,0,1096.0,-1.000000,1,0,7.0,29.00000,4.000000


In [68]:
X = train_df.drop(columns=['Application Status', 'DEALER ID','MOBILE VERIFICATION','AADHAR VERIFIED','APPLICATION LOGIN DATE'])  # Drop target and UID columns and i dropped some extra columns cuz they were redundant('MOBILE VERIFICATION','AADHAR VERIFIED','APPLICATION LOGIN DATE')
y = train_df['Application Status']

# For test data, drop UID column only
X_test = test_df.drop(columns=['DEALER ID','UID','MOBILE VERIFICATION','AADHAR VERIFIED','APPLICATION LOGIN DATE'])

In [69]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)     #80-20 split to check the model

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Logistic Regression
log_reg = LogisticRegression()       #using regresion cuz of binary output
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_val)

# Random Forest
rf = RandomForestClassifier()             
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

# XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)


In [47]:
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_log_reg))
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print("XGBoost Accuracy:", accuracy_score(y_val, y_pred_xgb))

print("Logistic Regression Classification Report:\n", classification_report(y_val, y_pred_log_reg))
print("Random Forest Classification Report:\n", classification_report(y_val, y_pred_rf))
print("XGBoost Classification Report:\n", classification_report(y_val, y_pred_xgb))
#rf has best accuracy

Logistic Regression Accuracy: 0.6635
Random Forest Accuracy: 0.88
XGBoost Accuracy: 0.8895
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80      1327
           1       0.00      0.00      0.00       673

    accuracy                           0.66      2000
   macro avg       0.33      0.50      0.40      2000
weighted avg       0.44      0.66      0.53      2000

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91      1327
           1       0.82      0.83      0.82       673

    accuracy                           0.88      2000
   macro avg       0.86      0.87      0.87      2000
weighted avg       0.88      0.88      0.88      2000

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.92      1327
           1       0.84      0.8

In [74]:
test_predictions = rf.predict(X_test)
test_predictions

array([0, 0, 0, ..., 1, 0, 0])

In [76]:
result = pd.DataFrame({
    'UID': test_df['UID'],
    'Prediction': test_predictions
})

result['Prediction'] = result['Prediction'].apply(lambda x: 'APPROVED' if x == 1 else 'DECLINED')
result.to_csv('predictions.csv', index=False)