In [291]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [292]:
def detect_data_types(df):
    discrete_text_container = []
    discrete_numeric_container = []
    continuous_container = []
    unknown_container = []

    for column in df.columns:
        col_data = df[column].dropna()
        unique_values = col_data.nunique()

        if col_data.dtype in ['int64', 'float64']:
            if col_data.dtype == 'int64' and unique_values / len(col_data) < 0.05:
                discrete_numeric_container.append(column)
            else:
                continuous_container.append(column)

        elif col_data.dtype == 'object' or col_data.dtype.name == 'category':
            discrete_text_container.append(column)

        elif unique_values / len(col_data) < 0.05:
            discrete_text_container.append(column)
        else:
            unknown_container.append(column)

    return [discrete_text_container, discrete_numeric_container, continuous_container, unknown_container]


def dataset_analysis(df, target_column=None):
    total_rows = len(df)
    unique_rows = len(df.drop_duplicates())
    duplicate_rows = total_rows - unique_rows
    unique_percentage = (unique_rows / total_rows) * 100
    duplicate_percentage = (duplicate_rows / total_rows) * 100

    print('=' * 50)
    print('Dataset Shape (Rows & Columns)')
    print('=' * 50)
    print('Rows :-', df.shape[0])
    print('Columns :-', df.shape[1])
    print(f"Unique Rows: {unique_rows} ({unique_percentage:.2f}%)")
    print(f"Duplicate Rows: {duplicate_rows} ({duplicate_percentage:.2f}%)")

    print("\n" + "=" * 50)
    print("Columns with Null Values and Null Value Counts")
    print("=" * 50)
    null_columns = df.columns[df.isnull().any()]
    if null_columns.empty:
        print("No columns contain null values.")
    else:
        for col in null_columns:
            null_count = df[col].isnull().sum()
            print(f"{col}: {null_count} null values")

    print("\n" + "=" * 50)
    print("Data Type Wise Column Count")
    print("=" * 50)
    print(df.dtypes.value_counts())

    print("\n" + "=" * 50)
    print("Discrete And Continuous Data Analysis")
    print("=" * 50)
    detectDataTypeObj = detect_data_types(df)
    discrete_text_columns, discrete_numeric_columns, continuous_columns, unknown_columns = detectDataTypeObj

    if len(detectDataTypeObj) > 0:
        print('Discrete Text Columns:', ', '.join(discrete_text_columns) if discrete_text_columns else '-----')
        print()
        print('Discrete Numeric Columns:', ', '.join(discrete_numeric_columns) if discrete_numeric_columns else '-----')
        print()
        print('Continuous Columns:', ', '.join(continuous_columns) if continuous_columns else '-----')
        print()
        print('Unknown Columns:', ', '.join(unknown_columns) if unknown_columns else '-----')

    if len(continuous_columns) > 0:
        print("\n" + "=" * 50)
        print("Skewness of Numerical Columns")
        print("=" * 50)
        skewness = df[continuous_columns].skew()
        for col in skewness.index:
            skew_value = skewness[col]
            if skew_value > 1:
                skew_range = "Strong Positive Skew"
                flag = "Consider transformation (e.g., log or square root)"
            elif skew_value >= 0.5:
                skew_range = "Mild Positive Skew"
                flag = ""
            elif -0.5 <= skew_value <= 0.5:
                skew_range = "Approximately Symmetric"
                flag = ""
            elif skew_value <= -1:
                skew_range = "Strong Negative Skew"
                flag = "Consider transformation (e.g., log or square root)"
            elif skew_value <= -0.5:
                skew_range = "Mild Negative Skew"
                flag = ""
            print(f"{col}: {skew_value:.2f} -> {skew_range}. {flag}")
    else:
        print("No numeric columns available for Skewness Analysis.")

    # Constant Unique Values
    print("\n" + "=" * 50)
    print("Constant Unique Values in Categorical Columns")
    print("=" * 50)
    categorical_cols = discrete_text_columns + discrete_numeric_columns
    if categorical_cols:
        for col in categorical_cols:
            unique_vals = df[col].dropna().unique()
            if len(unique_vals) == 1:
                sorted_vals = sorted(map(str, unique_vals))
                print(f"{col}: {sorted_vals}\n")

    # Unique Values in Categorical Columns
    print("\n" + "=" * 50)
    print("Unique Values in Categorical Columns")
    print("=" * 50)
    if categorical_cols:
        for col in categorical_cols:
            unique_vals = df[col].dropna().unique()
            if len(unique_vals) != 1:
                sorted_vals = sorted(map(str, unique_vals))
                if len(unique_vals)>100:
                    print(f"{col}: Greater than 100 unique values \n")
                else:
                    print(f"{col}: {sorted_vals}\n")

    # Target column imbalance analysis
    if target_column:
        print(f"\n{'=' * 50}\nImbalance Analysis for '{target_column}'\n{'=' * 50}")
        target_counts = df[target_column].value_counts()
        target_percentage = target_counts / len(df) * 100
        for label, count in target_counts.items():
            print(f"{label}: {count} ({target_percentage[label]:.2f}%)")

        imbalance_ratio = target_counts.min() / target_counts.max()
        print(f"\nThe dataset is {'imbalanced' if imbalance_ratio < 0.25 else 'balanced'} with respect to '{target_column}'.")


In [293]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')


In [294]:
train_df.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

<section>
  <h2>📊 Credit Score Classification – Feature Descriptions</h2>
  <table border="1" cellpadding="10" cellspacing="0" style="border-collapse: collapse; width: 100%;">
    <thead style="background-color: #f2f2f2;">
      <tr>
        <th>Feature Name</th>
        <th>Description</th>
      </tr>
    </thead>
    <tbody>
      <tr><td>ID</td><td>Unique identifier for each record</td></tr>
      <tr><td>Customer_ID</td><td>Unique identifier assigned to each customer</td></tr>
      <tr><td>Month</td><td>The month for which the record is collected (e.g., Jan-2022)</td></tr>
      <tr><td>Name</td><td>Name of the customer</td></tr>
      <tr><td>Age</td><td>Age of the customer</td></tr>
      <tr><td>SSN</td><td>Social Security Number (used for identity, may be masked)</td></tr>
      <tr><td>Occupation</td><td>Profession or job role of the customer</td></tr>
      <tr><td>Annual_Income</td><td>Yearly income of the customer in local currency</td></tr>
      <tr><td>Monthly_Inhand_Salary</td><td>Monthly take-home salary (after deductions)</td></tr>
      <tr><td>Num_Bank_Accounts</td><td>Total number of bank accounts held by the customer</td></tr>
      <tr><td>Num_Credit_Card</td><td>Total number of credit cards owned</td></tr>
      <tr><td>Interest_Rate</td><td>Interest rate applied to the customer's loans or credit</td></tr>
      <tr><td>Num_of_Loan</td><td>Number of loans currently active under the customer's name</td></tr>
      <tr><td>Type_of_Loan</td><td>Types of loans taken (e.g., Personal, Auto, Home – could be comma-separated)</td></tr>
      <tr><td>Delay_from_due_date</td><td>Average delay in payment from the actual due date (in days)</td></tr>
      <tr><td>Num_of_Delayed_Payment</td><td>Number of times payments were delayed</td></tr>
      <tr><td>Changed_Credit_Limit</td><td>Amount by which the credit limit has changed recently</td></tr>
      <tr><td>Num_Credit_Inquiries</td><td>Number of inquiries made for credit approval (hard pulls)</td></tr>
      <tr><td>Credit_Mix</td><td>Category of credit usage (Standard, Good, Bad)</td></tr>
      <tr><td>Outstanding_Debt</td><td>Total unpaid debt</td></tr>
      <tr><td>Credit_Utilization_Ratio</td><td>Ratio of used credit to available credit (%)</td></tr>
      <tr><td>Credit_History_Age</td><td>Duration of credit history (e.g., 5 Years 6 Months)</td></tr>
      <tr><td>Payment_of_Min_Amount</td><td>Whether the minimum payment was made (Yes, No, NM)</td></tr>
      <tr><td>Total_EMI_per_month</td><td>Total EMI amount payable monthly</td></tr>
      <tr><td>Amount_invested_monthly</td><td>Monthly investment amount from the customer's income</td></tr>
      <tr><td>Payment_Behaviour</td><td>Pattern of payments (e.g., High_spent_Large_value_payments, etc.)</td></tr>
      <tr><td>Monthly_Balance</td><td>Estimated balance left in the account at the end of the month</td></tr>
      <tr><td><strong>Credit_Score</strong></td><td><strong>Target Variable – The creditworthiness of the customer (Good, Standard, Poor)</strong></td></tr>
    </tbody>
  </table>
</section>


In [295]:
train_df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [296]:
test_df.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance'],
      dtype='object')

In [297]:
train_df=train_df.drop(columns=['ID','Customer_ID','Name','SSN'],axis=1)
test_df=test_df.drop(columns=['ID','Customer_ID','Name','SSN'],axis=1)

In [298]:
dataset_analysis(train_df,target_column='Credit_Score')

Dataset Shape (Rows & Columns)
Rows :- 100000
Columns :- 24
Unique Rows: 100000 (100.00%)
Duplicate Rows: 0 (0.00%)

Columns with Null Values and Null Value Counts
Monthly_Inhand_Salary: 15002 null values
Type_of_Loan: 11408 null values
Num_of_Delayed_Payment: 7002 null values
Num_Credit_Inquiries: 1965 null values
Credit_History_Age: 9030 null values
Amount_invested_monthly: 4479 null values
Monthly_Balance: 1200 null values

Data Type Wise Column Count
object     16
float64     4
int64       4
Name: count, dtype: int64

Discrete And Continuous Data Analysis
Discrete Text Columns: Month, Age, Occupation, Annual_Income, Num_of_Loan, Type_of_Loan, Num_of_Delayed_Payment, Changed_Credit_Limit, Credit_Mix, Outstanding_Debt, Credit_History_Age, Payment_of_Min_Amount, Amount_invested_monthly, Payment_Behaviour, Monthly_Balance, Credit_Score

Discrete Numeric Columns: Num_Bank_Accounts, Num_Credit_Card, Interest_Rate, Delay_from_due_date

Continuous Columns: Monthly_Inhand_Salary, Num_Credit

<h3>Data Preprocessing &  EDA</h3>

In [299]:
train_df.columns = train_df.columns.str.lower()
test_df.columns = test_df.columns.str.lower()

In [300]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   month                     100000 non-null  object 
 1   age                       100000 non-null  object 
 2   occupation                100000 non-null  object 
 3   annual_income             100000 non-null  object 
 4   monthly_inhand_salary     84998 non-null   float64
 5   num_bank_accounts         100000 non-null  int64  
 6   num_credit_card           100000 non-null  int64  
 7   interest_rate             100000 non-null  int64  
 8   num_of_loan               100000 non-null  object 
 9   type_of_loan              88592 non-null   object 
 10  delay_from_due_date       100000 non-null  int64  
 11  num_of_delayed_payment    92998 non-null   object 
 12  changed_credit_limit      100000 non-null  object 
 13  num_credit_inquiries      98035 non-null   fl

In [301]:
train_df.select_dtypes(include='object').columns

Index(['month', 'age', 'occupation', 'annual_income', 'num_of_loan',
       'type_of_loan', 'num_of_delayed_payment', 'changed_credit_limit',
       'credit_mix', 'outstanding_debt', 'credit_history_age',
       'payment_of_min_amount', 'amount_invested_monthly', 'payment_behaviour',
       'monthly_balance', 'credit_score'],
      dtype='object')

In [302]:
train_df['month'].value_counts()

month
January     12500
February    12500
March       12500
April       12500
May         12500
June        12500
July        12500
August      12500
Name: count, dtype: int64

In [303]:
#print(train_df['age'].value_counts().sort_index(ascending=True).index.tolist())

In [304]:
train_df.shape

(100000, 24)

In [305]:
print(train_df[train_df['age'].str.contains('_', na=False)].shape)
print(test_df[test_df['age'].str.contains('_', na=False)].shape)

(4939, 24)
(2477, 23)


In [306]:
train_df=train_df[~train_df['age'].str.contains('_', na=False)]
test_df=test_df[~test_df['age'].str.contains('_', na=False)]
print(train_df.shape)
print(test_df.shape)

(95061, 24)
(47523, 23)


In [307]:
train_df['occupation'].value_counts()

occupation
_______          6732
Lawyer           6232
Architect        6063
Engineer         6045
Mechanic         5992
Accountant       5971
Scientist        5968
Media_Manager    5927
Developer        5921
Teacher          5909
Entrepreneur     5865
Doctor           5789
Journalist       5765
Manager          5665
Musician         5644
Writer           5573
Name: count, dtype: int64

In [308]:
train_df['occupation'] = train_df['occupation'].replace('_______', 'Other')
test_df['occupation'] = test_df['occupation'].replace('_______', 'Other')


In [309]:
train_df['occupation'].value_counts().index

Index(['Other', 'Lawyer', 'Architect', 'Engineer', 'Mechanic', 'Accountant',
       'Scientist', 'Media_Manager', 'Developer', 'Teacher', 'Entrepreneur',
       'Doctor', 'Journalist', 'Manager', 'Musician', 'Writer'],
      dtype='object', name='occupation')

In [310]:
train_df['annual_income'].value_counts()

annual_income
17273.83     16
9141.63      15
33029.66     15
20867.67     15
17816.75     15
             ..
57421.58_     1
36135.19_     1
73823.84_     1
35728.54_     1
31906.07_     1
Name: count, Length: 18710, dtype: int64

In [311]:
print(train_df[train_df['annual_income'].str.contains('_', na=False)].shape)
print(test_df[test_df['annual_income'].str.contains('_', na=False)].shape)

(6662, 24)
(3334, 23)


In [312]:
train_df['annual_income'] = train_df['annual_income'].map(lambda x: x.replace('_', '0') if '_' in x else x)
test_df['annual_income'] = test_df['annual_income'].map(lambda x: x.replace('_', '0') if '_' in x else x)

In [313]:
print(train_df[train_df['annual_income'].str.contains('_', na=False)].shape)
print(test_df[test_df['annual_income'].str.contains('_', na=False)].shape)

(0, 24)
(0, 23)


In [314]:
train_df['annual_income'] = train_df['annual_income'].astype(float).astype(int)
test_df['annual_income'] = test_df['annual_income'].astype(float).astype(int)


In [315]:
train_df.columns

Index(['month', 'age', 'occupation', 'annual_income', 'monthly_inhand_salary',
       'num_bank_accounts', 'num_credit_card', 'interest_rate', 'num_of_loan',
       'type_of_loan', 'delay_from_due_date', 'num_of_delayed_payment',
       'changed_credit_limit', 'num_credit_inquiries', 'credit_mix',
       'outstanding_debt', 'credit_utilization_ratio', 'credit_history_age',
       'payment_of_min_amount', 'total_emi_per_month',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_score'],
      dtype='object')

In [316]:
print(train_df['monthly_inhand_salary'].value_counts().index)

Index([ 2295.058333333333,          6082.1875,  6358.956666666666,
                  6769.13,  5766.491666666666,          4387.2725,
                  6639.56, 3080.5550000000007,          536.43125,
       1315.5608333333332,
       ...
       1465.1631344336106, 1764.2841666666666,  4502.973455381875,
       11102.135321735655,  4156.588618601189,          458.67125,
        6424.022011446845,          3416.3725,  6472.286396373682,
        6181.746666666668],
      dtype='float64', name='monthly_inhand_salary', length=13219)


In [317]:
print(train_df['monthly_inhand_salary'].isnull().sum())
print(test_df['monthly_inhand_salary'].isnull().sum())


14274
7122


In [318]:
print(train_df['monthly_inhand_salary'].median())
print(test_df['monthly_inhand_salary'].median())

3097.008333333333
3088.475


In [319]:
train_df['monthly_inhand_salary'] = train_df['monthly_inhand_salary'].fillna(train_df['monthly_inhand_salary'].median())
test_df['monthly_inhand_salary'] = test_df['monthly_inhand_salary'].fillna(test_df['monthly_inhand_salary'].median())


In [320]:
train_df['monthly_inhand_salary'] = train_df['monthly_inhand_salary'].astype(float).astype(int)
test_df['monthly_inhand_salary'] = test_df['monthly_inhand_salary'].astype(float).astype(int)

In [321]:
print(train_df['num_of_loan'].value_counts().index.tolist())

['3', '2', '4', '0', '1', '6', '7', '5', '-100', '9', '8', '2_', '4_', '3_', '0_', '1_', '7_', '6_', '5_', '8_', '9_', '1150', '773', '1480', '288', '430', '1228', '141', '697', '1464', '501', '875', '49', '192', '1017', '275', '466', '1365', '33', '936', '955', '50', '251', '1127', '1384', '352', '733', '290', '217', '172', '31', '330', '1236', '1259', '1214', '898', '1354', '404', '855', '1463', '284', '661', '1217', '1353', '95', '1209', '1241', '911', '1181', '737', '1106', '968', '1478', '83', '1196', '497', '1129', '927', '653', '662', '529', '635', '1027_', '897', '1039', '819', '1006', '795', '699', '329', '1451', '484', '132', '300', '1103', '504', '136', '1400', '78', '686', '1091', '816', '1369', '143', '1416', '455', '55', '1096', '323', '1406', '1348', '153', '1461', '905', '1312', '1424', '1154', '1110', '527', '449', '418', '319', '23', '238', '638', '138', '235_', '359', '590', '696', '1185_', '1465', '70', '904', '89', '649', '995', '545', '684', '1135', '1094', '1204'

In [322]:
print(train_df[train_df['num_of_loan'].str.contains('_', na=False)].shape)
print(test_df[test_df['num_of_loan'].str.contains('_', na=False)].shape)

(4564, 24)
(2312, 23)


In [323]:
print(train_df.shape)
print(test_df.shape)

(95061, 24)
(47523, 23)


In [324]:
train_df=train_df[~train_df['num_of_loan'].str.contains('_', na=False)]
test_df=test_df[~test_df['num_of_loan'].str.contains('_', na=False)]
print(train_df.shape)
print(test_df.shape)

(90497, 24)
(45211, 23)


In [325]:
type_of_loan_train_df=train_df['type_of_loan'].value_counts().sort_values().reset_index()
type_of_loan_train_df.head()

Unnamed: 0,type_of_loan,count
0,"Not Specified, Mortgage Loan, Credit-Builder L...",2
1,"Student Loan, Mortgage Loan, Debt Consolidatio...",4
2,"Debt Consolidation Loan, Not Specified, Debt C...",4
3,"Credit-Builder Loan, Debt Consolidation Loan, ...",4
4,"Credit-Builder Loan, Student Loan, Auto Loan, ...",4


In [326]:
type_of_loan_train_df.shape

(6260, 2)

In [327]:
type_of_loan_train_df[type_of_loan_train_df['count']<300].shape

(6250, 2)

In [328]:
type_of_loan_train_df['count'].describe()

count    6260.000000
mean       12.810703
std        46.500931
min         2.000000
25%         7.000000
50%         8.000000
75%         8.000000
max      1264.000000
Name: count, dtype: float64

In [329]:
type_of_loan_list=type_of_loan_train_df[type_of_loan_train_df['count']>300]['type_of_loan'].values

In [330]:
train_df['type_of_loan']=train_df['type_of_loan'].map(lambda x: x if x in type_of_loan_list else 'Other')
test_df['type_of_loan']=test_df['type_of_loan'].map(lambda x: x if x in type_of_loan_list else 'Other')


In [331]:
train_df['type_of_loan'].value_counts()

type_of_loan
Other                      80450
Not Specified               1264
Debt Consolidation Loan     1160
Credit-Builder Loan         1147
Personal Loan               1137
Student Loan                1117
Payday Loan                 1088
Mortgage Loan               1069
Auto Loan                   1040
Home Equity Loan            1025
Name: count, dtype: int64

In [332]:
train_df.select_dtypes(include='object').columns

Index(['month', 'age', 'occupation', 'num_of_loan', 'type_of_loan',
       'num_of_delayed_payment', 'changed_credit_limit', 'credit_mix',
       'outstanding_debt', 'credit_history_age', 'payment_of_min_amount',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_score'],
      dtype='object')

In [333]:
train_df['num_of_delayed_payment'].info()

<class 'pandas.core.series.Series'>
Index: 90497 entries, 0 to 99999
Series name: num_of_delayed_payment
Non-Null Count  Dtype 
--------------  ----- 
84163 non-null  object
dtypes: object(1)
memory usage: 1.4+ MB


In [334]:
print(train_df['num_of_delayed_payment'].isnull().sum())
print(test_df['num_of_delayed_payment'].isnull().sum())

6334
3181


In [335]:
print(train_df[train_df['num_of_delayed_payment'].str.contains('_', na=False)].shape)
print(test_df[test_df['num_of_delayed_payment'].str.contains('_', na=False)].shape)

(2490, 24)
(1284, 23)


In [336]:
print(train_df.shape)
print(test_df.shape)

(90497, 24)
(45211, 23)


In [337]:
train_df=train_df[~train_df['num_of_delayed_payment'].str.contains('_', na=False)]
test_df=test_df[~test_df['num_of_delayed_payment'].str.contains('_', na=False)]
print(train_df.shape)
print(test_df.shape)

(88007, 24)
(43927, 23)


In [338]:
train_df['num_of_delayed_payment'].value_counts().sort_index()

num_of_delayed_payment
-1      276
-2      205
-3       84
0      1445
1      1430
       ... 
969       1
972       1
974       1
98        1
996       1
Name: count, Length: 649, dtype: int64

In [339]:
#(2165+1756)/2=1960

In [340]:
train_df['num_of_delayed_payment']=train_df['num_of_delayed_payment'].fillna(1960)
test_df['num_of_delayed_payment']=test_df['num_of_delayed_payment'].fillna(1960)

In [341]:
train_df['num_of_delayed_payment']=train_df['num_of_delayed_payment'].astype('int64')
test_df['num_of_delayed_payment']=test_df['num_of_delayed_payment'].astype('int64')

In [342]:
train_df.select_dtypes(include='object').columns

Index(['month', 'age', 'occupation', 'num_of_loan', 'type_of_loan',
       'changed_credit_limit', 'credit_mix', 'outstanding_debt',
       'credit_history_age', 'payment_of_min_amount',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_score'],
      dtype='object')

In [343]:
print(train_df['changed_credit_limit'].value_counts())

changed_credit_limit
_                     1831
8.22                   124
11.5                   114
7.35                   112
11.32                  112
                      ... 
-1.18                    1
2.8000000000000007       1
31.18                    1
2.4700000000000006       1
1.4700000000000006       1
Name: count, Length: 4314, dtype: int64


In [344]:
train_df.shape

(88007, 24)

In [345]:
train_df=train_df[~train_df['changed_credit_limit'].astype(str).str.contains('_', na=False)]
test_df=test_df[~test_df['changed_credit_limit'].astype(str).str.contains('_', na=False)]

In [346]:
train_df['changed_credit_limit'].info()

<class 'pandas.core.series.Series'>
Index: 86176 entries, 0 to 99999
Series name: changed_credit_limit
Non-Null Count  Dtype 
--------------  ----- 
86176 non-null  object
dtypes: object(1)
memory usage: 1.3+ MB


In [347]:
train_df['num_of_delayed_payment'] = train_df['num_of_delayed_payment'].astype(float).round(3)
test_df['num_of_delayed_payment']=test_df['num_of_delayed_payment'].astype(float).round(3)

In [348]:
train_df['changed_credit_limit'] = train_df['changed_credit_limit'].astype(float).round(3)
test_df['changed_credit_limit']=test_df['changed_credit_limit'].astype(float).round(3)

In [349]:
train_df.head(2)

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,...,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,January,23,Scientist,19114,1824,3,4,3,4,Other,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,February,23,Scientist,19114,3097,3,4,3,4,Other,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good


In [350]:
train_df['credit_mix'].value_counts()

credit_mix
Standard    31420
Good        20976
_           17455
Bad         16325
Name: count, dtype: int64

In [351]:
train_df['credit_mix'] = train_df['credit_mix'].replace('_', 'Unknown')
test_df['credit_mix'] = test_df['credit_mix'].replace('_', 'Unknown')

In [352]:
train_df['outstanding_debt'] = train_df['outstanding_debt'].map(lambda x: x.replace('_', '0') if '_' in x else x)
test_df['outstanding_debt'] = test_df['outstanding_debt'].map(lambda x: x.replace('_', '0') if '_' in x else x)

In [353]:
train_df['outstanding_debt'] = train_df['outstanding_debt'].astype(float)
test_df['outstanding_debt']=test_df['outstanding_debt'].astype(float)

In [354]:
print(train_df['credit_history_age'].isnull().sum())
print(test_df['credit_history_age'].isnull().sum())

7730
3792


In [355]:
print(train_df['credit_history_age'].value_counts().sort_index().index.tolist())

['0 Years and 1 Months', '0 Years and 10 Months', '0 Years and 11 Months', '0 Years and 2 Months', '0 Years and 3 Months', '0 Years and 4 Months', '0 Years and 5 Months', '0 Years and 6 Months', '0 Years and 7 Months', '0 Years and 8 Months', '0 Years and 9 Months', '1 Years and 0 Months', '1 Years and 1 Months', '1 Years and 10 Months', '1 Years and 11 Months', '1 Years and 2 Months', '1 Years and 3 Months', '1 Years and 4 Months', '1 Years and 5 Months', '1 Years and 6 Months', '1 Years and 7 Months', '1 Years and 8 Months', '1 Years and 9 Months', '10 Years and 0 Months', '10 Years and 1 Months', '10 Years and 10 Months', '10 Years and 11 Months', '10 Years and 2 Months', '10 Years and 3 Months', '10 Years and 4 Months', '10 Years and 5 Months', '10 Years and 6 Months', '10 Years and 7 Months', '10 Years and 8 Months', '10 Years and 9 Months', '11 Years and 0 Months', '11 Years and 1 Months', '11 Years and 10 Months', '11 Years and 11 Months', '11 Years and 2 Months', '11 Years and 

In [356]:
def credit_age_to_months(age_str):
    if pd.isna(age_str):
        return np.nan  # or return 0 if you prefer to fill with 0
    try:
        years = int(age_str.split(" Years")[0])
        months = int(age_str.split("and ")[1].split(" Months")[0])
        return years * 12 + months
    except Exception as e:
        print(f"Error parsing: {age_str} — {e}")
        return np.nan


In [357]:
train_df['credit_history_months'] = train_df['credit_history_age'].apply(credit_age_to_months)
test_df['credit_history_months'] = test_df['credit_history_age'].apply(credit_age_to_months)

In [358]:
print(train_df['credit_history_months'].median(skipna=True))
print(test_df['credit_history_months'].median(skipna=True))


220.0
225.0


In [359]:
train_df['credit_history_months']=train_df['credit_history_months'].median(skipna=True)
test_df['credit_history_months']=test_df['credit_history_months'].median(skipna=True)

In [360]:
train_df['payment_of_min_amount'].value_counts()

payment_of_min_amount
Yes    45137
No     30711
NM     10328
Name: count, dtype: int64

In [361]:
print(train_df['amount_invested_monthly'].value_counts().index)

Index(['__10000__', '0.0', '58.51597569589465', '130.11542024292334',
       '43.477190144355745', '70.10177420755677', '218.90434353388733',
       '168.413702679309', '450.6460933992599', '173.13865100158367',
       ...
       '75.50497238307716', '199.9885807209508', '64.54974829039807',
       '70.8055497847255', '223.87501818278344', '70.86997036607373',
       '125.95659173463726', '215.48638555294247', '78.49772657092517',
       '24.785216509052056'],
      dtype='object', name='amount_invested_monthly', length=78477)


In [362]:
for i in train_df['amount_invested_monthly'].value_counts().index:
    if '_' in i:
        print(i)

__10000__


In [363]:
train_df['amount_invested_monthly'].replace('__10000__', '10000', inplace=True)
test_df['amount_invested_monthly'].replace('__10000__', '10000', inplace=True)

In [364]:
train_df['amount_invested_monthly']=train_df['amount_invested_monthly'].astype(float).astype(float)
test_df['amount_invested_monthly']=test_df['amount_invested_monthly'].astype(float).astype(float)

In [365]:
print(train_df['amount_invested_monthly'].value_counts().sort_index().index.tolist())

[0.0, 10.010194262612963, 10.011424795004293, 10.03659960594723, 10.053768350640556, 10.068234588368787, 10.071936767841647, 10.107546903087657, 10.11661404301702, 10.122556602879524, 10.13191094078467, 10.141284559167328, 10.143435619371825, 10.23798313481069, 10.249461295432074, 10.283404380293053, 10.288450601497617, 10.296281409168015, 10.310228995372404, 10.315838370630738, 10.323115623053695, 10.336203311020911, 10.364065291734066, 10.411876219433443, 10.418358199236947, 10.421907107773217, 10.421910911931633, 10.45815975835633, 10.46159759179635, 10.467009406197652, 10.47500745842986, 10.477334851645242, 10.47877524239634, 10.48334941409644, 10.508052755582398, 10.527696129312993, 10.575047092263068, 10.580561536124794, 10.602163743770618, 10.612907877548746, 10.630952548649931, 10.638229086509703, 10.672831399078632, 10.681583056092569, 10.682540064142273, 10.70103154648394, 10.701403654499453, 10.710388954758225, 10.725092588605847, 10.734853385214004, 10.744481841528614, 10.7

In [366]:
train_df['total_emi_per_month'].value_counts().index

Index([               0.0, 14.740344070159376,  54.63756583524476,
       25.433569898385635,  92.65003808175555, 160.78815052072378,
        246.9923194537421,  73.12500809716363,  196.5285910727319,
        30.57608482316153,
       ...
                  66283.0,            53413.0,            40027.0,
                  51309.0,  550.4834096961771,            26761.0,
                  37640.0,  449.2693512692168,            16415.0,
                  81751.0],
      dtype='float64', name='total_emi_per_month', length=14484)

In [367]:
train_df['payment_behaviour'].value_counts().index

Index(['Low_spent_Small_value_payments', 'High_spent_Medium_value_payments',
       'Low_spent_Medium_value_payments', 'High_spent_Large_value_payments',
       'High_spent_Small_value_payments', 'Low_spent_Large_value_payments',
       '!@9#%8'],
      dtype='object', name='payment_behaviour')

In [368]:
train_df['payment_behaviour'].replace('!@9#%8', 'Other', inplace=True)
test_df['payment_behaviour'].replace('!@9#%8', 'Other', inplace=True)

In [369]:
train_df['monthly_balance'].value_counts()

monthly_balance
__-333333333333333333333333333__    9
350.982329                          1
411.42712287098345                  1
412.669312                          1
368.154976                          1
                                   ..
394.624914                          1
379.538292                          1
300.008498                          1
396.997157                          1
484.5912142650067                   1
Name: count, Length: 85143, dtype: int64

In [370]:
for i in train_df['monthly_balance'].value_counts().index.to_list():
    value=str(i)
    if '_' in value:
        print(i)
    

__-333333333333333333333333333__


In [371]:
train_df['monthly_balance'].replace('__-333333333333333333333333333__', '0', inplace=True)
test_df['monthly_balance'].replace('__-333333333333333333333333333__', '0', inplace=True)

In [372]:
train_df['credit_score'].value_counts().index

Index(['Standard', 'Poor', 'Good'], dtype='object', name='credit_score')

In [373]:
train_df['credit_utilization_ratio'].value_counts().index

Index([37.753013234947616,  27.49526284641956,   36.9790068767687,
         24.5405098449408, 32.803430858560226,  39.08082343386519,
        41.21236675865955,  36.16692522742003, 24.972852574957464,
        37.78821735830545,
       ...
        33.38101020065065,  34.97789474709241,  33.22495078663659,
       38.550848433956325,  23.93379480196552,  27.26225871052017,
       24.797346908844982, 31.377861869582357,  31.94496005538421,
       26.822619623699016],
      dtype='float64', name='credit_utilization_ratio', length=86176)

In [374]:
train_df['num_credit_inquiries'].median()

np.float64(6.0)

In [375]:
train_df['num_credit_inquiries']=train_df['num_credit_inquiries'].fillna(6)
test_df['num_credit_inquiries']=test_df['num_credit_inquiries'].fillna(6)

In [376]:
train_df['delay_from_due_date'].value_counts().index

Index([15, 13,  8, 14, 10,  7,  9, 11, 12,  6,  5, 18, 19, 27, 24, 16, 17, 20,
       25, 28, 21, 23, 26, 29, 22, 30,  4,  3,  1,  2,  0, 31, 33, 32, 34, 47,
       48, 54, 52, 42, 35, 36, 44, 38, 41, 50, 40, 53, 55, 49, 58, 62, 45, 56,
       60, 51, 57, 39, 59, 46, 37, 43, 61, -1, -2, -3, 63, 64, -4, 65, 66, -5,
       67],
      dtype='int64', name='delay_from_due_date')

In [377]:
train_df['month'].value_counts().index

Index(['January', 'June', 'July', 'March', 'May', 'February', 'April',
       'August'],
      dtype='object', name='month')

In [378]:
print(train_df['num_bank_accounts'].value_counts().index)

Index([   6,    7,    8,    4,    5,    3,    9,   10,    1,    0,
       ...
        756,   75, 1083,  463,  472,  636, 1703, 1194, 1544, 1754],
      dtype='int64', name='num_bank_accounts', length=849)


In [379]:
train_df.columns

Index(['month', 'age', 'occupation', 'annual_income', 'monthly_inhand_salary',
       'num_bank_accounts', 'num_credit_card', 'interest_rate', 'num_of_loan',
       'type_of_loan', 'delay_from_due_date', 'num_of_delayed_payment',
       'changed_credit_limit', 'num_credit_inquiries', 'credit_mix',
       'outstanding_debt', 'credit_utilization_ratio', 'credit_history_age',
       'payment_of_min_amount', 'total_emi_per_month',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_score', 'credit_history_months'],
      dtype='object')

In [380]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86176 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     86176 non-null  object 
 1   age                       86176 non-null  object 
 2   occupation                86176 non-null  object 
 3   annual_income             86176 non-null  int64  
 4   monthly_inhand_salary     86176 non-null  int64  
 5   num_bank_accounts         86176 non-null  int64  
 6   num_credit_card           86176 non-null  int64  
 7   interest_rate             86176 non-null  int64  
 8   num_of_loan               86176 non-null  object 
 9   type_of_loan              86176 non-null  object 
 10  delay_from_due_date       86176 non-null  int64  
 11  num_of_delayed_payment    86176 non-null  float64
 12  changed_credit_limit      86176 non-null  float64
 13  num_credit_inquiries      86176 non-null  float64
 14  credit_mix 

In [381]:
print(train_df.shape)
print(test_df.shape)
train_df=train_df.dropna()
test_df=test_df.dropna()
print(train_df.shape)
print(test_df.shape)

(86176, 25)
(42993, 24)
(74037, 25)
(36993, 24)


In [382]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74037 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     74037 non-null  object 
 1   age                       74037 non-null  object 
 2   occupation                74037 non-null  object 
 3   annual_income             74037 non-null  int64  
 4   monthly_inhand_salary     74037 non-null  int64  
 5   num_bank_accounts         74037 non-null  int64  
 6   num_credit_card           74037 non-null  int64  
 7   interest_rate             74037 non-null  int64  
 8   num_of_loan               74037 non-null  object 
 9   type_of_loan              74037 non-null  object 
 10  delay_from_due_date       74037 non-null  int64  
 11  num_of_delayed_payment    74037 non-null  float64
 12  changed_credit_limit      74037 non-null  float64
 13  num_credit_inquiries      74037 non-null  float64
 14  credit_mix 

In [383]:
dataset_analysis(train_df,target_column='credit_score')

Dataset Shape (Rows & Columns)
Rows :- 74037
Columns :- 25
Unique Rows: 74037 (100.00%)
Duplicate Rows: 0 (0.00%)

Columns with Null Values and Null Value Counts
No columns contain null values.

Data Type Wise Column Count
object     11
float64     8
int64       6
Name: count, dtype: int64

Discrete And Continuous Data Analysis
Discrete Text Columns: month, age, occupation, num_of_loan, type_of_loan, credit_mix, credit_history_age, payment_of_min_amount, payment_behaviour, monthly_balance, credit_score

Discrete Numeric Columns: num_bank_accounts, num_credit_card, interest_rate, delay_from_due_date

Continuous Columns: annual_income, monthly_inhand_salary, num_of_delayed_payment, changed_credit_limit, num_credit_inquiries, outstanding_debt, credit_utilization_ratio, total_emi_per_month, amount_invested_monthly, credit_history_months

Unknown Columns: -----

Skewness of Numerical Columns
annual_income: 12.43 -> Strong Positive Skew. Consider transformation (e.g., log or square root)
mon

In [384]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74037 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     74037 non-null  object 
 1   age                       74037 non-null  object 
 2   occupation                74037 non-null  object 
 3   annual_income             74037 non-null  int64  
 4   monthly_inhand_salary     74037 non-null  int64  
 5   num_bank_accounts         74037 non-null  int64  
 6   num_credit_card           74037 non-null  int64  
 7   interest_rate             74037 non-null  int64  
 8   num_of_loan               74037 non-null  object 
 9   type_of_loan              74037 non-null  object 
 10  delay_from_due_date       74037 non-null  int64  
 11  num_of_delayed_payment    74037 non-null  float64
 12  changed_credit_limit      74037 non-null  float64
 13  num_credit_inquiries      74037 non-null  float64
 14  credit_mix 

In [385]:
train_df.select_dtypes(include='object').columns

Index(['month', 'age', 'occupation', 'num_of_loan', 'type_of_loan',
       'credit_mix', 'credit_history_age', 'payment_of_min_amount',
       'payment_behaviour', 'monthly_balance', 'credit_score'],
      dtype='object')

In [386]:
train_df['credit_mix'].head()

0    Unknown
3       Good
4       Good
5       Good
9       Good
Name: credit_mix, dtype: object

In [387]:
train_df[['age', 'num_of_loan', 'monthly_balance']] = train_df[['age', 'num_of_loan', 'monthly_balance']].astype(float)
test_df[['age', 'num_of_loan', 'monthly_balance']] = test_df[['age', 'num_of_loan', 'monthly_balance']].astype(float)   

In [388]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74037 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     74037 non-null  object 
 1   age                       74037 non-null  float64
 2   occupation                74037 non-null  object 
 3   annual_income             74037 non-null  int64  
 4   monthly_inhand_salary     74037 non-null  int64  
 5   num_bank_accounts         74037 non-null  int64  
 6   num_credit_card           74037 non-null  int64  
 7   interest_rate             74037 non-null  int64  
 8   num_of_loan               74037 non-null  float64
 9   type_of_loan              74037 non-null  object 
 10  delay_from_due_date       74037 non-null  int64  
 11  num_of_delayed_payment    74037 non-null  float64
 12  changed_credit_limit      74037 non-null  float64
 13  num_credit_inquiries      74037 non-null  float64
 14  credit_mix 

In [389]:
train_df.select_dtypes(include='object').columns

Index(['month', 'occupation', 'type_of_loan', 'credit_mix',
       'credit_history_age', 'payment_of_min_amount', 'payment_behaviour',
       'credit_score'],
      dtype='object')

In [391]:
from sklearn.preprocessing import MinMaxScaler

num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns

scaler = MinMaxScaler()

for col in num_cols:
    train_df[col] = scaler.fit_transform(train_df[[col]])
    test_df[col] = scaler.transform(test_df[[col]])


In [392]:
train_df.head()

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,...,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,credit_history_months
0,January,0.05686,Scientist,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,Other,...,0.16202,0.204039,22 Years and 1 Months,No,0.000602,0.008042,High_spent_Small_value_payments,0.19506,Good,0.0
3,April,0.05686,Scientist,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,Other,...,0.16202,0.360476,22 Years and 4 Months,No,0.000602,0.019946,Low_spent_Small_value_payments,0.139479,Good,0.0
4,May,0.05686,Scientist,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,Other,...,0.16202,0.134487,22 Years and 5 Months,No,0.000602,0.004142,High_spent_Medium_value_payments,0.213159,Good,0.0
5,June,0.05686,Scientist,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,Other,...,0.16202,0.219137,22 Years and 6 Months,No,0.000602,0.006243,Other,0.212528,Good,0.0
9,February,0.057404,Teacher,0.001151,0.183478,0.001668,0.002668,0.000863,0.063722,Credit-Builder Loan,...,0.121012,0.606812,26 Years and 8 Months,No,0.000229,0.004039,High_spent_Large_value_payments,0.302484,Good,0.0


In [393]:
from sklearn.preprocessing import LabelEncoder
obj_cols = train_df.select_dtypes(include='object').columns
test_cols = test_df.select_dtypes(include='object').columns

# Apply LabelEncoder to each object column
le = LabelEncoder()
for col in obj_cols:
    train_df[col] = le.fit_transform(train_df[col])

for col in test_cols:
    test_df[col] = le.fit_transform(test_df[col])



In [394]:
train_df.head()

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,...,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,credit_history_months
0,3,0.05686,13,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.204039,180,1,0.000602,0.008042,2,0.19506,0,0.0
3,0,0.05686,13,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.360476,185,1,0.000602,0.019946,5,0.139479,0,0.0
4,7,0.05686,13,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.134487,186,1,0.000602,0.004142,1,0.213159,0,0.0
5,5,0.05686,13,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.219137,187,1,0.000602,0.006243,6,0.212528,0,0.0
9,2,0.057404,14,0.001151,0.183478,0.001668,0.002668,0.000863,0.063722,1,...,0.121012,0.606812,237,1,0.000229,0.004039,0,0.302484,0,0.0


In [395]:
train_df.corr()['credit_score'].sort_values(ascending=False)

credit_score                1.000000
changed_credit_limit        0.184853
payment_of_min_amount       0.178674
credit_mix                  0.147379
delay_from_due_date         0.097128
outstanding_debt            0.034698
payment_behaviour           0.017616
occupation                  0.009148
month                       0.006055
num_of_loan                 0.003811
type_of_loan                0.003371
num_credit_inquiries        0.003182
num_bank_accounts           0.002555
num_of_delayed_payment      0.002090
annual_income               0.000176
num_credit_card            -0.000738
age                        -0.002059
interest_rate              -0.003639
total_emi_per_month        -0.007088
amount_invested_monthly    -0.007892
credit_utilization_ratio   -0.013545
credit_history_age         -0.019737
monthly_balance            -0.054716
monthly_inhand_salary      -0.064493
credit_history_months            NaN
Name: credit_score, dtype: float64

In [396]:
train_df.isnull().sum()

month                       0
age                         0
occupation                  0
annual_income               0
monthly_inhand_salary       0
num_bank_accounts           0
num_credit_card             0
interest_rate               0
num_of_loan                 0
type_of_loan                0
delay_from_due_date         0
num_of_delayed_payment      0
changed_credit_limit        0
num_credit_inquiries        0
credit_mix                  0
outstanding_debt            0
credit_utilization_ratio    0
credit_history_age          0
payment_of_min_amount       0
total_emi_per_month         0
amount_invested_monthly     0
payment_behaviour           0
monthly_balance             0
credit_score                0
credit_history_months       0
dtype: int64

In [397]:
train_df.dropna(axis=0, inplace=True)
test_df.dropna(axis=0, inplace=True)

In [398]:
train_df.head()

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,...,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,credit_history_months
0,3,0.05686,13,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.204039,180,1,0.000602,0.008042,2,0.19506,0,0.0
3,0,0.05686,13,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.360476,185,1,0.000602,0.019946,5,0.139479,0,0.0
4,7,0.05686,13,0.000501,0.102074,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.134487,186,1,0.000602,0.004142,1,0.213159,0,0.0
5,5,0.05686,13,0.000501,0.187504,0.002223,0.002668,0.000345,0.065615,6,...,0.16202,0.219137,187,1,0.000602,0.006243,6,0.212528,0,0.0
9,2,0.057404,14,0.001151,0.183478,0.001668,0.002668,0.000863,0.063722,1,...,0.121012,0.606812,237,1,0.000229,0.004039,0,0.302484,0,0.0


In [399]:
test_df.columns

Index(['month', 'age', 'occupation', 'annual_income', 'monthly_inhand_salary',
       'num_bank_accounts', 'num_credit_card', 'interest_rate', 'num_of_loan',
       'type_of_loan', 'delay_from_due_date', 'num_of_delayed_payment',
       'changed_credit_limit', 'num_credit_inquiries', 'credit_mix',
       'outstanding_debt', 'credit_utilization_ratio', 'credit_history_age',
       'payment_of_min_amount', 'total_emi_per_month',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_history_months'],
      dtype='object')

In [400]:
train_df.to_csv('train_cleaned.csv', index=False)
test_df.to_csv('test_cleaned.csv', index=False)