# PROBLEM STATEMENT
- ✅ TASK 1: Credit Scoring Model
- Objective: Predict an individual's creditworthiness using past financial data.
- Approach: Use classification algorithms like Logistic Regression, Decision Trees, or Random Forest.
- Key Features:
- ● Feature engineering from financial history.
- ● Model accuracy assessment using metrics like Precision, Recall, F1-Score, ROC-AUC.
- ● Dataset could include: income, debts, payment history, etc.

# STEPS
- 1.load the data
- 2.train test split
- 3.feature engg.
- 4.model development
- 5.model evolution

In [9]:
import numpy as np
import pandas as pd


In [10]:
link = "creditwothiness.csv"
df = pd.read_csv(link)
df

Unnamed: 0,Age,Gender,Education,Income,Debt,Credit_Score,Loan_Amount,Loan_Term,Num_Credit_Cards,Payment_History,Employment_Status,Residence_Type,Marital_Status,Creditworthiness
0,56,Female,Master,149406,34089,581,49200,60,4,Bad,Unemployed,Rented,Single,1
1,69,Female,High School,78896,8626,648,20147,24,7,Good,Employed,Mortgaged,Married,1
2,46,Female,Master,119339,46281,329,41307,12,8,Bad,Unemployed,Owned,Single,1
3,32,Male,High School,131067,29403,816,19019,60,8,Bad,Employed,Owned,Single,1
4,60,Male,PhD,38001,30032,673,16317,36,4,Average,Employed,Rented,Married,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,69,Male,Bachelor,124894,46777,364,43499,12,4,Bad,Self-Employed,Owned,Single,0
11996,20,Male,High School,137088,49119,758,44320,24,7,Average,Unemployed,Rented,Single,1
11997,66,Male,High School,26768,2823,523,37458,48,6,Bad,Employed,Owned,Married,1
11998,61,Female,PhD,79214,36508,775,31916,24,7,Average,Employed,Mortgaged,Divorced,1


In [11]:
df.isnull().sum()

Age                  0
Gender               0
Education            0
Income               0
Debt                 0
Credit_Score         0
Loan_Amount          0
Loan_Term            0
Num_Credit_Cards     0
Payment_History      0
Employment_Status    0
Residence_Type       0
Marital_Status       0
Creditworthiness     0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                12000 non-null  int64 
 1   Gender             12000 non-null  object
 2   Education          12000 non-null  object
 3   Income             12000 non-null  int64 
 4   Debt               12000 non-null  int64 
 5   Credit_Score       12000 non-null  int64 
 6   Loan_Amount        12000 non-null  int64 
 7   Loan_Term          12000 non-null  int64 
 8   Num_Credit_Cards   12000 non-null  int64 
 9   Payment_History    12000 non-null  object
 10  Employment_Status  12000 non-null  object
 11  Residence_Type     12000 non-null  object
 12  Marital_Status     12000 non-null  object
 13  Creditworthiness   12000 non-null  int64 
dtypes: int64(8), object(6)
memory usage: 1.3+ MB


In [13]:
#train test split
from sklearn.model_selection import train_test_split
x = df.drop(columns=['Creditworthiness'])
y = df['Creditworthiness']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train.head()

Unnamed: 0,Age,Gender,Education,Income,Debt,Credit_Score,Loan_Amount,Loan_Term,Num_Credit_Cards,Payment_History,Employment_Status,Residence_Type,Marital_Status
9182,58,Male,Bachelor,95691,48378,574,47591,60,7,Average,Unemployed,Mortgaged,Divorced
11091,43,Female,Master,98840,49187,587,24481,36,7,Bad,Employed,Mortgaged,Single
6428,44,Male,Master,115490,14608,362,37319,12,3,Good,Employed,Rented,Married
288,24,Male,Bachelor,84366,5713,741,31081,48,9,Bad,Unemployed,Mortgaged,Single
2626,46,Female,High School,58829,9359,504,1472,36,2,Average,Self-Employed,Rented,Single


In [14]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer

transformers = ColumnTransformer(
      transformers=[
                  ('tnf1',OrdinalEncoder(),['Gender']),
                  ('tnf2',OneHotEncoder(drop='first'),['Education','Payment_History','Residence_Type','Marital_Status','Employment_Status'])
      ],
      remainder='passthrough')

In [15]:
print(x_train.shape)
x_train_transformed = transformers.fit_transform(x_train)
x_test_transformed = transformers.transform(x_test)

print(x_train_transformed.shape)
print(x_test_transformed.shape)

(9600, 13)
(9600, 19)
(2400, 19)


In [16]:
df_new = pd.DataFrame(x_train_transformed, columns=transformers.get_feature_names_out())
df_new

Unnamed: 0,tnf1__Gender,tnf2__Education_High School,tnf2__Education_Master,tnf2__Education_PhD,tnf2__Payment_History_Bad,tnf2__Payment_History_Good,tnf2__Residence_Type_Owned,tnf2__Residence_Type_Rented,tnf2__Marital_Status_Married,tnf2__Marital_Status_Single,tnf2__Employment_Status_Self-Employed,tnf2__Employment_Status_Unemployed,remainder__Age,remainder__Income,remainder__Debt,remainder__Credit_Score,remainder__Loan_Amount,remainder__Loan_Term,remainder__Num_Credit_Cards
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,58.0,95691.0,48378.0,574.0,47591.0,60.0,7.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,43.0,98840.0,49187.0,587.0,24481.0,36.0,7.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,44.0,115490.0,14608.0,362.0,37319.0,12.0,3.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,24.0,84366.0,5713.0,741.0,31081.0,48.0,9.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,46.0,58829.0,9359.0,504.0,1472.0,36.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9595,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,36.0,83402.0,49145.0,697.0,34651.0,48.0,9.0
9596,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,49.0,111392.0,36195.0,770.0,19187.0,60.0,5.0
9597,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,39.0,128502.0,4979.0,493.0,40153.0,36.0,4.0
9598,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,51.0,128157.0,20859.0,330.0,13504.0,24.0,9.0


In [17]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(x_train_transformed, y_train)
y_pred_lr = lr_model.predict(x_test_transformed)



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
#Evolution for LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score
f1_Score_lr = f1_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_pred_lr)
print(f"Logistic Regression - \n F1 Score: {f1_Score_lr}\n Precision: {precision_lr}\n Recall: {recall_lr}\n ROC AUC: {roc_auc_lr}")

Logistic Regression - 
 F1 Score: 0.8246816846229187
 Precision: 0.7016666666666667
 Recall: 1.0
 ROC AUC: 0.5
