In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.preprocessing import RobustScaler,  OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
# Load the raw loan dataset from the project-relative data folder and
# show it as the cell output.
df = pd.read_csv(filepath_or_buffer="./data_baru/new_data.csv")
df

Unnamed: 0.1,Unnamed: 0,net_income,loan_amount,avg_amounts_previous_bills,avg_previous_payment,amount_of_late,late_payment_amount,credit_score,arrears_amounts,aging,employment_type,loss_reverse
0,0,15500000.0,1.562500e+07,29500000.0,24350000.0,2.0,1.0,670.0,0.0,DPK,Retired,0.000000e+00
1,1,17500000.0,1.770833e+07,3000000.0,1600000.0,31.0,3.0,402.0,1.0,DPK,Full-time,1.770833e+07
2,2,17500000.0,1.483333e+07,13500000.0,9050000.0,0.0,0.0,505.0,0.0,Lancar,Full-time,0.000000e+00
3,3,15500000.0,1.529167e+07,10500000.0,8350000.0,64.0,5.0,508.0,1.0,DPK,Retired,1.529167e+07
4,4,8700000.0,1.404167e+07,6500000.0,5750000.0,14.0,2.0,371.0,0.0,DPK,Contract,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,15500000.0,2.200000e+07,22000000.0,12900000.0,106.0,7.0,272.0,2.0,Kurang lancar,Retired,4.400000e+07
4996,4996,19000000.0,1.275000e+07,2000000.0,1600000.0,25.0,1.0,464.0,1.0,DPK,Full-time,1.275000e+07
4997,4997,12700000.0,1.537500e+07,10500000.0,10050000.0,53.0,4.0,369.0,1.0,DPK,Contract,1.537500e+07
4998,4998,13700000.0,1.475000e+07,8000000.0,4800000.0,27.0,1.0,266.0,1.0,DPK,Contract,1.475000e+07


In [3]:
# Print column names, non-null counts and dtypes to check the schema and
# spot missing values before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  5000 non-null   int64  
 1   net_income                  5000 non-null   float64
 2   loan_amount                 5000 non-null   float64
 3   avg_amounts_previous_bills  5000 non-null   float64
 4   avg_previous_payment        5000 non-null   float64
 5   amount_of_late              5000 non-null   float64
 6   late_payment_amount         5000 non-null   float64
 7   credit_score                5000 non-null   float64
 8   arrears_amounts             5000 non-null   float64
 9   aging                       5000 non-null   object 
 10  employment_type             5000 non-null   object 
 11  loss_reverse                5000 non-null   float64
dtypes: float64(9), int64(1), object(2)
memory usage: 468.9+ KB


In [4]:
# Cast every text (object-dtype) column to pandas' categorical dtype.
# Building the mapping first and calling `astype` once avoids mutating the
# frame column-by-column inside a loop.
object_columns = [col for col in df.columns if df[col].dtype == "object"]
df = df.astype({col: "category" for col in object_columns})

# Drop the leftover index column written by a previous `to_csv`.
# `errors="ignore"` keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Preprocessing for categorical features: fill gaps with the most frequent
# value, then map each category to an integer code.
# `handle_unknown="use_encoded_value"` makes categories that were not present
# in the training split encode as -1 at predict time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Preprocessing for numeric features: fill gaps with the median, then scale
# with RobustScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Column groups routed to each transformer below.
numeric_features = [
    "net_income",
    "loan_amount",
    "avg_amounts_previous_bills",
    "avg_previous_payment",
    "amount_of_late",
    "late_payment_amount",
    "credit_score",
    "arrears_amounts",
]
categorical_features = ["aging", "employment_type"]

# Combine both transformers with a ColumnTransformer. Columns not listed in
# either group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [6]:
# Separate the features from the regression target.
X = df.drop("loss_reverse", axis=1)
y = df["loss_reverse"]

# Scale the target with RobustScaler; `scaler` is kept so values can be
# mapped back to the original units later with `scaler.inverse_transform`.
# NOTE(review): the scaler is fit on the full target before the split, so
# test-set statistics leak into the scaling — consider fitting on y_train only.
scaler = RobustScaler()
y = pd.Series(scaler.fit_transform(y.to_numpy().reshape(-1, 1)).ravel())

# Fix the column order to numeric features first, then categorical ones.
column_order = numeric_features + categorical_features
X = X[column_order]

# A fixed seed makes the 80/20 split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# "bagging2.pkl" if it comes from a trusted source.
bagging = joblib.load("bagging2.pkl")
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', bagging)
])

model.fit(X_train, y_train)

In [7]:
# Display the training features to confirm the column order and the rows
# selected by the split.
X_train

Unnamed: 0,net_income,loan_amount,avg_amounts_previous_bills,avg_previous_payment,amount_of_late,late_payment_amount,credit_score,arrears_amounts,aging,employment_type
2695,24500000.0,1.616667e+07,13500000.0,7550000.0,51.0,3.0,571.0,1.0,DPK,Full-time
1539,15500000.0,2.070833e+07,29000000.0,28800000.0,51.0,3.0,430.0,1.0,DPK,Retired
2905,17500000.0,1.575000e+07,9500000.0,7350000.0,99.0,7.0,175.0,3.0,Kurang lancar,Full-time
4168,10500000.0,1.420833e+07,9000000.0,5500000.0,34.0,3.0,396.0,1.0,DPK,Contract
449,10500000.0,1.400000e+07,12000000.0,6000000.0,19.0,2.0,301.0,1.0,DPK,Retired
...,...,...,...,...,...,...,...,...,...,...
4345,13000000.0,1.504167e+07,19000000.0,17900000.0,30.0,3.0,326.0,1.0,DPK,Retired
25,13000000.0,1.491667e+07,12500000.0,10650000.0,0.0,0.0,474.0,0.0,Lancar,Retired
200,5700000.0,1.466667e+07,17500000.0,10450000.0,121.0,5.0,370.0,2.0,Diragukan,Contract
3372,26500000.0,3.450000e+07,38500000.0,21450000.0,2.0,2.0,644.0,0.0,DPK,Full-time


In [8]:
# Score the fitted pipeline on the hold-out split. `y_test` is the scaled
# target at this point, so every error below is in scaled units.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r_square = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

# Assemble the report first, then emit it in a single print call.
report = f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """
print(report)


      Mean Squared Errror : 0.03
      Mean Absolute Error : 0.02
      Mean Absolute Percentage Error : 0.01
      Root Mean Squared Error : 0.18
      R_Squared : 0.98
      


In [9]:
# Persisting the fitted pipeline is intentionally disabled.
# NOTE(review): the next cell loads "loss_reverse.joblib" and its prediction is
# in original (unscaled) units, while `model` above is fit on the scaled
# target — so the file on disk presumably comes from a different training
# run. Re-enabling this line would overwrite that artifact; confirm first.
# joblib.dump(model, "loss_reverse.joblib")

In [10]:
# Load the persisted pipeline from disk. This rebinds `model`, replacing the
# pipeline fitted earlier in this notebook.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# files from a trusted source.
model = joblib.load("loss_reverse.joblib")

# Take the last test row as a one-row DataFrame. Double-bracket `iloc` keeps
# the original column dtypes (including the categorical columns), whereas
# rebuilding the frame from `X_test.iloc[-1]` goes through an object-dtype
# Series and loses them.
a = X_test.iloc[[-1]]
model.predict(a)

array([15041666.66666667])

In [11]:
# Map the scaled test target back to original units for comparison with the
# prediction above. The result is stored under a new name, not written back
# to `y_test`, so this cell is idempotent (re-running it does not apply the
# inverse transform twice) and the metrics cell can still be re-run against
# the scaled target. The original index is kept so rows stay traceable.
y_test_original = pd.Series(
    scaler.inverse_transform(y_test.to_numpy().reshape(-1, 1)).ravel(),
    index=y_test.index,
)
y_test_original.iloc[-1]

15041666.666666666