In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.preprocessing import RobustScaler,  OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
df = pd.read_csv("./data_baru/new_data.csv")
df

Unnamed: 0.1,Unnamed: 0,number_of_dependents,net_income,loan_amount,amount_of_late,late_payment_amount,credit_score,arrears_amounts,aset,tenor,debtor_education_level,aging,loss_reverse
0,0,3.0,15500000.0,6.250000e+07,2.0,1.0,670.0,0.0,11000000.0,21.0,D3,DPK,0.000000e+00
1,1,3.0,17500000.0,7.083333e+07,31.0,3.0,402.0,1.0,17000000.0,18.0,D3,DPK,1.770833e+07
2,2,4.0,17500000.0,5.933333e+07,0.0,0.0,505.0,0.0,14500000.0,17.0,D2,Lancar,0.000000e+00
3,3,3.0,15500000.0,6.116667e+07,64.0,5.0,508.0,1.0,11500000.0,17.0,D2,DPK,1.529167e+07
4,4,5.0,8700000.0,5.616667e+07,14.0,2.0,371.0,0.0,5500000.0,15.0,SMA,DPK,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,4.0,15500000.0,8.800000e+07,106.0,7.0,272.0,2.0,15000000.0,10.0,D3,Kurang lancar,4.400000e+07
4996,4996,4.0,19000000.0,5.100000e+07,25.0,1.0,464.0,1.0,9500000.0,17.0,S1,DPK,1.275000e+07
4997,4997,6.0,12700000.0,6.150000e+07,53.0,4.0,369.0,1.0,11500000.0,7.0,SMA,DPK,1.537500e+07
4998,4998,6.0,13700000.0,5.900000e+07,27.0,1.0,266.0,1.0,6500000.0,12.0,SMA,DPK,1.475000e+07


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              5000 non-null   int64  
 1   number_of_dependents    5000 non-null   float64
 2   net_income              5000 non-null   float64
 3   loan_amount             5000 non-null   float64
 4   amount_of_late          5000 non-null   float64
 5   late_payment_amount     5000 non-null   float64
 6   credit_score            5000 non-null   float64
 7   arrears_amounts         5000 non-null   float64
 8   aset                    5000 non-null   float64
 9   tenor                   5000 non-null   float64
 10  debtor_education_level  5000 non-null   object 
 11  aging                   5000 non-null   object 
 12  loss_reverse            5000 non-null   float64
dtypes: float64(10), int64(1), object(2)
memory usage: 507.9+ KB


In [4]:
# Preprocessing untuk fitur kategori
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Preprocessing untuk fitur numerik
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
numeric_features = ['net_income', 'loan_amount', 'credit_score', 'arrears_amounts', 'aset', 'tenor']
categorical_features = ['debtor_education_level', 'aging']

# Gabungkan transformer dengan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
)


In [5]:
X = df.drop("loss_reverse", axis=1)
y = df["loss_reverse"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

bagging = joblib.load("bagging3.pkl")
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", bagging)
])

model.fit(X_train, y_train)

In [6]:
X = df.drop("loss_reverse", axis=1)
y = df["loss_reverse"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
lasso = joblib.load("lasso.pkl")
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso)
])

model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [7]:
X_train

Unnamed: 0.1,Unnamed: 0,number_of_dependents,net_income,loan_amount,amount_of_late,late_payment_amount,credit_score,arrears_amounts,aset,tenor,debtor_education_level,aging
1203,1203,3.0,17500000.0,5.100000e+07,49.0,3.0,424.0,1.0,8000000.0,17.0,D3,DPK
2139,2139,3.0,17500000.0,5.816667e+07,118.0,6.0,150.0,3.0,5000000.0,2.0,D1,Kurang lancar
295,295,4.0,13000000.0,6.116667e+07,39.0,3.0,340.0,1.0,7000000.0,9.0,S2,DPK
825,825,4.0,10500000.0,5.733333e+07,98.0,4.0,330.0,2.0,4000000.0,7.0,D3,Kurang lancar
4314,4314,4.0,5500000.0,6.033333e+07,47.0,2.0,303.0,1.0,6500000.0,18.0,D1,DPK
...,...,...,...,...,...,...,...,...,...,...,...,...
1383,1383,5.0,17500000.0,4.383333e+07,23.0,2.0,526.0,1.0,7000000.0,22.0,D2,DPK
119,119,3.0,22000000.0,8.650000e+07,52.0,4.0,631.0,1.0,22500000.0,15.0,S1,DPK
2345,2345,2.0,10500000.0,5.983333e+07,14.0,1.0,394.0,0.0,4000000.0,10.0,D2,DPK
315,315,3.0,17500000.0,5.666667e+07,111.0,5.0,482.0,1.0,9000000.0,15.0,D2,Kurang lancar


In [8]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Error : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error: {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Error : 32916676697002.67
      Mean Absolute Error : 2788735.80
      Mean Absolute Percentage Error : 3311635862268772614144.00
      Root Mean Squared Error: 5737305.70
      R_Squared : 0.91
      


In [9]:
joblib.dump(model, "loss_reverse2.joblib")

['loss_reverse2.joblib']

In [10]:
model = joblib.load("loss_reverse2.joblib")
a = X_test.iloc[-1]

a = pd.DataFrame({i: [j] for i, j in zip(a.index, a.values)})
model.predict(a)

array([12883542.49099617])

In [11]:
y_test.iloc[-1]

14333333.333333334