In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline  
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
# Read the collector dataset into a DataFrame and summarise its columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
csv_path = r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\data\kolektor.csv"
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   collector_name            1000 non-null   object 
 2   collector_address         1000 non-null   object 
 3   collector_number          1000 non-null   object 
 4   collector_nik             1000 non-null   int64  
 5   time_to_collect           1000 non-null   int64  
 6   avg_bill_methods          1000 non-null   object 
 7   debtor_volume_handled     1000 non-null   int64  
 8   bill_amount_collected     1000 non-null   int64  
 9   total_actual              1000 non-null   int64  
 10  total_cost                1000 non-null   int64  
 11  success_rate              1000 non-null   float64
 12  collector_zip             1000 non-null   int64  
 13  collector_rt              1000 non-null   int64  
 14  collector

In [3]:
# Convert every object-dtype column to pandas' categorical dtype.
object_columns = [column for column in df.columns if df[column].dtype == "object"]
df = df.astype({column: "category" for column in object_columns})

# Remove the "Unnamed: 0" column. `errors="ignore"` together with rebinding
# `df` (instead of `inplace=True`) keeps this cell safe to re-run: a second
# execution no longer raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Re-inspect the frame after the dtype conversion and column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   collector_name            1000 non-null   category
 1   collector_address         1000 non-null   category
 2   collector_number          1000 non-null   category
 3   collector_nik             1000 non-null   int64   
 4   time_to_collect           1000 non-null   int64   
 5   avg_bill_methods          1000 non-null   category
 6   debtor_volume_handled     1000 non-null   int64   
 7   bill_amount_collected     1000 non-null   int64   
 8   total_actual              1000 non-null   int64   
 9   total_cost                1000 non-null   int64   
 10  success_rate              1000 non-null   float64 
 11  collector_zip             1000 non-null   int64   
 12  collector_rt              1000 non-null   int64   
 13  collector_rw              1000 non-null   int64  

In [5]:
# Column groups, selected by name from the input frame.
numeric_features = ['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'success_rate', 'collector_age', 'time_to_collect']
categorical_features = ['avg_bill_methods', 'collector_gender', 'collector_marital_status']
# Columns removed before they reach the regressor.
drop_features = ["collector_name", "collector_number", "collector_nik", "collector_zip",
                 "collector_rt", "collector_rw", "collector_birth_place", "collector_address"]

# Preprocessing for categorical features: fill missing values with the most
# frequent value, then encode each category as an integer.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories that were not seen during fit are encoded as -1 instead of
    # raising at predict time (e.g. a differently spelled hand-typed value).
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Preprocessing for numeric features: median imputation followed by robust scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Combine the per-group transformers with a ColumnTransformer. Any column not
# listed in one of the three groups is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('drop', 'drop', drop_features)
    ],
    remainder='passthrough'
)


In [6]:
# Echo the three column groups handed to the preprocessor, one list per line.
for feature_group in (numeric_features, categorical_features, drop_features):
    print(feature_group)

['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'success_rate', 'collector_age', 'time_to_collect']
['avg_bill_methods', 'collector_gender', 'collector_marital_status']
['collector_name', 'collector_number', 'collector_nik', 'collector_zip', 'collector_rt', 'collector_rw', 'collector_birth_place', 'collector_address']


In [7]:
# Separate the regression target from the feature columns.
target_column = "total_cost"
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
# Load the stored regressor object from disk.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
hist = joblib.load("gradient2.pkl")

# Chain preprocessing and the regressor into one estimator, then fit it on the
# training split. The fit call is the last expression so the notebook displays
# the fitted pipeline.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", hist),
]
model = Pipeline(steps=pipeline_steps)
model.fit(X_train, y_train)


In [9]:
# Spot-check: the pipeline's prediction for the first row of the held-out split.
model.predict(X_test)[0]

364697.46575002023

In [10]:
# Actual target value for the first held-out row, for comparison with the prediction.
y_test.iloc[0]

374773

In [11]:
# Persist the fitted pipeline (preprocessor + regressor) to disk.
joblib.dump(model, "total_cost.joblib")

['total_cost.joblib']

In [12]:
# Show the feature values of the first held-out row.
# NOTE(review): the displayed row includes a collector name, phone number and
# NIK; confirm the data is synthetic before sharing this output.
X_test.iloc[0]

collector_name                                                 Susan Martinez
collector_address           Gg. Ahmad Dahlan No. 253\nSerang, Kalimantan B...
collector_number                                             +62-678-369-8793
collector_nik                                                4222058146794105
time_to_collect                                                             9
avg_bill_methods                                              surat panggilan
debtor_volume_handled                                                      24
bill_amount_collected                                                18500000
total_actual                                                        175500000
success_rate                                                             11.0
collector_zip                                                           85262
collector_rt                                                                5
collector_rw                                                    

In [13]:
# Manually specified feature values for a single prediction example.
collector_age = 36
collector_gender = "perempuan"
collector_marital_status = "Cerai hidup"
avg_bill_methods = "surat panggilan"
time_to_collect = 8
debtor_volume_handled = 20
bill_amount_collected = 117500000
total_actual = 276500000
success_rate = 42.0

# One record becomes a one-row DataFrame; the key order fixes the column order.
sample_record = {
    "avg_bill_methods": avg_bill_methods,
    "debtor_volume_handled": debtor_volume_handled,
    "bill_amount_collected": bill_amount_collected,
    "total_actual": total_actual,
    "success_rate": success_rate,
    "time_to_collect": time_to_collect,
    "collector_gender": collector_gender,
    "collector_marital_status": collector_marital_status,
    "collector_age": collector_age,
}
a = pd.DataFrame([sample_record])

a

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,surat panggilan,20,117500000,276500000,42.0,8,perempuan,Cerai hidup,36


In [14]:
# Predict total_cost for the hand-built single-row frame `a`.
model.predict(a)

array([309494.56672238])

In [15]:
# Actual target of the first held-out row.
# NOTE(review): the hand-built frame `a` uses different feature values than
# X_test.iloc[0] (e.g. debtor_volume_handled 20 vs 24), so this value is not
# the ground truth for the prediction made on `a`.
y_test.iloc[0]

374773

In [16]:
# Evaluate the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# scikit-learn returns MAPE as a fraction (0.35 means 35%), so it is rendered
# with the percent format below to match its label.
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** 0.5
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Error : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2%}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 39489068318.51
      Mean Absolute Error : 135379.10
      Mean Absolute Percentage Error : 0.35
      Root Mean Squared Error : 198718.57
      R_Squared : 0.95
      
