In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI Collection and Loss Reverse Forecast\data_preparation\EDA\Univariate_bivariate_multivariate\Univariate_Analysis\new_kolektor.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1000 non-null   int64  
 1   time_to_collect        1000 non-null   int64  
 2   avg_bill_methods       1000 non-null   object 
 3   debtor_volume_handled  1000 non-null   int64  
 4   bill_amount_collected  1000 non-null   int64  
 5   total_actual           1000 non-null   int64  
 6   total_cost             1000 non-null   int64  
 7   success_rate           1000 non-null   float64
dtypes: float64(1), int64(6), object(1)
memory usage: 62.6+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,time_to_collect,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate
0,0,sms or WA,7,14400000,35700000,277706,40.0
1,11,panggilan,35,79000000,160500000,22382611,49.0
2,4,sms or WA,6,7100000,26700000,205389,27.0
3,29,datang ke tempat,63,782000000,1395000000,482942548,56.0
4,22,datang ke tempat,35,346000000,485500000,151035603,71.0


In [5]:
bill_methods = {"sms or WA": 0, "surat panggilan": 1, "panggilan": 2, "datang ke tempat": 3}
columns = {"avg_bill_methods": bill_methods}
for i in df.columns:
    if df[i].dtype == "category" or df[i].dtype == "object":
        df[i] = [columns[i][j] for j in df[i]]

y = df["time_to_collect"]
df.drop("time_to_collect", axis=1, inplace=True)             
scaler = RobustScaler().fit(df[[i for i in df.columns if i not in columns]])
df2 = pd.DataFrame(scaler.transform(df[[i for i in df.columns if i not in columns]]), columns=df[[i for i in df.columns if i not in columns]].columns)
for i in columns:
    df2[i] = df[i]
df2.head(5)

Unnamed: 0,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,avg_bill_methods
0,-1.347826,-0.459398,-0.520698,-0.252668,-0.363636,0
1,-0.130435,-0.282327,-0.344973,-0.162494,-0.159091,2
2,-1.391304,-0.479408,-0.533371,-0.252963,-0.659091,0
3,1.086957,1.644624,1.39327,1.716301,0.0,3
4,-0.130435,0.449531,0.112644,0.36233,0.340909,3


In [6]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 6), (200, 6), (800,), (200,))

In [7]:
model = BaggingRegressor()
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {mape:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 10.95
      Mean Absolute Error : 2.55
      Mean Absolute Percentage Error : 220676381741154.47
      Root Mean Squared Error : 220676381741154.47
      R_Squared : 0.85
      


In [9]:

# Tentukan model Bagging Regressor dengan estimator dasar (base estimator) yang sesuai
base_estimator = DecisionTreeRegressor()  # Contoh, Anda dapat mengganti ini dengan algoritma regresi yang lain
bagging_regressor = BaggingRegressor(base_estimator, n_estimators=10)  # Anda dapat menyesuaikan n_estimators sesuai kebutuhan

# Tentukan tipe cross-validation dan hitung skor model
cv_scores = cross_val_score(bagging_regressor, X, y, cv=5)  # Ganti cv dengan jumlah fold yang diinginkan

# Tampilkan hasil skor cross-validation
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.85920538 0.88030542 0.86950014 0.87818153 0.83259496]
Mean CV Score: 0.8639574853323595


In [10]:
model = BaggingRegressor(DecisionTreeRegressor())
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {mape:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 11.06
      Mean Absolute Error : 2.57
      Mean Absolute Percentage Error : 195906583790616.75
      Root Mean Squared Error : 195906583790616.75
      R_Squared : 0.85
      


In [12]:
param = {
    "base_estimator": [LinearRegression(), DecisionTreeRegressor(), SVR()],
    "n_estimators": np.arange(10, 101)
}

grid = GridSearchCV(estimator=BaggingRegressor(), param_grid=param, cv=5)
grid.fit(X_train, y_train)



In [13]:
grid.best_score_

0.8765080404857096

In [14]:
model2 = grid.best_estimator_
y_pred = model2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {mape:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 9.44
      Mean Absolute Error : 2.46
      Mean Absolute Percentage Error : 196809607419185.59
      Root Mean Squared Error : 196809607419185.59
      R_Squared : 0.87
      


In [15]:
import pickle

pickle.dump(model, open("bagging.pkl", "wb"))