In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, BaggingRegressor
from lazypredict.Supervised import LazyRegressor

In [2]:
df = pd.read_csv("./data/data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,sms or WA,10,61600000,67200000,72549,92.0,0,laki-laki,Menikah,45
1,sms or WA,15,32800000,140200000,86235,23.0,3,perempuan,Cerai mati,32
2,sms or WA,10,38400000,55300000,99363,69.0,1,laki-laki,Cerai hidup,62
3,sms or WA,9,32500000,57400000,76118,57.0,1,perempuan,Cerai mati,55
4,sms or WA,6,21000000,25400000,61461,83.0,4,perempuan,Menikah,53


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual"]])
df2[["bill_amount_collected", "total_actual"]] = scaler.transform(df[["bill_amount_collected", "total_actual"]])
df2[["time_to_collect", "debtor_volume_handled", "collector_age", "success_rate"]] = df[["time_to_collect", "debtor_volume_handled", "collector_age", "success_rate"]]
y = df["total_cost"]

df2.head(5)

Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,time_to_collect,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.19,-0.35,0,10,45,92.0
1,2,1,2,-0.33,-0.18,3,15,32,23.0
2,2,0,1,-0.3,-0.37,1,10,62,69.0
3,2,1,2,-0.33,-0.37,1,9,55,57.0
4,2,1,3,-0.39,-0.44,4,6,53,83.0


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

 14%|█▍        | 6/42 [00:00<00:02, 13.22it/s]

100%|██████████| 42/42 [00:07<00:00,  5.72it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 747
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] Start training from score 925718.230000





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,0.95,0.95,201483.06,1.46
BaggingRegressor,0.95,0.95,201938.42,0.2
GradientBoostingRegressor,0.95,0.95,204553.93,0.44
ExtraTreesRegressor,0.95,0.95,205119.45,1.14
AdaBoostRegressor,0.95,0.95,206283.75,0.31
LGBMRegressor,0.94,0.95,216599.81,0.15
HistGradientBoostingRegressor,0.94,0.94,222988.11,0.62
XGBRegressor,0.94,0.94,223451.02,0.28
OrthogonalMatchingPursuitCV,0.93,0.93,243019.9,0.04
DecisionTreeRegressor,0.93,0.93,243027.31,0.04


In [8]:
y.describe()

count      1000.00
mean     933656.32
std      910148.06
min        5624.00
25%      118628.50
50%      568615.00
75%     1502473.25
max     3582769.00
Name: total_cost, dtype: float64

In [9]:
def evaluate_performa(X_test, y_test, model1, model2, model3, model4, model5, model6):
    rate = [0, 0, 0, 0, 0, 0]
    #difference = [0, 0, 0, 0, 0]

    X_test = X_test.reset_index(drop=True)
    for i, _ in X_test.iterrows():
        X = np.array(X_test.iloc[[i]])
        y = np.array(y_test.iloc[[i]])
        difference = [0, 0, 0, 0, 0, 0]
        difference[0] = abs(model1.predict(X) - y[0])
        difference[1] = abs(model2.predict(X) - y[0])
        difference[2] = abs(model3.predict(X) - y[0])
        difference[3] = abs(model4.predict(X) - y[0])
        difference[4] = abs(model5.predict(X) - y[0])
        difference[5] = abs(model6.predict(X) - y[0])
        print(difference)

        a = int(np.argmin(difference))
        print(a)
        for i in range(len(difference)):
            if difference[a] == difference[i]:
                rate[i] += 1
    
    a = f"Best model : {rate.index(max(rate))+1}"
    return a, rate

In [10]:

forest = RandomForestRegressor()
gradient = GradientBoostingRegressor()
ada = AdaBoostRegressor()
extra = ExtraTreesRegressor()
hist = HistGradientBoostingRegressor()
bagging = BaggingRegressor()

forest.fit(X_train, y_train)
gradient.fit(X_train, y_train)
ada.fit(X_train, y_train)
extra.fit(X_train, y_train)
hist.fit(X_train, y_train)
bagging.fit(X_train, y_train)

a = evaluate_performa(X_test, y_test, forest, gradient, ada, extra, hist, bagging)
a

[array([3298.82]), array([1265.74639933]), array([84215.70731707]), array([12295.61]), array([7233.69384775]), array([21301.8])]
1
[array([152084.97]), array([153750.63074208]), array([145234.18235294]), array([119058.48]), array([111285.4938068]), array([110013.])]
5
[array([90501.17]), array([9113.48218561]), array([40263.74162679]), array([3177.13]), array([19229.2721338]), array([31777.7])]
3
[array([197044.97]), array([112672.8972059]), array([178743.13664596]), array([97080.47]), array([297043.22679753]), array([181717.2])]
3
[array([51453.87]), array([73004.19021668]), array([8088.29268293]), array([79440.61]), array([106498.88788907]), array([56093.8])]
2
[array([195462.29]), array([87722.05646912]), array([132063.76632302]), array([125799.96]), array([198826.60757628]), array([231878.3])]
1
[array([221313.85]), array([149798.70812022]), array([185061.34782609]), array([266668.59]), array([200290.69572162]), array([93057.2])]
5
[array([134522.15]), array([84930.84704718]), arra

('Best model : 2', [25, 44, 25, 38, 32, 36])