In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, BaggingRegressor
from lazypredict.Supervised import LazyRegressor

In [2]:
df = pd.read_csv("./data/data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,sms or WA,13,46800000,123100000,99948,38.0,1,laki-laki,Menikah,45
1,sms or WA,10,52400000,58300000,72633,90.0,1,perempuan,Cerai mati,32
2,sms or WA,13,21300000,127900000,79992,17.0,2,laki-laki,Cerai hidup,62
3,sms or WA,12,47600000,78400000,83538,61.0,1,perempuan,Cerai mati,55
4,sms or WA,19,16700000,31600000,133245,53.0,0,perempuan,Menikah,53


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual"]])
df2[["bill_amount_collected", "total_actual"]] = scaler.transform(df[["bill_amount_collected", "total_actual"]])
df2[["time_to_collect", "debtor_volume_handled", "collector_age", "success_rate"]] = df[["time_to_collect", "debtor_volume_handled", "collector_age", "success_rate"]]
y = df["total_cost"]

df2.head(5)

Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,time_to_collect,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.25,-0.26,1,13,45,38.0
1,2,1,2,-0.22,-0.4,1,10,32,90.0
2,2,0,1,-0.35,-0.25,2,13,62,17.0
3,2,1,2,-0.24,-0.36,1,12,55,61.0
4,2,1,3,-0.37,-0.46,0,19,53,53.0


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  5%|▍         | 2/42 [00:00<00:05,  7.02it/s]

100%|██████████| 42/42 [00:06<00:00,  6.53it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 732
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] Start training from score 936858.023750





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RandomForestRegressor,0.96,0.96,196053.09,1.33
GradientBoostingRegressor,0.96,0.96,199970.86,0.36
AdaBoostRegressor,0.95,0.96,202727.1,0.11
ExtraTreesRegressor,0.95,0.96,204606.92,0.94
LGBMRegressor,0.95,0.95,206764.55,0.13
HistGradientBoostingRegressor,0.95,0.95,208177.11,0.79
XGBRegressor,0.95,0.95,211459.78,0.17
BaggingRegressor,0.95,0.95,211820.44,0.16
DecisionTreeRegressor,0.92,0.92,271277.4,0.03
OrthogonalMatchingPursuitCV,0.92,0.92,276906.35,0.03


In [8]:
y.describe()

count      1000.00
mean     947289.71
std      938523.75
min        5529.00
25%      120380.25
50%      565866.50
75%     1499679.00
max     3534796.00
Name: total_cost, dtype: float64

In [9]:
def evaluate_performa(X_test, y_test, model1, model2, model3, model4, model5, model6):
    rate = [0, 0, 0, 0, 0, 0]
    #difference = [0, 0, 0, 0, 0]

    X_test = X_test.reset_index(drop=True)
    for i, _ in X_test.iterrows():
        X = np.array(X_test.iloc[[i]])
        y = np.array(y_test.iloc[[i]])
        difference = [0, 0, 0, 0, 0, 0]
        difference[0] = abs(model1.predict(X) - y[0])
        difference[1] = abs(model2.predict(X) - y[0])
        difference[2] = abs(model3.predict(X) - y[0])
        difference[3] = abs(model4.predict(X) - y[0])
        difference[4] = abs(model5.predict(X) - y[0])
        difference[5] = abs(model6.predict(X) - y[0])
        print(difference)

        a = int(np.argmin(difference))
        print(a)
        for i in range(len(difference)):
            if difference[a] == difference[i]:
                rate[i] += 1
    
    a = f"Best model : {rate.index(max(rate))+1}"
    return a, rate

In [10]:

forest = RandomForestRegressor()
gradient = GradientBoostingRegressor()
ada = AdaBoostRegressor()
extra = ExtraTreesRegressor()
hist = HistGradientBoostingRegressor()
bagging = BaggingRegressor()

forest.fit(X_train, y_train)
gradient.fit(X_train, y_train)
ada.fit(X_train, y_train)
extra.fit(X_train, y_train)
hist.fit(X_train, y_train)
bagging.fit(X_train, y_train)

a = evaluate_performa(X_test, y_test, forest, gradient, ada, extra, hist, bagging)
a

[array([144545.99]), array([73727.36149598]), array([183622.18421053]), array([145757.81]), array([91033.58963385]), array([153116.5])]
1
[array([150297.45]), array([118418.5007438]), array([12128.64]), array([137551.85]), array([99134.11797862]), array([173253.8])]
2
[array([21106.5]), array([14052.01986792]), array([157163.64]), array([10948.84]), array([22494.94744569]), array([36595.8])]
3
[array([64656.75]), array([30638.80777438]), array([24026.15441176]), array([117523.55]), array([34425.70292903]), array([36482.2])]
2
[array([166533.43]), array([135716.70596939]), array([96198.79411765]), array([173471.04]), array([208884.74577896]), array([180027.2])]
2
[array([215072.5]), array([209415.67221664]), array([155740.35078534]), array([239656.43]), array([186574.99246045]), array([168033.6])]
2
[array([39272.87]), array([91438.3718684]), array([246213.76237624]), array([32785.58]), array([57953.48784686]), array([66860.1])]
3
[array([126660.67]), array([108953.42142725]), array([11

('Best model : 5', [18, 29, 34, 33, 47, 39])