In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lazypredict.Supervised import LazyRegressor


In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\modelling\kolektor\regresi_for_total_cost\data\data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,sms or WA,10,61600000,67200000,72549,92.0,0,laki-laki,Menikah,45
1,sms or WA,15,32800000,140200000,86235,23.0,3,perempuan,Cerai mati,32
2,sms or WA,10,38400000,55300000,99363,69.0,1,laki-laki,Cerai hidup,62
3,sms or WA,9,32500000,57400000,76118,57.0,1,perempuan,Cerai mati,55
4,sms or WA,6,21000000,25400000,61461,83.0,4,perempuan,Menikah,53


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["bill_amount_collected", "total_actual", "total_cost"]] = scaler.transform(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["debtor_volume_handled", "collector_age", "success_rate"]] = df[["debtor_volume_handled", "collector_age", "success_rate"]]
y = df["time_to_collect"]

df2.head(5)

Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,total_cost,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.19,-0.35,-0.36,10,45,92.0
1,2,1,2,-0.33,-0.18,-0.35,15,32,23.0
2,2,0,1,-0.3,-0.37,-0.34,10,62,69.0
3,2,1,2,-0.33,-0.37,-0.36,9,55,57.0
4,2,1,3,-0.39,-0.44,-0.37,6,53,83.0


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

 17%|█▋        | 7/42 [00:00<00:03,  9.36it/s]

100%|██████████| 42/42 [00:09<00:00,  4.51it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 973
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] Start training from score 10.130000





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.92,0.92,2.28,0.11
GradientBoostingRegressor,0.92,0.92,2.28,0.68
RandomForestRegressor,0.91,0.92,2.29,1.73
ExtraTreesRegressor,0.91,0.91,2.36,0.89
BaggingRegressor,0.91,0.91,2.37,0.24
LGBMRegressor,0.91,0.91,2.38,0.17
HistGradientBoostingRegressor,0.91,0.91,2.39,1.36
KNeighborsRegressor,0.9,0.9,2.54,0.04
XGBRegressor,0.89,0.89,2.65,0.26
MLPRegressor,0.87,0.87,2.88,1.83


In [8]:
def evaluate_performa(X_test, y_test, model1, model2, model3, model4, model5, model6):
    rate = [0, 0, 0, 0, 0, 0]
    #difference = [0, 0, 0, 0, 0]

    X_test = X_test.reset_index(drop=True)
    for i, _ in X_test.iterrows():
        X = np.array(X_test.iloc[[i]])
        y = np.array(y_test.iloc[[i]])
        difference = [0, 0, 0, 0, 0, 0]
        difference[0] = abs(model1.predict(X) - y[0])
        difference[1] = abs(model2.predict(X) - y[0])
        difference[2] = abs(model3.predict(X) - y[0])
        difference[3] = abs(model4.predict(X) - y[0])
        difference[4] = abs(model5.predict(X) - y[0])
        difference[5] = abs(model6.predict(X) - y[0])
        print(difference)

        a = int(np.argmin(difference))
        print(a)
        for i in range(len(difference)):
            if difference[a] == difference[i]:
                rate[i] += 1
    
    a = f"Best model : {rate.index(max(rate))+1}"
    return a, rate

In [9]:

forest = RandomForestRegressor()
gradient = GradientBoostingRegressor()
ada = AdaBoostRegressor()
extra = ExtraTreesRegressor()
knn = KNeighborsRegressor()
bagging = BaggingRegressor()

forest.fit(X_train, y_train)
gradient.fit(X_train, y_train)
ada.fit(X_train, y_train)
extra.fit(X_train, y_train)
knn.fit(X_train, y_train)
bagging.fit(X_train, y_train)

a = evaluate_performa(X_test, y_test, forest, gradient, ada, extra, knn, bagging)
a

[array([1.93]), array([1.5405043]), array([1.4516129]), array([1.68]), array([3.]), array([2.])]
2
[array([2.18]), array([1.49695129]), array([1.49541284]), array([1.17]), array([3.6]), array([1.9])]
3
[array([1.76]), array([1.39770233]), array([1.43859649]), array([1.53]), array([2.8]), array([2.2])]
1
[array([0.44]), array([0.62174717]), array([0.49541284]), array([0.79]), array([1.8]), array([0.])]
5
[array([0.64]), array([0.33973639]), array([0.4516129]), array([0.53]), array([1.]), array([0.2])]
5
[array([0.36]), array([0.33392503]), array([0.56140351]), array([0.53]), array([6.4]), array([0.2])]
5
[array([1.14]), array([0.6470092]), array([0.56140351]), array([0.51]), array([3.2]), array([0.7])]
3
[array([0.33]), array([0.94997657]), array([0.5483871]), array([0.61]), array([1.6]), array([0.5])]
0
[array([7.24]), array([6.14572709]), array([6.39285714]), array([9.45]), array([17.]), array([9.1])]
1
[array([0.29]), array([0.12511431]), array([0.1958042]), array([0.26]), array([1.2

('Best model : 6', [24, 34, 36, 29, 36, 42])