In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from lazypredict.Supervised import LazyRegressor


In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\data_preparation\EDA\Univariate_bivariate_multivariate\Univariate_Analysis\new_kolektor.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   time_to_collect           1000 non-null   int64  
 2   avg_bill_methods          1000 non-null   object 
 3   debtor_volume_handled     1000 non-null   int64  
 4   bill_amount_collected     1000 non-null   int64  
 5   total_actual              1000 non-null   int64  
 6   total_cost                1000 non-null   int64  
 7   success_rate              1000 non-null   float64
 8   collector_age             1000 non-null   int64  
 9   collector_marital_status  1000 non-null   object 
 10  collector_gender          1000 non-null   object 
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,time_to_collect,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,collector_age,collector_marital_status,collector_gender
0,0,sms or WA,7,14400000,35700000,277706,40.0,45,Menikah,laki-laki
1,11,panggilan,35,79000000,160500000,22382611,49.0,32,Cerai mati,perempuan
2,4,sms or WA,6,7100000,26700000,205389,27.0,62,Cerai hidup,laki-laki
3,29,datang ke tempat,63,782000000,1395000000,482942548,56.0,55,Cerai mati,perempuan
4,22,datang ke tempat,35,346000000,485500000,151035603,71.0,53,Menikah,perempuan


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
datang ke tempat    481
sms or WA           198
panggilan           162
surat panggilan     159
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["bill_amount_collected", "total_actual", "total_cost"]] = scaler.transform(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["debtor_volume_handled", "collector_age", "success_rate"]] = df[["debtor_volume_handled", "collector_age", "success_rate"]]
y = df["time_to_collect"]

df2.head(5)

Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,total_cost,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.46,-0.52,-0.25,7,45,40.0
1,1,1,2,-0.28,-0.34,-0.16,35,32,49.0
2,2,0,1,-0.48,-0.53,-0.25,6,62,27.0
3,0,1,2,1.64,1.39,1.72,63,55,56.0
4,0,1,3,0.45,0.11,0.36,35,53,71.0


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

 12%|█▏        | 5/42 [00:00<00:04,  7.99it/s]

100%|██████████| 42/42 [00:07<00:00,  5.59it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 984
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] Start training from score 14.565000





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.87,0.88,2.97,0.22
GradientBoostingRegressor,0.86,0.87,3.09,0.6
RandomForestRegressor,0.86,0.87,3.11,1.59
ExtraTreesRegressor,0.86,0.87,3.13,0.81
BaggingRegressor,0.85,0.85,3.26,0.29
XGBRegressor,0.85,0.85,3.27,0.21
LGBMRegressor,0.84,0.85,3.33,0.2
HistGradientBoostingRegressor,0.84,0.84,3.38,0.91
KNeighborsRegressor,0.83,0.84,3.44,0.02
NuSVR,0.83,0.84,3.47,0.06


In [8]:
def evaluate_performa(X_test, y_test, model1, model2, model3, model4, model5):
    rate = [0, 0, 0, 0, 0]
    #difference = [0, 0, 0, 0, 0]

    X_test = X_test.reset_index(drop=True)
    for i, _ in X_test.iterrows():
        X = np.array(X_test.iloc[[i]])
        y = np.array(y_test.iloc[[i]])
        difference = [0, 0, 0, 0, 0]
        difference[0] = abs(model1.predict(X) - y[0])
        difference[1] = abs(model2.predict(X) - y[0])
        difference[2] = abs(model3.predict(X) - y[0])
        difference[3] = abs(model4.predict(X) - y[0])
        difference[4] = abs(model5.predict(X) - y[0])
        print(difference)

        a = int(np.argmin(difference))
        print(a)
        for i in range(len(difference)):
            if difference[a] == difference[i]:
                rate[i] += 1
    
    a = f"Best model : {rate.index(max(rate))+1}"
    return a, rate

In [9]:
extra = ExtraTreesRegressor()
forest = RandomForestRegressor()
ada = AdaBoostRegressor()
gradient = GradientBoostingRegressor()
bagging = BaggingRegressor()

extra.fit(X_train, y_train)
forest.fit(X_train, y_train)
ada.fit(X_train, y_train)
gradient.fit(X_train, y_train)
bagging.fit(X_train, y_train)

a = evaluate_performa(X_test, y_test, extra, forest, gradient, bagging, ada)
a

[array([3.1]), array([3.7]), array([2.82187853]), array([3.]), array([3.31564246])]
2
[array([8.37]), array([7.41]), array([7.64720435]), array([7.1]), array([6.82012195])]
4
[array([2.58]), array([2.26]), array([2.38621604]), array([1.4]), array([2.46099291])]
3
[array([1.59]), array([1.6]), array([0.64123183]), array([1.1]), array([1.41772152])]
2
[array([2.54]), array([2.41]), array([1.95281579]), array([2.9]), array([2.13432836])]
2
[array([7.83]), array([7.12]), array([6.88444]), array([7.7]), array([6.68435754])]
4
[array([1.83]), array([1.84]), array([1.97930781]), array([2.9]), array([2.08108108])]
0
[array([1.1]), array([1.]), array([0.55668441]), array([1.2]), array([0.86567164])]
2
[array([0.5]), array([0.1]), array([1.86375113]), array([1.3]), array([0.45355191])]
1
[array([3.22]), array([2.81]), array([2.63975143]), array([2.2]), array([2.88235294])]
3
[array([0.1]), array([0.34]), array([0.05498302]), array([1.1]), array([0.41772152])]
2
[array([0.03]), array([0.23]), arr

('Best model : 5', [43, 25, 38, 45, 50])