In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lazypredict.Supervised import LazyRegressor


In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\modelling\kolektor\regresi_for_total_cost\data\data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,sms or WA,13,46800000,123100000,99948,38.0,1,laki-laki,Menikah,45
1,sms or WA,10,52400000,58300000,72633,90.0,1,perempuan,Cerai mati,32
2,sms or WA,13,21300000,127900000,79992,17.0,2,laki-laki,Cerai hidup,62
3,sms or WA,12,47600000,78400000,83538,61.0,1,perempuan,Cerai mati,55
4,sms or WA,19,16700000,31600000,133245,53.0,0,perempuan,Menikah,53


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["bill_amount_collected", "total_actual", "total_cost"]] = scaler.transform(df[["bill_amount_collected", "total_actual", "total_cost"]])
df2[["debtor_volume_handled", "collector_age", "success_rate"]] = df[["debtor_volume_handled", "collector_age", "success_rate"]]
y = df["time_to_collect"]

df2.head(5)

Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,total_cost,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.25,-0.26,-0.34,13,45,38.0
1,2,1,2,-0.22,-0.4,-0.36,10,32,90.0
2,2,0,1,-0.35,-0.25,-0.35,13,62,17.0
3,2,1,2,-0.24,-0.36,-0.35,12,55,61.0
4,2,1,3,-0.37,-0.46,-0.31,19,53,53.0


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  5%|▍         | 2/42 [00:00<00:09,  4.35it/s]

100%|██████████| 42/42 [00:10<00:00,  3.92it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 973
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 9
[LightGBM] [Info] Start training from score 8.058750





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.93,0.93,1.24,0.24
RandomForestRegressor,0.92,0.92,1.29,1.98
GradientBoostingRegressor,0.92,0.92,1.32,0.7
KNeighborsRegressor,0.92,0.92,1.33,0.03
ExtraTreesRegressor,0.91,0.92,1.33,2.02
BaggingRegressor,0.91,0.91,1.38,0.22
HistGradientBoostingRegressor,0.91,0.91,1.4,1.62
LGBMRegressor,0.9,0.91,1.41,0.22
XGBRegressor,0.89,0.9,1.51,0.28
NuSVR,0.87,0.88,1.65,0.1


In [8]:
def evaluate_performa(X_test, y_test, model1, model2, model3, model4, model5, model6):
    rate = [0, 0, 0, 0, 0, 0]
    #difference = [0, 0, 0, 0, 0]

    X_test = X_test.reset_index(drop=True)
    for i, _ in X_test.iterrows():
        X = np.array(X_test.iloc[[i]])
        y = np.array(y_test.iloc[[i]])
        difference = [0, 0, 0, 0, 0, 0]
        difference[0] = abs(model1.predict(X) - y[0])
        difference[1] = abs(model2.predict(X) - y[0])
        difference[2] = abs(model3.predict(X) - y[0])
        difference[3] = abs(model4.predict(X) - y[0])
        difference[4] = abs(model5.predict(X) - y[0])
        difference[5] = abs(model6.predict(X) - y[0])
        print(difference)

        a = int(np.argmin(difference))
        print(a)
        for i in range(len(difference)):
            if difference[a] == difference[i]:
                rate[i] += 1
    
    a = f"Best model : {rate.index(max(rate))+1}"
    return a, rate

In [9]:

forest = RandomForestRegressor()
gradient = GradientBoostingRegressor()
ada = AdaBoostRegressor()
extra = ExtraTreesRegressor()
knn = KNeighborsRegressor()
bagging = BaggingRegressor()

forest.fit(X_train, y_train)
gradient.fit(X_train, y_train)
ada.fit(X_train, y_train)
extra.fit(X_train, y_train)
knn.fit(X_train, y_train)
bagging.fit(X_train, y_train)

a = evaluate_performa(X_test, y_test, forest, gradient, ada, extra, knn, bagging)
a

[array([0.92]), array([1.20332933]), array([0.96363636]), array([0.84]), array([1.6]), array([0.8])]
5
[array([1.38]), array([1.55857786]), array([1.75698324]), array([1.42]), array([0.6]), array([1.3])]
4
[array([0.77]), array([0.82140316]), array([0.44479495]), array([1.1]), array([1.8]), array([0.8])]
2
[array([0.3]), array([0.64458111]), array([0.75698324]), array([0.41]), array([1.2]), array([0.2])]
5
[array([0.89]), array([0.79684468]), array([1.38823529]), array([1.31]), array([0.8]), array([1.1])]
1
[array([0.22]), array([0.56096429]), array([0.44479495]), array([0.31]), array([2.]), array([0.1])]
5
[array([0.36]), array([0.32333902]), array([0.24301676]), array([0.45]), array([0.4]), array([0.5])]
2
[array([1.71]), array([1.63169073]), array([1.4491018]), array([1.77]), array([0.8]), array([1.9])]
4
[array([0.59]), array([0.47003571]), array([0.55520505]), array([0.7]), array([0.8]), array([0.7])]
1
[array([3.36]), array([2.59404616]), array([2.67857143]), array([3.3]), array(

('Best model : 5', [14, 29, 43, 24, 54, 42])