In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from lazypredict.Supervised import LazyRegressor


In [2]:
# Read the raw CSV (path is relative to the notebook's working directory)
# and print the column / dtype / non-null summary.
csv_path = "./data/new_kolektor_data.csv"
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1000 non-null   int64  
 1   debtor_volume_handled  1000 non-null   float64
 2   bill_amount_collected  1000 non-null   float64
 3   total_actual           1000 non-null   float64
 4   success_rate           1000 non-null   float64
 5   avg_bill_methods       1000 non-null   int64  
 6   total_cost             1000 non-null   int64  
dtypes: float64(4), int64(3)
memory usage: 54.8 KB


In [3]:
# Store free-text (object) columns as pandas categoricals.
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].astype("category")

# "Unnamed: 0" is present in the loaded frame (see the df.info() listing) and
# is not wanted as a feature. errors="ignore" keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Preview the first five rows of the frame.
df.head(5)

Unnamed: 0,debtor_volume_handled,bill_amount_collected,total_actual,success_rate,avg_bill_methods,total_cost
0,7.0,14400000.0,35700000.0,40.0,0,277706
1,35.0,79000000.0,160500000.0,49.0,2,22382611
2,6.0,7100000.0,26700000.0,27.0,0,205389
3,63.0,782000000.0,1395000000.0,56.0,3,482942548
4,35.0,346000000.0,485500000.0,71.0,3,151035603


In [5]:
# Number of rows per value of the avg_bill_methods column.
df.loc[:, "avg_bill_methods"].value_counts()

avg_bill_methods
3    481
0    198
2    162
1    159
Name: count, dtype: int64

In [6]:
# Encode the billing-method labels, split off the target, and robust-scale the
# remaining feature columns.
#
# Produces:
#   y      - the "total_cost" target Series
#   scaler - RobustScaler fitted on the non-encoded feature columns
#   df2    - scaled features plus the (unscaled) encoded columns, indexed like df
bill_methods = {"sms or WA": 0, "surat panggilan": 1, "panggilan": 2, "datang ke tempat": 3}
columns = {"avg_bill_methods": bill_methods}

# Encode only the columns that have a declared label -> code mapping. Walking
# every column of df would raise KeyError on any text column that is missing
# from `columns`. Columns that are already numeric are left untouched.
for col, mapping in columns.items():
    if df[col].dtype == "category" or df[col].dtype == "object":
        df[col] = [mapping[label] for label in df[col]]

# Separate the target. The guard makes the cell safe to re-run: once
# "total_cost" has been removed from df, the previously extracted y is kept
# instead of raising KeyError.
if "total_cost" in df.columns:
    y = df["total_cost"]
    df = df.drop(columns="total_cost")

# Every column without a mapping is scaled; the encoded columns are passed
# through unchanged.
numeric_cols = [col for col in df.columns if col not in columns]

# NOTE(review): the scaler is fitted on all rows, while the train/test split
# happens afterwards, so held-out rows influence the scaling statistics.
# Consider fitting on the training rows only.
scaler = RobustScaler().fit(df[numeric_cols])

# Reuse df's index so the encoded columns assigned below align row-for-row
# even when df does not carry a default RangeIndex.
df2 = pd.DataFrame(scaler.transform(df[numeric_cols]), columns=numeric_cols, index=df.index)
for col in columns:
    df2[col] = df[col]
df2.head(5)

Unnamed: 0,debtor_volume_handled,bill_amount_collected,total_actual,success_rate,avg_bill_methods
0,-1.35,-0.46,-0.52,-0.36,0
1,-0.13,-0.28,-0.34,-0.16,2
2,-1.39,-0.48,-0.53,-0.66,0
3,1.09,1.64,1.39,0.0,3
4,-0.13,0.45,0.11,0.34,3


In [7]:
# Feature matrix: every column of the scaled/encoded frame.
X = df2

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit LazyRegressor's candidate models and show the resulting score table.
# NOTE(review): `predictions` is never used below; with the default
# predictions=False it may simply mirror the score table - confirm against
# the lazypredict documentation.
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  5%|▍         | 2/42 [00:00<00:06,  6.43it/s]

100%|██████████| 42/42 [00:07<00:00,  5.48it/s]

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 679
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 5
[LightGBM] [Info] Start training from score 134930141.780000





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.68,0.69,97184954.07,0.12
PoissonRegressor,0.67,0.68,98871786.47,0.02
GradientBoostingRegressor,0.6,0.61,108825481.03,0.52
RandomForestRegressor,0.6,0.61,108917299.33,1.29
KNeighborsRegressor,0.59,0.6,110029705.51,0.04
BaggingRegressor,0.59,0.6,110093564.67,0.18
LGBMRegressor,0.58,0.59,112149286.42,0.17
HistGradientBoostingRegressor,0.56,0.58,113710946.8,1.17
SGDRegressor,0.56,0.57,114855902.57,0.01
LinearRegression,0.56,0.57,114921995.36,0.02


In [8]:
# Summary statistics of the target (count, mean, std, min, quartiles, max).
y.describe(percentiles=[0.25, 0.5, 0.75])

count        1000.00
mean    139633105.12
std     166600490.67
min         25263.00
25%       1928856.50
50%      62215627.00
75%     247064723.00
max     678041154.00
Name: total_cost, dtype: float64

In [10]:
# Feature matrix without the success_rate column, to compare against the run
# that used every feature.
X = df2.drop(columns="success_rate")

# Same 80/20 split and seed as the all-features run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit LazyRegressor's candidate models and show the resulting score table.
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  2%|▏         | 1/42 [00:00<00:04,  9.66it/s]

100%|██████████| 42/42 [00:06<00:00,  6.40it/s]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 584
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 4
[LightGBM] [Info] Start training from score 134930141.780000





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PoissonRegressor,0.67,0.68,98985045.33,0.02
AdaBoostRegressor,0.67,0.67,99715067.53,0.1
GradientBoostingRegressor,0.62,0.63,106519238.09,0.41
RandomForestRegressor,0.61,0.62,108135738.42,1.19
HistGradientBoostingRegressor,0.6,0.61,108703283.03,0.79
BaggingRegressor,0.6,0.6,109778120.47,0.14
LGBMRegressor,0.59,0.6,110270695.87,0.14
KNeighborsRegressor,0.58,0.59,111747372.72,0.03
SGDRegressor,0.55,0.56,115339482.09,0.12
OrthogonalMatchingPursuitCV,0.55,0.56,115368966.15,0.03
