In [1]:
import numpy as np
import pandas as pd
import joblib, pickle, json

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline  
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\modelling\kolektor\regresi_for_total_cost\data\data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [5]:
# Preprocessing untuk fitur kategori
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Preprocessing untuk fitur numerik
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
numeric_features = ['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'total_cost', 'success_rate', 'collector_age']
categorical_features = ['avg_bill_methods', 'collector_gender', 'collector_marital_status']

# Gabungkan transformer dengan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
)

In [6]:
print(numeric_features)
print(categorical_features)

['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'total_cost', 'success_rate', 'collector_age']
['avg_bill_methods', 'collector_gender', 'collector_marital_status']


In [7]:
X = df.drop("time_to_collect", axis=1)
y = df["time_to_collect"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_inf = X_test.iloc[-1, :]
y_inf = y_test.iloc[-1]
y_inf = np.array(y_inf).reshape(1)
X_test = X_test.iloc[:-1, :]
y_test = y_test.iloc[:-1]

In [8]:
X_train

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,collector_gender,collector_marital_status,collector_age
29,sms or WA,3,1800000,2300000,39860,78.0,perempuan,Cerai hidup,34
535,surat panggilan,28,47500000,279000000,577092,17.0,perempuan,Cerai hidup,59
695,panggilan,34,141000000,259000000,1184672,54.0,perempuan,Cerai hidup,31
557,panggilan,46,118000000,140000000,1615002,84.0,perempuan,Cerai hidup,47
836,datang ke tempat,57,1196000000,1200000000,2637901,100.0,laki-laki,Menikah,50
...,...,...,...,...,...,...,...,...,...
106,sms or WA,11,8500000,12900000,92193,66.0,perempuan,Menikah,56
270,sms or WA,15,13000000,27700000,84618,47.0,laki-laki,Belum menikah,35
860,datang ke tempat,41,424000000,928000000,1731146,46.0,laki-laki,Cerai hidup,30
435,surat panggilan,20,94000000,228500000,404417,41.0,laki-laki,Cerai hidup,51


In [9]:
knn = joblib.load("knn.pkl")
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn)
])

model.fit(X_train, y_train)


In [10]:
model.predict(X_test)[0]

7.266666666666667

In [11]:
y_test.iloc[0] 

9

In [12]:
joblib.dump(model, "time_to_collect.joblib")

['time_to_collect.joblib']

In [13]:
load_model = joblib.load("time_to_collect.joblib")


In [14]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 5.35
      Mean Absolute Error : 1.64
      Mean Absolute Percentage Error : 270065103282854.12
      Root Mean Squared Error : 2.31
      R_Squared : 0.92
      


In [15]:
knn = joblib.load("bagging2.pkl")
model2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn)
])

model2.fit(X_train, y_train)

In [16]:
y_pred = model2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 4.87
      Mean Absolute Error : 1.63
      Mean Absolute Percentage Error : 260031958384357.00
      Root Mean Squared Error : 2.21
      R_Squared : 0.92
      


In [17]:
y_pred1 = model.predict(X_test)
y_pred2 = model2.predict(X_test)
print(y_pred1[0], y_pred2[0], y_test.iloc[0])

7.266666666666667 7.43 9


In [18]:
joblib.dump(model2, "time_to_collect.joblib")

['time_to_collect.joblib']