In [1]:
import numpy as np
import pandas as pd
import joblib, pickle, json

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline  
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\data\kolektor.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   collector_name            1000 non-null   object 
 2   collector_address         1000 non-null   object 
 3   collector_number          1000 non-null   object 
 4   collector_nik             1000 non-null   int64  
 5   time_to_collect           1000 non-null   int64  
 6   avg_bill_methods          1000 non-null   object 
 7   debtor_volume_handled     1000 non-null   int64  
 8   bill_amount_collected     1000 non-null   int64  
 9   total_actual              1000 non-null   int64  
 10  total_cost                1000 non-null   int64  
 11  success_rate              1000 non-null   float64
 12  collector_zip             1000 non-null   int64  
 13  collector_rt              1000 non-null   int64  
 14  collector

In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [5]:
# Preprocessing untuk fitur kategori
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Preprocessing untuk fitur numerik
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
numeric_features = ['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'total_cost', 'success_rate', 'collector_age']
categorical_features = ['avg_bill_methods', 'collector_gender', 'collector_marital_status']
drop_features = ["collector_name", "collector_number", "collector_nik", "collector_zip",
                 "collector_rt", "collector_rw", "collector_birth_place", "collector_address"]

# Gabungkan transformer dengan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('drop', 'drop', drop_features)
    ],
    remainder="passthrough"
)

In [6]:
print(numeric_features)
print(categorical_features)

['debtor_volume_handled', 'bill_amount_collected', 'total_actual', 'total_cost', 'success_rate', 'collector_age']
['avg_bill_methods', 'collector_gender', 'collector_marital_status']


In [7]:
X = df.drop("time_to_collect", axis=1)
y = df["time_to_collect"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)


In [8]:
X_train

Unnamed: 0,collector_name,collector_address,collector_number,collector_nik,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,collector_zip,collector_rt,collector_rw,collector_birth_place,collector_age,collector_marital_status,collector_gender
29,Kelsey Brown,"Jl. Pasteur No. 2\nTegal, Kalimantan Selatan 4...",+62 (030) 759-5619,9474549916298063,sms or WA,3,1800000,2300000,39860,78.0,44096,3,13,"Nusa Tenggara Timur, 01-06-1989",34,Cerai hidup,perempuan
535,Joseph Gonzales,"Jl. Pasir Koja No. 26\nKupang, GO 34992",+62 (0156) 345 4482,3231701743669595,surat panggilan,28,47500000,279000000,577092,17.0,34992,4,14,"Bali, 19-08-1964",59,Cerai hidup,perempuan
695,Kimberly Ballard,"Gg. Rumah Sakit No. 317\nPalu, DKI Jakarta 73649",+62 (0657) 914 0637,8553379858640366,panggilan,34,141000000,259000000,1184672,54.0,73649,10,16,"Kepulauan Bangka Belitung, 10-09-1992",31,Cerai hidup,perempuan
557,Patricia Willis,"Jl. Siliwangi No. 041\nBatu, Kalimantan Timur ...",+62 (912) 705-6456,5587069543984402,panggilan,46,118000000,140000000,1615002,84.0,91666,3,2,"Kepulauan Bangka Belitung, 25-09-1976",47,Cerai hidup,perempuan
836,Glenn Hernandez,"Gang Cempaka No. 74\nSorong, KT 47844",+62 (631) 490-3310,1575899344447531,datang ke tempat,57,1196000000,1200000000,2637901,100.0,47844,3,11,"Kalimantan Selatan, 23-03-1973",50,Menikah,laki-laki
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Nicholas Brooks,"Jl. PHH. Mustofa No. 348\nPariaman, YO 31724",+62-059-019-2470,4838850815445206,sms or WA,11,8500000,12900000,92193,66.0,31724,20,16,"Jambi, 30-12-1967",56,Menikah,perempuan
270,Ashley Phelps,"Jl. Surapati No. 4\nMalang, Kalimantan Utara 2...",+62 (09) 896-4215,8796843768957851,sms or WA,15,13000000,27700000,84618,47.0,28367,14,6,"Gorontalo, 01-12-1988",35,Belum menikah,laki-laki
860,Anthony Bell,"Gang PHH. Mustofa No. 491\nSingkawang, Maluku ...",+62-0881-593-4695,5706057955006069,datang ke tempat,41,424000000,928000000,1731146,46.0,6928,1,4,"Lampung, 17-08-1993",30,Cerai hidup,laki-laki
435,Sandra Parker,"Jalan Ciumbuleuit No. 232\nPasuruan, Kalimanta...",+62-74-905-4748,9065626423514771,surat panggilan,20,94000000,228500000,404417,41.0,61049,10,13,"Kalimantan Utara, 01-07-1972",51,Cerai hidup,laki-laki


In [9]:
knn = joblib.load("knn.pkl")
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn)
])

model.fit(X_train, y_train)


In [10]:
model.predict(X_test)[0]

7.266666666666667

In [11]:
y_test.iloc[0] 

9

In [12]:
joblib.dump(model, "time_to_collect.joblib")

['time_to_collect.joblib']

In [13]:
load_model = joblib.load("time_to_collect.joblib")


In [14]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 5.33
      Mean Absolute Error : 1.64
      Mean Absolute Percentage Error : 268714777766439.84
      Root Mean Squared Error : 2.31
      R_Squared : 0.92
      


In [15]:
knn = joblib.load("bagging2.pkl")
model2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn)
])

model2.fit(X_train, y_train)

In [16]:
y_pred = model2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 5.15
      Mean Absolute Error : 1.67
      Mean Absolute Percentage Error : 243644739840744.09
      Root Mean Squared Error : 2.27
      R_Squared : 0.92
      


In [17]:
y_pred1 = model.predict(X_test)
y_pred2 = model2.predict(X_test)
print(y_pred1[0], y_pred2[0], y_test.iloc[0])

7.266666666666667 7.28 9


In [18]:
joblib.dump(model2, "time_to_collect.joblib")

['time_to_collect.joblib']