In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [2]:
df = pd.read_csv(r"C:\Users\DHONI HANIF\OneDrive\Documents\AI_Collection_and_Loss_Reverse_Forecast\modelling\kolektor\regresi_for_total_cost\data\data2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                1000 non-null   int64  
 1   avg_bill_methods          1000 non-null   object 
 2   debtor_volume_handled     1000 non-null   int64  
 3   bill_amount_collected     1000 non-null   int64  
 4   total_actual              1000 non-null   int64  
 5   total_cost                1000 non-null   int64  
 6   success_rate              1000 non-null   float64
 7   time_to_collect           1000 non-null   int64  
 8   collector_gender          1000 non-null   object 
 9   collector_marital_status  1000 non-null   object 
 10  collector_age             1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


In [3]:
for i in df.columns:
    if df[i].dtype == "object":
        df[i] = df[i].astype("category")

df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,avg_bill_methods,debtor_volume_handled,bill_amount_collected,total_actual,total_cost,success_rate,time_to_collect,collector_gender,collector_marital_status,collector_age
0,sms or WA,13,46800000,123100000,99948,38.0,1,laki-laki,Menikah,45
1,sms or WA,10,52400000,58300000,72633,90.0,1,perempuan,Cerai mati,32
2,sms or WA,13,21300000,127900000,79992,17.0,2,laki-laki,Cerai hidup,62
3,sms or WA,12,47600000,78400000,83538,61.0,1,perempuan,Cerai mati,55
4,sms or WA,19,16700000,31600000,133245,53.0,0,perempuan,Menikah,53


In [5]:
df["avg_bill_methods"].value_counts()

avg_bill_methods
sms or WA           300
panggilan           250
surat panggilan     250
datang ke tempat    200
Name: count, dtype: int64

In [6]:
bill_methods = LabelEncoder().fit(df["avg_bill_methods"].to_numpy().reshape(-1, 1))
gender = LabelEncoder().fit(df["collector_gender"].to_numpy().reshape(-1, 1))
mart = LabelEncoder().fit(df["collector_marital_status"].to_numpy().reshape(-1, 1))
df2 = pd.DataFrame()

df2["avg_bill_methods"] = bill_methods.transform(df["avg_bill_methods"])
df2["collector_gender"] = gender.transform(df["collector_gender"])
df2["collector_marital_status"] = mart.transform(df["collector_marital_status"])
           
scaler = RobustScaler().fit(df[["bill_amount_collected", "total_actual", "total_cost", "debtor_volume_handled", "collector_age", "success_rate"]])
df2[["bill_amount_collected", "total_actual", "total_cost", "debtor_volume_handled", "collector_age", "success_rate"]] = scaler.transform(df[["bill_amount_collected", "total_actual", "total_cost", "debtor_volume_handled", "collector_age", "success_rate"]])
y = df["time_to_collect"]

df2.head(5)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,avg_bill_methods,collector_gender,collector_marital_status,bill_amount_collected,total_actual,total_cost,debtor_volume_handled,collector_age,success_rate
0,2,0,3,-0.245857,-0.259239,-0.337794,-0.761905,0.095238,-0.465116
1,2,1,2,-0.223081,-0.402633,-0.357597,-0.87619,-0.52381,0.744186
2,2,0,1,-0.349568,-0.248617,-0.352262,-0.761905,0.904762,-0.953488
3,2,1,2,-0.242603,-0.358154,-0.349691,-0.8,0.571429,0.069767
4,2,1,3,-0.368277,-0.461717,-0.313653,-0.533333,0.47619,-0.116279


In [7]:
X = df2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
model = KNeighborsRegressor()
model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 1.84
      Mean Absolute Error : 1.16
      Mean Absolute Percentage Error : 580964351930794.25
      Root Mean Squared Error : 580964351930794.25
      R_Squared : 0.92
      


In [9]:
param_grid = {
    "n_neighbors": np.arange(1, 41),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "metric": ["cityblock", "cosine", "euclidean", "haversine", "l1", "l2", "manhattan", "minkowski"]
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 759, in score
    y_pred = self.predict(X)
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\neighbors\_regression.py", line 237, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\neighbors\_base.py", line 822, in kneighbors
    results = ArgKmin.compute(
  File "c:\Users\DHONI HANIF\AppData\Local\Programs\Python\Python39\

In [10]:
model = grid_search.best_estimator_
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 1.64
      Mean Absolute Error : 1.11
      Mean Absolute Percentage Error : 637259347272925.38
      Root Mean Squared Error : 637259347272925.38
      R_Squared : 0.92
      


In [11]:
import pickle

pickle.dump(model, open("knn.pkl", "wb"))

In [12]:
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor

model2 = BaggingRegressor()
model2.fit(X_train, y_train)

In [13]:
y_pred = model2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 1.77
      Mean Absolute Error : 1.14
      Mean Absolute Percentage Error : 576460752303423.62
      Root Mean Squared Error : 576460752303423.62
      R_Squared : 0.92
      


In [14]:
import pickle

pickle.dump(model, open("bagging.pkl", "wb"))

In [15]:
# Inisialisasi model Bagging Regressor
bagging_regressor = BaggingRegressor()

# Menentukan grid hyperparameter yang akan dijelajahi
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}

# Inisiasi GridSearchCV
grid_search = GridSearchCV(estimator=bagging_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Melakukan pencarian hyperparameter terbaik
grid_search.fit(X_train, y_train)

# Menampilkan hyperparameter terbaik
best_params = grid_search.best_params_
print("Hyperparameter terbaik:", best_params)

# Menampilkan skor terbaik
best_score = -grid_search.best_score_
print("Skor terbaik:", best_score)

Hyperparameter terbaik: {'max_features': 0.9, 'max_samples': 0.5, 'n_estimators': 100}
Skor terbaik: 1.6174362500000001


In [16]:
model = grid_search.best_estimator_
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 1.64
      Mean Absolute Error : 1.14
      Mean Absolute Percentage Error : 608661489639122.75
      Root Mean Squared Error : 608661489639122.75
      R_Squared : 0.92
      


In [17]:
import pickle

pickle.dump(model, open("bagging2.pkl", "wb"))

In [18]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = mse ** (1/2)
r_square = r2_score(y_test, y_pred)

print(f"""
      Mean Squared Errror : {mse:.2f}
      Mean Absolute Error : {mae:.2f}
      Mean Absolute Percentage Error : {mape:.2f}
      Root Mean Squared Error : {rmse:.2f}
      R_Squared : {r_square:.2f}
      """)


      Mean Squared Errror : 1.61
      Mean Absolute Error : 1.11
      Mean Absolute Percentage Error : 603727009172774.38
      Root Mean Squared Error : 603727009172774.38
      R_Squared : 0.93
      


In [22]:
import joblib
load_model = joblib.load('knn.pkl')

input_data = [x for x in X_test.iloc[-1]]
final_features = [np.array(input_data)]
result = load_model.predict(final_features)[0]
print(result, y_test.iloc[-1])

2.1 0




In [23]:
load_model = joblib.load('bagging2.pkl')

input_data = [x for x in X_test.iloc[-1]]
final_features = [np.array(input_data)]
result = load_model.predict(final_features)[0]
print(result, y_test.iloc[-1])

1.83 0


