<a href="https://colab.research.google.com/github/dionmarshalll/dataquest-terserahmulah/blob/main/dataquest_terserahmu_lah.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Importing the necessary libraries

In [None]:
# Data-handling and plotting libraries used by the rest of the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Render matplotlib figures inline in the cell output.
%matplotlib inline

# NOTE(review): this silences every warning for the whole session, including
# deprecation and library warnings that can point at real problems (for example
# misspelled or unused estimator parameters). Consider narrowing the filter.
from warnings import filterwarnings
filterwarnings('ignore')

#### Downloading the dataset

In [None]:
!pip install gdown
!gdown 1xQ5tDWTNGiwfKLH8C6iwkqf-hyk3dbt-
!gdown 1-823H1idpXG_tc7KPPyi8nUrQ588pah7

Downloading...
From: https://drive.google.com/uc?id=1xQ5tDWTNGiwfKLH8C6iwkqf-hyk3dbt-
To: /content/training_dataset.csv
100% 2.94M/2.94M [00:00<00:00, 73.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-823H1idpXG_tc7KPPyi8nUrQ588pah7
To: /content/validation_set.csv
100% 726k/726k [00:00<00:00, 94.9MB/s]


In [None]:
# Load the labelled training file and the unlabelled validation file, in that order.
df_train, df_validation = (
    pd.read_csv(csv_name)
    for csv_name in ('training_dataset.csv', 'validation_set.csv')
)

In [None]:
# Keep the validation customer ids aside: the column is dropped from df_validation
# during preprocessing but is needed again to assemble the submission.
cust_number_validation = df_validation.loc[:, 'customer_number']

#### A brief look at the dataset

In [None]:
# First five training rows, followed by the (rows, columns) shape.
display(df_train.head(n=5))
print(df_train.shape)

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,hari_kontak_terakhir,jumlah_kontak_kampanye_ini,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau,berlangganan_deposito
0,531036,63,sosial media specialis,menikah,Pendidikan Tinggi,no,yes,no,cellular,jul,fri,2,999,0,nonexistent,-1.7,94.215,-40.3,0.885,4991.6,Papua,1
1,999241,43,teknisi,menikah,Pendidikan Tinggi,no,yes,no,cellular,nov,fri,2,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,Sulawesi,0
2,995002,29,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,yes,cellular,jul,thu,1,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,Papua,0
3,932750,40,pekerja kasar,menikah,SMA,no,no,no,telephone,may,wed,2,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,Sumatera,1
4,684699,40,sosial media specialis,lajang,Pendidikan Tinggi,no,no,no,cellular,aug,wed,3,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,Bali,0


(22916, 22)


In [None]:
# First five validation rows, followed by the (rows, columns) shape.
display(df_validation.head(n=5))
print(df_validation.shape)

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,hari_kontak_terakhir,jumlah_kontak_kampanye_ini,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau
0,445420,35,penyedia jasa,menikah,SMA,no,yes,yes,cellular,jul,mon,1,999,0,nonexistent,1.4,93.918,-42.7,4.96,5228.1,Jawa
1,585604,52,teknisi,lajang,Diploma,unknown,no,no,telephone,may,thu,4,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,Papua
2,888824,37,pekerja kasar,menikah,SMP,unknown,yes,no,telephone,may,wed,3,999,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,Bali
3,816820,51,pengangguran,menikah,Diploma,no,no,no,telephone,may,tue,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,Sumatera
4,542716,45,teknisi,cerai,SMA,no,yes,no,cellular,may,thu,1,999,1,failure,-1.8,92.893,-46.2,1.327,5099.1,Sumatera


(5729, 21)


In [None]:
# Encode the last-contact month as its calendar number (jan -> 1, ..., dec -> 12).
month_abbreviations = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]
months = {abbr: number for number, abbr in enumerate(month_abbreviations, start=1)}

# The same column-scoped replacement is applied to both frames.
month_replacement = {'bulan_kontak_terakhir': months}
df_train = df_train.replace(month_replacement)
df_validation = df_validation.replace(month_replacement)

#### Preprocessing

In [None]:
# Flag every consumer-price-index value by the majority target class it has in the
# TRAINING data: 1 when subscribers (target 1) outnumber non-subscribers (target 0)
# among rows with that CPI value, otherwise 0. The same lookup is applied to the
# validation frame.
#
# Compared with the earlier per-value loop this version
#   * does not raise KeyError when a CPI value has rows of only one class,
#   * resolves ties to 0 instead of leaving NaN,
#   * maps CPI values that appear only in validation to 0 instead of NaN
#     (NaN made the final astype(int) raise).
cpi_col = 'indeks_harga_konsumen'
target_col = 'berlangganan_deposito'
flag_col = 'indeks_harga_konsumen_x_berlangganan_deposito_count'

# Round once so float noise cannot split what is meant to be the same CPI value.
cpi_train = df_train[cpi_col].round(3)
cpi_validation = df_validation[cpi_col].round(3)

# Rows: distinct CPI values; columns: number of training rows per target class.
# reindex guarantees both class columns exist even if one class is absent.
class_counts = pd.crosstab(cpi_train, df_train[target_col]).reindex(columns=[0, 1], fill_value=0)
majority_flag = (class_counts[1] > class_counts[0]).astype(int)

df_train[flag_col] = cpi_train.map(majority_flag).fillna(0).astype(int)
df_validation[flag_col] = cpi_validation.map(majority_flag).fillna(0).astype(int)
df_train[flag_col].value_counts()

Unnamed: 0_level_0,count
indeks_harga_konsumen_x_berlangganan_deposito_count,Unnamed: 1_level_1
0,22220
1,696


##### Menggunakan fitur hari_sejak_kontak_sebelumnya untuk mencari perbandingan antara indeks_harga_konsumen dengan target berlangganan_deposito


In [None]:
# Mean consumer price index for every (days since previous contact, target) pair.
# df_viz_ leaves out rows whose value is 999 (presumably a "no previous contact"
# placeholder - confirm); df_viz uses all rows.
feat_to_compare = 'hari_sejak_kontak_sebelumnya'
group_keys = [feat_to_compare, 'berlangganan_deposito']
not_999 = df_train[feat_to_compare] != 999

df_viz_ = (
    df_train.loc[not_999]
    .groupby(by=group_keys)[['indeks_harga_konsumen']]
    .mean()
    .reset_index()
)
df_viz = (
    df_train
    .groupby(by=group_keys)[['indeks_harga_konsumen']]
    .mean()
    .reset_index()
)
# df_viz_.head(10)

In [None]:
# Build a binary feature that relates berlangganan_deposito to indeks_harga_konsumen
# through hari_sejak_kontak_sebelumnya. For every distinct day value in the training
# data, compare the mean consumer price index of the two target classes:
#   * only target-1 rows exist for that day   -> 1
#   * only target-0 rows exist for that day   -> 0
#   * both exist                              -> 1 if mean(target 1) > mean(target 0), else 0
# Ties, and day values that occur only in the validation frame, fall back to 0.
# The earlier loop left those rows as NaN, which made the final astype(int) raise.
group_col = 'hari_sejak_kontak_sebelumnya'
flag_col = 'ind_bd_x_hri_kontak_sbelum'

# Recomputed from df_train so the cell does not depend on which df_viz was built last.
# Rows: day values; columns: mean CPI for target 0 and target 1 (NaN when a class is absent).
mean_cpi = (
    df_train.groupby([group_col, 'berlangganan_deposito'])['indeks_harga_konsumen']
    .mean()
    .unstack('berlangganan_deposito')
    .reindex(columns=[0, 1])
)
# A missing target-1 mean compares as False, which yields the required 0.
flag_by_group = pd.Series(
    np.where(mean_cpi[0].isna(), 1, (mean_cpi[1] > mean_cpi[0]).astype(int)),
    index=mean_cpi.index,
)

df_train[flag_col] = df_train[group_col].map(flag_by_group).fillna(0).astype(int)
df_validation[flag_col] = df_validation[group_col].map(flag_by_group).fillna(0).astype(int)
df_train[flag_col].value_counts()

Unnamed: 0_level_0,count
ind_bd_x_hri_kontak_sbelum,Unnamed: 1_level_1
0,22137
1,779


##### Menggunakan fitur bulan_kontak_terakhir untuk mencari perbandingan antara indeks_harga_konsumen dengan target berlangganan_deposito

In [None]:
# Mean consumer price index for every (last-contact month, target) pair,
# with the group keys turned back into ordinary columns.
feat_to_compare = 'bulan_kontak_terakhir'
df_viz = (
    df_train
    .groupby(by=[feat_to_compare, 'berlangganan_deposito'])[['indeks_harga_konsumen']]
    .mean()
    .reset_index()
)
# df_viz.head(10)

In [None]:
# Build a binary feature that relates berlangganan_deposito to indeks_harga_konsumen
# through bulan_kontak_terakhir. For every distinct month in the training data,
# compare the mean consumer price index of the two target classes:
#   * only target-1 rows exist for that month -> 1
#   * only target-0 rows exist for that month -> 0
#   * both exist                              -> 1 if mean(target 1) > mean(target 0), else 0
#
# Fix: the two "only one class present" branches used to write into
# 'ind_bd_x_hri_kontak_sbelum' (the days-since-contact feature) instead of this
# feature's own column, overwriting the other feature and leaving NaN here.
# Ties, and months that occur only in the validation frame, now fall back to 0
# instead of NaN, which made the final astype(int) raise.
group_col = 'bulan_kontak_terakhir'
flag_col = 'ind_bd_x_bulan_kontak_trakhir'

# Recomputed from df_train so the cell does not depend on which df_viz was built last.
# Rows: months; columns: mean CPI for target 0 and target 1 (NaN when a class is absent).
mean_cpi = (
    df_train.groupby([group_col, 'berlangganan_deposito'])['indeks_harga_konsumen']
    .mean()
    .unstack('berlangganan_deposito')
    .reindex(columns=[0, 1])
)
# A missing target-1 mean compares as False, which yields the required 0.
flag_by_group = pd.Series(
    np.where(mean_cpi[0].isna(), 1, (mean_cpi[1] > mean_cpi[0]).astype(int)),
    index=mean_cpi.index,
)

df_train[flag_col] = df_train[group_col].map(flag_by_group).fillna(0).astype(int)
df_validation[flag_col] = df_validation[group_col].map(flag_by_group).fillna(0).astype(int)
df_train[flag_col].value_counts()

Unnamed: 0_level_0,count
ind_bd_x_bulan_kontak_trakhir,Unnamed: 1_level_1
0,20388
1,2528


##### Additional preprocessing

In [None]:
# Remove 'gagal_bayar_sebelumnya' from both frames; it is not used as a feature.
df_train = df_train.drop(columns=['gagal_bayar_sebelumnya'])
df_validation = df_validation.drop(columns=['gagal_bayar_sebelumnya'])

In [None]:
# 1 where the previous campaign outcome is exactly 'nonexistent', otherwise 0.
df_train['hasil_kampanaye_sebelumnya_nonexistent'] = (
    df_train['hasil_kampanye_sebelumnya'].eq('nonexistent').astype('int64')
)
df_validation['hasil_kampanaye_sebelumnya_nonexistent'] = (
    df_validation['hasil_kampanye_sebelumnya'].eq('nonexistent').astype('int64')
)

In [None]:
# Ordinal code for each education level; 'unknown' is kept apart as -1.
# NOTE(review): 'Diploma' is ranked above 'Pendidikan Tinggi' - confirm this order is intended.
pendidikan_le = {
    'TIDAK SEKOLAH': 0,
    'Tidak Tamat SD': 1,
    'SD': 2,
    'SMP': 3,
    'SMA': 4,
    'Pendidikan Tinggi': 5,
    'Diploma': 6,
    'unknown': -1
}

# Weekday of the last contact, Monday = 1 ... Friday = 5.
hari_kontak_terakhir_le = {
    'mon': 1,
    'tue': 2,
    'wed': 3,
    'thu': 4,
    'fri': 5
}

# Outcome of the previous campaign.
# Fix: the only key for a successful outcome used to be the misspelling 'successs',
# so the value 'success' was never encoded. The misspelled key is kept as well in
# case the raw data carries the same spelling - TODO confirm against the data and drop it.
hasil_kampanye_sebelumnya_le = {
    'nonexistent': -1,
    'failure': 0,
    'success': 1,
    'successs': 1
}

# Column-scoped replacements, applied identically to both frames.
ordinal_encodings = {
    'pendidikan': pendidikan_le,
    'hari_kontak_terakhir': hari_kontak_terakhir_le,
    'hasil_kampanye_sebelumnya': hasil_kampanye_sebelumnya_le,
}
df_train = df_train.replace(ordinal_encodings)
df_validation = df_validation.replace(ordinal_encodings)

In [None]:
# The previous-campaign outcome and the customer id are not used as features.
unused_columns = ['hasil_kampanye_sebelumnya', 'customer_number']
df_train = df_train.drop(columns=unused_columns)
df_validation = df_validation.drop(columns=unused_columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Nominal columns that are replaced by one indicator column per category.
ohe_columns = ['pekerjaan', 'jenis_kontak', 'pulau', 'status_perkawinan', 'pinjaman_rumah', 'pinjaman_pribadi']

# The encoder learns its categories from the training frame only.
# handle_unknown='ignore' encodes a category that appears only in the validation
# frame as all zeros; with the default setting transform() raises on such a value.
ohe = OneHotEncoder(handle_unknown='ignore').fit(df_train[ohe_columns])
ohe_feature_names = ohe.get_feature_names_out(ohe_columns)

# Reuse each frame's own index so the column-wise concat below lines rows up
# explicitly instead of relying on both sides having a default index.
encoded_df_train = pd.DataFrame(
    ohe.transform(df_train[ohe_columns]).toarray(),
    columns=ohe_feature_names,
    index=df_train.index,
)
encoded_df_validation = pd.DataFrame(
    ohe.transform(df_validation[ohe_columns]).toarray(),
    columns=ohe_feature_names,
    index=df_validation.index,
)

# Swap the raw nominal columns for their encoded counterparts.
df_train = df_train.drop(ohe_columns, axis=1)
df_validation = df_validation.drop(ohe_columns, axis=1)

df_train = pd.concat([df_train, encoded_df_train], axis=1)
df_validation = pd.concat([df_validation, encoded_df_validation], axis=1)

In [None]:
# Balance the two target classes at 10000 rows each.
# NOTE(review): resampling happens before the later train/test split, so rows
# duplicated by the oversampling can end up on both sides of that split.
from sklearn.utils import resample

is_subscriber = df_train['berlangganan_deposito'] == 1
is_non_subscriber = df_train['berlangganan_deposito'] == 0
df_class1 = df_train[is_subscriber]
df_class0 = df_train[is_non_subscriber]

# Class 0: draw 10000 rows without replacement.
df_class0_downsampled = df_class0.sample(n=10000, random_state=42)

# Class 1: draw 10000 rows with replacement.
df_class1_oversampled = resample(
    df_class1,
    replace=True,
    n_samples=10000,
    random_state=42,
)

# Stack both classes, shuffle the rows and renumber the index from 0.
df_balanced = (
    pd.concat([df_class0_downsampled, df_class1_oversampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# From here on df_train is an independent copy of the balanced frame.
df_train = df_balanced.copy(deep=True)

#### Initial modelling

In [None]:
# Separate the feature matrix from the target column.
X = df_train.drop(columns=['berlangganan_deposito'])
y = df_train.loc[:, 'berlangganan_deposito']

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 15 features with the highest univariate F-score against the target.
# The target is a 0/1 class label, so the classification scorer (f_classif, the
# ANOVA F-test) is used; f_regression is intended for continuous targets.
# NOTE(review): the selector is fitted on the full resampled frame before the
# train/test split, so the hold-out rows influence which features are kept.
selector = SelectKBest(score_func=f_classif, k=15)
selector.fit(X, y)

# get_support() is a boolean mask over X's columns; keep only the selected ones.
selected_features = X.columns[selector.get_support()].tolist()
X = X[selected_features]

print("Selected features:", selected_features)

Selected features: ['jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya', 'jumlah_kontak_sebelumnya', 'tingkat_variasi_pekerjaan', 'indeks_harga_konsumen', 'suku_bunga_euribor_3bln', 'jumlah_pekerja', 'indeks_harga_konsumen_x_berlangganan_deposito_count', 'ind_bd_x_hri_kontak_sbelum', 'ind_bd_x_bulan_kontak_trakhir', 'hasil_kampanaye_sebelumnya_nonexistent', 'pekerjaan_pekerja kasar', 'pekerjaan_pensiunan', 'jenis_kontak_cellular', 'jenis_kontak_telephone']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# scaling the dataset
from sklearn.preprocessing import MinMaxScaler

# Fit ONE scaler on the training split and reuse it everywhere. Fitting a separate
# scaler per frame (as before) maps the same raw value to different scaled values
# in train, test and validation, so the model is evaluated and applied on inputs
# that are not comparable to what it was trained on.
scaler = MinMaxScaler().fit(X_train)

# Keep the original row labels so the frames stay aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# The scaler only knows the selected features, so only those validation columns
# are scaled; they are the only ones the model reads at prediction time.
df_validation = df_validation.copy()
df_validation[selected_features] = scaler.transform(df_validation[selected_features])

In [None]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

def get_raw_models(random_state=42):
  """Return fresh, unfitted baseline classifiers with default hyperparameters.

  Args:
    random_state: Seed handed to the estimators that accept one here
      (XGBClassifier and RandomForestClassifier) so that repeated runs of the
      notebook produce the same baseline scores. Defaults to 42.

  Returns:
    list: [XGBClassifier, LogisticRegression, MultinomialNB,
    KNeighborsClassifier, RandomForestClassifier], in that order.
  """
  return [
      XGBClassifier(random_state=random_state),
      LogisticRegression(),
      MultinomialNB(),
      KNeighborsClassifier(),
      RandomForestClassifier(random_state=random_state),
  ]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

# Fit every baseline on the training split and print its hold-out metrics.
# The loop variable keeps the name `model` because a later cell reads it.
score_functions = (
    ('Precision score', precision_score),
    ('Recall score', recall_score),
    ('F1-score', f1_score),
    ('Accuracy score', accuracy_score),
)

models = get_raw_models()
for model in models:
  model.fit(X_train, y_train)
  preds = model.predict(X_test)

  print(f'From model: {model.__class__.__name__}')
  for label, score in score_functions:
    print(f'{label}: {score(y_test, preds)}')
  print(classification_report(y_test, preds))
  print('\n=======================================\n')

From model: XGBClassifier
Precision score: 0.7811059907834101
Recall score: 0.6918367346938775
F1-score: 0.7337662337662337
Accuracy score: 0.754
              precision    recall  f1-score   support

           0       0.73      0.81      0.77      3060
           1       0.78      0.69      0.73      2940

    accuracy                           0.75      6000
   macro avg       0.76      0.75      0.75      6000
weighted avg       0.76      0.75      0.75      6000



From model: LogisticRegression
Precision score: 0.7433174661296228
Recall score: 0.6904761904761905
F1-score: 0.7159231176159407
Accuracy score: 0.7315
              precision    recall  f1-score   support

           0       0.72      0.77      0.75      3060
           1       0.74      0.69      0.72      2940

    accuracy                           0.73      6000
   macro avg       0.73      0.73      0.73      6000
weighted avg       0.73      0.73      0.73      6000



From model: MultinomialNB
Precision score: 0

We will use XGBoost going forward; the next step is to search for its best hyperparameters.

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# xgb = XGBClassifier(use_label_encoder=False, eval_metric='auc', gamma=0.1, colsample_bytree=0.8, reg_alpha=0.1)

# param_dist = {
#     'n_estimators': [250, 350, 300, 400],
#     'max_depth': [8, 10, 12, 15],
#     'learning_rate': [0.1, 0.2, 0.25],
#     'subsample': [0.6, 0.8, 1.0],
#     'reg_lambda': [1.5, 1.8, 2.0]
# }

# random_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=param_dist,
#     n_iter=30,                    # Number of random combinations to try
#     scoring='roc_auc',            # Use 'accuracy', 'f1', or 'roc_auc' based on your task
#     cv=5,                         # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1                     # Use all available cores
# )

# # 4. Fit to your training data
# random_search.fit(X_train, y_train)

# # 5. Print the best parameters and score
# print("Best Parameters:", random_search.best_params_)
# print("Best ROC AUC Score:", random_search.best_score_)

# # 6. Get best model and predict
# best_model = random_search.best_estimator_
# y_pred_proba = best_model.predict_proba(X_test)[:, 1]

In [None]:
# preds = best_model.predict(X_test)

# print(f'From model: {model.__class__.__name__}')
# print(f'Precision score: {precision_score(y_test, preds)}')
# print(f'Recall score: {recall_score(y_test, preds)}')
# print(f'F1-score: {f1_score(y_test, preds)}')
# print(f'Accuracy score: {accuracy_score(y_test, preds)}')
# print(classification_report(y_test, preds))
# print('\n=======================================\n')

In [None]:
# XGBoost classifier with hand-set hyperparameters (the values fall inside the grid
# of the commented-out random search above - presumably its result; confirm).
# Fix: the column-subsampling parameter was misspelled 'colsanple_bytree', which is
# not an XGBoost parameter, so the intended 0.8 setting was never applied.
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='auc',
    gamma=0.1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    subsample=0.8,
    reg_lambda=1.5,
    n_estimators=400,
    max_depth=15,
    learning_rate=0.25,
)
xgb.fit(X_train, y_train)

In [None]:
# Hold-out metrics for the XGBoost model fitted in the previous cell.
preds = xgb.predict(X_test)

# Fix: the header used to print `model.__class__.__name__`, i.e. the variable left
# over from the baseline loop, so these XGBoost scores were labelled with the
# name of the last baseline model.
print(f'From model: {xgb.__class__.__name__}')
print(f'Precision score: {precision_score(y_test, preds)}')
print(f'Recall score: {recall_score(y_test, preds)}')
print(f'F1-score: {f1_score(y_test, preds)}')
print(f'Accuracy score: {accuracy_score(y_test, preds)}')
print(classification_report(y_test, preds))
print('\n=======================================\n')

From model: RandomForestClassifier
Precision score: 0.757163850110213
Recall score: 0.7010204081632653
F1-score: 0.7280113034263511
Accuracy score: 0.7433333333333333
              precision    recall  f1-score   support

           0       0.73      0.78      0.76      3060
           1       0.76      0.70      0.73      2940

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000





In [None]:
# Class probabilities for the validation rows, using only the selected features.
validation_features = df_validation[selected_features]
subm_pred = xgb.predict_proba(validation_features)

In [None]:
# Sanity check: number of predictions vs. number of validation customer ids.
(subm_pred.shape[0], len(cust_number_validation))

(5729, 5729)

In [None]:
# Second predict_proba column as a one-column frame named after the target.
subm_pred_df = pd.DataFrame({'berlangganan_deposito': subm_pred[:, 1]})
cust_number_validation_df = cust_number_validation.to_frame()

# Put the customer ids and the probabilities side by side, then renumber the rows.
submission = pd.concat(
    [cust_number_validation_df, subm_pred_df],
    axis=1,
).reset_index(drop=True)

In [None]:
# Preview the first five submission rows.
submission.head(n=5)

Unnamed: 0,customer_number,berlangganan_deposito
0,445420,0.151231
1,585604,0.373253
2,888824,0.163091
3,816820,0.181052
4,542716,0.301006


In [None]:
# (rows, columns) of the submission frame.
submission.shape

(5729, 2)

In [None]:
# Write the submission to 'submission.csv' in the working directory, without the row index.
submission.to_csv('submission.csv', index=False)