# Data Preparation lanjutan dari pentaho

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

## Preparation file credit history (take the max Value Overdue dari file credit history dan menghapus yang tidak memiliki pinjaman)

In [None]:
cr = pd.read_csv("/content/drive/MyDrive/finproject_celerates/credit_history.csv")

In [None]:
cr.head()

Unnamed: 0,Id_customer,Overdue
0,5001711,Tidak memiliki pinjaman
1,5001711,0
2,5001711,0
3,5001711,0
4,5001712,0


In [None]:
cr.isnull().sum()

Id_customer    0
Overdue        0
dtype: int64

#### menghapus yang tidak memiliki pinjaman

In [None]:
mask = cr["Overdue"] == "Tidak memiliki pinjaman"

In [None]:
cr = cr[~mask]

In [None]:
cr.Overdue.value_counts()

0    821230
1     10936
5      1677
2       849
3       317
4       222
Name: Overdue, dtype: int64

### mengambil max value dari overdue credit history

In [None]:
cr = cr.groupby("Id_customer").max()

In [None]:
cr.to_csv("credit_history.csv")

In [None]:
#from google.colab import files
#files.download('credit_history.csv')

In [None]:
cr["Overdue"] = cr["Overdue"].astype(np.float64)

In [None]:
cr.Overdue.value_counts()

0.0    35976
1.0     4617
2.0      328
5.0      193
3.0       87
4.0       48
Name: Overdue, dtype: int64

In [None]:
cr["Overdue"] = np.where(cr["Overdue"].isin([1, 2, 3, 4, 5]), 1, 0)

In [None]:
cr.Overdue.value_counts()

0    35976
1     5273
Name: Overdue, dtype: int64

## KNN Imputer for handling missing values in 'Pekerjaan'

In [None]:
df = pd.read_csv("/content/drive/MyDrive/finproject_celerates/data_train.csv")

In [None]:
df.isnull().sum()

Id_customer                 0
JK                          0
KepemilikanMobil            0
KepemilikanProperti         0
JmlAnak                     0
Pendapatan                  0
TipePendapatan              0
TingkatPendidikan           0
StatusKeluarga              0
TipeRumah                   0
FlagMobile                  0
FlagWorkPhone               0
FlagPhone                   0
Email                       0
Pekerjaan              134243
JmlAnggotaKeluarga          0
Age                         0
Experience                  0
dtype: int64

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
obj_df = df.select_dtypes(include=['object'])
colum = obj_df
for i in colum:
  df[i] = df[i].astype('category')

In [None]:
count = -1
convert_original = ['JK','KepemilikanMobil','KepemilikanProperti','TipePendapatan','TingkatPendidikan','StatusKeluarga','TipeRumah','Pekerjaan']
original = []
for i in convert_original:
  c = df[i].astype('category')
  d = dict(enumerate(c.cat.categories))
  original.append(d)
  count += 1
  print(count,i,d)

0 JK {0: 'Laki-laki', 1: 'Perempuan'}
1 KepemilikanMobil {0: 'Tidak', 1: 'Ya'}
2 KepemilikanProperti {0: 'Tidak', 1: 'Ya'}
3 TipePendapatan {0: 'Asosiasi komersial', 1: 'Bekerja', 2: 'Mahasiswa/Murid', 3: 'PNS', 4: 'Pensioner'}
4 TingkatPendidikan {0: 'Graduate', 1: 'Postgraduate', 2: 'Undergraduate'}
5 StatusKeluarga {0: 'Belum Menikah', 1: 'Cerai', 2: 'Menikah'}
6 TipeRumah {0: 'Apartemen Pribadi', 1: 'Apartemen kantor', 2: 'Rumah orang tua', 3: 'Rumah pribadi', 4: 'Sewa Apartemen'}
7 Pekerjaan {0: 'Accountants', 1: 'Cleaning staff', 2: 'Cooking staff', 3: 'Core staff', 4: 'Drivers', 5: 'HR staff', 6: 'High skill tech staff', 7: 'IT staff', 8: 'Laborers', 9: 'Low-skill Laborers', 10: 'Managers', 11: 'Medicine staff', 12: 'Private service staff', 13: 'Realty agents', 14: 'Sales staff', 15: 'Secretaries', 16: 'Security staff', 17: 'Waiters/barmen staff'}


### encoding

In [None]:
df['JK'] = df['JK'].cat.codes
df['KepemilikanMobil'] = df['KepemilikanMobil'].cat.codes
df['KepemilikanProperti'] = df['KepemilikanProperti'].cat.codes
df['TipePendapatan'] = df['TipePendapatan'].cat.codes
df['TingkatPendidikan'] = df['TingkatPendidikan'].cat.codes
df['StatusKeluarga'] = df['StatusKeluarga'].cat.codes
df['TipeRumah'] = df['TipeRumah'].cat.codes

In [None]:
index_null = df[df['Pekerjaan'].isnull()].index.tolist()
test_mv = df.iloc[index_null]

In [None]:
train_mv = df.drop(index_null)

In [None]:
train_mv['Pekerjaan'] = train_mv['Pekerjaan'].cat.codes

In [None]:
corr_col = train_mv.corr()

In [None]:
corr_col['Pekerjaan']

Id_customer           -0.001239
JK                     0.028289
KepemilikanMobil      -0.038866
KepemilikanProperti    0.005130
JmlAnak               -0.011505
Pendapatan            -0.012059
TipePendapatan        -0.106225
TingkatPendidikan     -0.127887
StatusKeluarga        -0.011756
TipeRumah              0.002157
FlagMobile                  NaN
FlagWorkPhone          0.005652
FlagPhone             -0.013332
Email                 -0.014900
Pekerjaan              1.000000
JmlAnggotaKeluarga    -0.014575
Age                   -0.015691
Experience            -0.055193
Name: Pekerjaan, dtype: float64

### get column names of train data

In [None]:
feature = []
for i in train_mv.columns:
  if i not in ['Pekerjaan', 'Id_customer', 'FlagMobile']:
    feature.append(i)
feature

['JK',
 'KepemilikanMobil',
 'KepemilikanProperti',
 'JmlAnak',
 'Pendapatan',
 'TipePendapatan',
 'TingkatPendidikan',
 'StatusKeluarga',
 'TipeRumah',
 'FlagWorkPhone',
 'FlagPhone',
 'Email',
 'JmlAnggotaKeluarga',
 'Age',
 'Experience']

### split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x = train_mv.loc[:, feature]
y = train_mv.loc[:, ['Pekerjaan']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x,y, train_size = 0.8)

### KNN imputer

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)

  return self._fit(X, y)


KNeighborsClassifier(n_neighbors=1)

In [None]:
y_predict = knn.predict(X_test)
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix

In [None]:
rmse=mean_squared_error(y_test, y_predict)

In [None]:
print('rmse',rmse)

rmse 1.1335897309468632


In [None]:
class_report = classification_report(y_test, y_predict)
print(class_report)

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      3232
           1       0.93      0.90      0.92      1143
           2       0.91      0.92      0.92      1581
           3       0.96      0.96      0.96      8679
           4       0.94      0.94      0.94      5205
           5       0.94      0.96      0.95       184
           6       0.95      0.95      0.95      3460
           7       0.97      0.95      0.96       113
           8       0.95      0.95      0.95     15682
           9       0.94      0.90      0.92       460
          10       0.96      0.97      0.96      7075
          11       0.93      0.96      0.95      2706
          12       0.93      0.93      0.93       684
          13       0.95      0.97      0.96       208
          14       0.94      0.94      0.94      8109
          15       0.93      0.96      0.94       404
          16       0.94      0.93      0.93      1605
          17       0.93    

### predict missing value

In [None]:
features_test = test_mv.loc[:, feature]
label = test_mv.loc[:, {'Pekerjaan'}]

In [None]:
test_mv_new = knn.predict(features_test)
test_mv['Pekerjaan'] = test_mv_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_mv['Pekerjaan'] = test_mv_new


In [None]:
df_new = pd.concat([train_mv, test_mv], ignore_index=True)
df_new = df_new.sort_values(by='Id_customer', ascending=True, ignore_index=True)

In [None]:
df_new['Id_customer'].nunique()

438310

In [None]:
df_new.head()

Unnamed: 0,Id_customer,JK,KepemilikanMobil,KepemilikanProperti,JmlAnak,Pendapatan,TipePendapatan,TingkatPendidikan,StatusKeluarga,TipeRumah,FlagMobile,FlagWorkPhone,FlagPhone,Email,Pekerjaan,JmlAnggotaKeluarga,Age,Experience
0,5008804,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0
1,5008805,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0
2,5008805,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0
3,5008806,0,1,1,0,112500.0,1,0,2,3,1,0,0,0,16,2.0,59.0,3.0
4,5008808,1,0,1,0,270000.0,0,0,0,3,1,0,1,1,14,1.0,52.0,8.0


In [None]:
df_new.isnull().sum()

Id_customer            0
JK                     0
KepemilikanMobil       0
KepemilikanProperti    0
JmlAnak                0
Pendapatan             0
TipePendapatan         0
TingkatPendidikan      0
StatusKeluarga         0
TipeRumah              0
FlagMobile             0
FlagWorkPhone          0
FlagPhone              0
Email                  0
Pekerjaan              0
JmlAnggotaKeluarga     0
Age                    0
Experience             0
dtype: int64

## Merge credit history dan dataset yang sudah kita handle missing valuesnya

In [None]:
df = df_new.merge(cr, on ="Id_customer", how ="left")

In [None]:
df.head()

Unnamed: 0,Id_customer,JK,KepemilikanMobil,KepemilikanProperti,JmlAnak,Pendapatan,TipePendapatan,TingkatPendidikan,StatusKeluarga,TipeRumah,FlagMobile,FlagWorkPhone,FlagPhone,Email,Pekerjaan,JmlAnggotaKeluarga,Age,Experience,Overdue
0,5008804,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0,1.0
1,5008805,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0,1.0
2,5008805,0,1,1,0,427500.0,1,1,2,4,1,1,0,0,10,2.0,33.0,12.0,1.0
3,5008806,0,1,1,0,112500.0,1,0,2,3,1,0,0,0,16,2.0,59.0,3.0,0.0
4,5008808,1,0,1,0,270000.0,0,0,0,3,1,0,1,1,14,1.0,52.0,8.0,0.0


In [None]:
df.isnull().sum()

Id_customer                 0
JK                          0
KepemilikanMobil            0
KepemilikanProperti         0
JmlAnak                     0
Pendapatan                  0
TipePendapatan              0
TingkatPendidikan           0
StatusKeluarga              0
TipeRumah                   0
FlagMobile                  0
FlagWorkPhone               0
FlagPhone                   0
Email                       0
Pekerjaan                   0
JmlAnggotaKeluarga          0
Age                         0
Experience                  0
Overdue                405456
dtype: int64

### delete row yang ada NaN

In [None]:
df = df.dropna(axis=0)

In [None]:
df.describe()

Unnamed: 0,Id_customer,JK,KepemilikanMobil,KepemilikanProperti,JmlAnak,Pendapatan,TipePendapatan,TingkatPendidikan,StatusKeluarga,TipeRumah,FlagMobile,FlagWorkPhone,FlagPhone,Email,Pekerjaan,JmlAnggotaKeluarga,Age,Experience,Overdue
count,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0,33000.0
mean,5077801.0,0.670758,0.377394,0.672303,0.427303,185803.0,1.451909,0.290939,1.632909,2.852788,1.0,0.223424,0.292515,0.091182,7.617758,2.194,43.879091,177.815576,0.127939
std,41966.03,0.469945,0.484742,0.469381,0.741541,101103.4,1.38086,0.478188,0.706583,0.604057,0.0,0.416547,0.454925,0.287872,4.163484,0.911157,11.558684,374.718837,0.334027
min,5008804.0,0.0,0.0,0.0,0.0,27000.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,21.0,0.0,0.0
25%,5041874.0,0.0,0.0,0.0,0.0,121500.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,4.0,2.0,34.0,3.0,0.0
50%,5069371.0,1.0,0.0,1.0,0.0,157500.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,8.0,2.0,43.0,7.0,0.0
75%,5114593.0,1.0,1.0,1.0,1.0,225000.0,3.0,1.0,2.0,3.0,1.0,0.0,1.0,0.0,10.0,3.0,53.0,16.0,0.0
max,5150487.0,1.0,1.0,1.0,19.0,1575000.0,4.0,2.0,2.0,4.0,1.0,1.0,1.0,1.0,17.0,20.0,69.0,1001.0,1.0


In [None]:
df.Id_customer.value_counts()

5009022    2
5009170    2
5009371    2
5009370    2
5009369    2
          ..
5052932    1
5052931    1
5052930    1
5052929    1
5150487    1
Name: Id_customer, Length: 32910, dtype: int64

In [None]:
df.Overdue.value_counts()

0.0    28778
1.0     4222
Name: Overdue, dtype: int64

In [None]:
df.shape[0]

33000

In [None]:
df = df.drop_duplicates()

In [None]:
df.Id_customer.value_counts()

5008804    1
5100042    1
5099999    1
5099993    1
5099991    1
          ..
5052850    1
5052849    1
5052848    1
5052847    1
5150487    1
Name: Id_customer, Length: 32910, dtype: int64

In [None]:
df.shape[0]

32910

In [None]:
df.to_csv('Datasets.csv')

In [None]:
#files.download('Datasets.csv')

# Build Model

## Melihat korelasi tiap column terhadap overdue

In [None]:
corr_col = df.corr()

In [None]:
corr_col['Overdue']

Id_customer            0.005444
JK                    -0.023770
KepemilikanMobil      -0.009119
KepemilikanProperti   -0.031034
JmlAnak                0.013482
Pendapatan             0.028256
TipePendapatan        -0.019906
TingkatPendidikan     -0.002836
StatusKeluarga        -0.006619
TipeRumah             -0.011112
FlagMobile                  NaN
FlagWorkPhone          0.001318
FlagPhone             -0.005236
Email                  0.019464
Pekerjaan             -0.000981
JmlAnggotaKeluarga     0.011236
Age                   -0.033856
Experience            -0.025959
Overdue                1.000000
Name: Overdue, dtype: float64

In [None]:
feature = []
for i in df.columns:
  if i not in ['Overdue', 'Id_customer', 'FlagMobile']:
    feature.append(i)
feature

['JK',
 'KepemilikanMobil',
 'KepemilikanProperti',
 'JmlAnak',
 'Pendapatan',
 'TipePendapatan',
 'TingkatPendidikan',
 'StatusKeluarga',
 'TipeRumah',
 'FlagWorkPhone',
 'FlagPhone',
 'Email',
 'Pekerjaan',
 'JmlAnggotaKeluarga',
 'Age',
 'Experience']

In [None]:
x = df.loc[:, feature]
y = df.loc[:, ['Overdue']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()

In [None]:
y_train_1d = np.ravel(y_train)

In [None]:
nb.fit(X_train, y_train_1d)

GaussianNB()

In [None]:
accuracy = nb.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8614402917046491


In [None]:
from sklearn.metrics import classification_report

test = nb.predict(X_test)

print ("Testing selesai")

print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

         0.0       0.87      0.99      0.93      5704
         1.0       0.20      0.01      0.02       878

    accuracy                           0.86      6582
   macro avg       0.53      0.50      0.47      6582
weighted avg       0.78      0.86      0.81      6582



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train_1d)

LogisticRegression()

In [None]:
y_pred = lr.predict(X_test)

In [None]:
accuracy = lr.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8666058948647828


In [None]:
test = lr.predict(X_test)

print ("Testing selesai")

print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93      5704
         1.0       0.00      0.00      0.00       878

    accuracy                           0.87      6582
   macro avg       0.43      0.50      0.46      6582
weighted avg       0.75      0.87      0.80      6582



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
rf.fit(X_train,y_train_1d)

RandomForestClassifier(random_state=42)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
accuracy = rf.score(X_test, y_test)
print(f'Model accuracy: {accuracy:.2f}')

Model accuracy: 0.85


In [None]:
test = rf.predict(X_test)

print ("Testing selesai")

print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      5723
         1.0       0.10      0.02      0.04       859

    accuracy                           0.85      6582
   macro avg       0.49      0.50      0.48      6582
weighted avg       0.77      0.85      0.80      6582



## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [None]:
y_train_1d = np.ravel(y_train)

In [None]:
model = SVC(kernel='linear', random_state=42)

In [None]:
#model.fit(X_train,y_train_1d)

In [None]:
#y_pred = model.predict(X_test)

In [None]:
#accuracy = model.score(X_test, y_test)
print(f'Model accuracy: {accuracy:.2f}')

Model accuracy: 0.85


In [None]:
#test = model.predict(X_test)

#print ("Testing selesai")

print(classification_report(y_test, test))

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      5707
         1.0       0.14      0.03      0.05       875

    accuracy                           0.84      6582
   macro avg       0.50      0.50      0.48      6582
weighted avg       0.77      0.84      0.80      6582



## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [None]:
y_train_1d = np.ravel(y_train)

In [None]:
dt = DecisionTreeClassifier(random_state=2)

In [None]:
dt.fit(X_train,y_train)

DecisionTreeClassifier(random_state=2)

In [None]:
y_pred = dt.predict(X_test)

In [None]:
accuracy = dt.score(X_test, y_test)
print(f'Model accuracy: {accuracy:.2f}')

Model accuracy: 0.87


In [None]:
test = dt.predict(X_test)

print ("Testing selesai")

print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

         0.0       0.90      0.96      0.93      5707
         1.0       0.52      0.30      0.38       875

    accuracy                           0.87      6582
   macro avg       0.71      0.63      0.65      6582
weighted avg       0.85      0.87      0.85      6582



### cross val

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


# Create the classifier
clf = DecisionTreeClassifier()

# Use 5-fold cross-validation
scores = cross_val_score(clf, x, y, cv=5)

# Print the mean and standard deviation of the scores
print(f'Mean score: {scores.mean()}')
print(f'Standard deviation: {scores.std()}')


Mean score: 0.7708599209966576
Standard deviation: 0.014655772396278854


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Use cross_val_predict to get the predicted labels for each fold
y_pred = cross_val_predict(clf, x, y, cv=5)

# Generate the report
report = classification_report(y, y_pred)
print(report)


              precision    recall  f1-score   support

         0.0       0.87      0.86      0.87     28696
         1.0       0.13      0.14      0.14      4214

    accuracy                           0.77     32910
   macro avg       0.50      0.50      0.50     32910
weighted avg       0.78      0.77      0.77     32910



# Predict Data_test

### Disini saya menggunakan decision tree classifier karena f1 scorenya lebih tinggi dari model lain yang saya coba

#### saya test dengan file data_test yang telah disediakan oleh celerates

In [None]:
importance = dt.feature_importances_
importance

array([0.01870116, 0.02255723, 0.02008844, 0.02205738, 0.17932977,
       0.05117346, 0.02927425, 0.04592705, 0.03127874, 0.01948301,
       0.02784267, 0.01603373, 0.12031099, 0.04331344, 0.19536434,
       0.15726436])

In [None]:
data_test = pd.read_csv("/content/drive/MyDrive/finproject_celerates/data_test.csv")

In [None]:
obj_df = data_test.select_dtypes(include=['object'])
colum = obj_df
for i in colum:
  data_test[i] = data_test[i].astype('category')

In [None]:
data_test['JK'] = data_test['JK'].cat.codes
data_test['KepemilikanMobil'] = data_test['KepemilikanMobil'].cat.codes
data_test['KepemilikanProperti'] = data_test['KepemilikanProperti'].cat.codes
data_test['TipePendapatan'] = data_test['TipePendapatan'].cat.codes
data_test['TingkatPendidikan'] = data_test['TingkatPendidikan'].cat.codes
data_test['StatusKeluarga'] = data_test['StatusKeluarga'].cat.codes
data_test['TipeRumah'] = data_test['TipeRumah'].cat.codes
data_test['Pekerjaan'] = data_test['Pekerjaan'].cat.codes

In [None]:
feature_test = []
for i in data_test.columns:
  if i not in ['Id_customer', 'FlagMobile']:
    feature_test.append(i)
feature_test

['JK',
 'KepemilikanMobil',
 'KepemilikanProperti',
 'JmlAnak',
 'Pendapatan',
 'TipePendapatan',
 'TingkatPendidikan',
 'StatusKeluarga',
 'TipeRumah',
 'FlagWorkPhone',
 'FlagPhone',
 'Email',
 'Pekerjaan',
 'JmlAnggotaKeluarga',
 'Age',
 'Experience']

In [None]:
x_test = data_test.loc[:, feature_test]

In [None]:
test = dt.predict(x_test)

In [None]:
pred_series = pd.Series(test)

In [None]:
data_test['Value'] = pred_series

In [None]:
data_test[['Id_customer','Value']].to_csv("tes.csv")

In [None]:
#files.download('tes.csv')

# DECODE

In [None]:
df = df.replace({'JK':original[0]})
df = df.replace({'StatusKeluarga':original[5]})
df = df.replace({'TipeRumah':original[6]})
df = df.replace({'KepemilikanMobil':original[1]})
df = df.replace({'KepemilikanProperti':original[1]})
df = df.replace({'TipePendapatan':original[3]})
df = df.replace({'TingkatPendidikan':original[4]})
df = df.replace({'Pekerjaan':original[7]})

In [None]:
df = df.replace({'Overdue':{0:"Good Borrower", 1:"Bad Borrower"}})

In [None]:
df.head()

Unnamed: 0,Id_customer,JK,KepemilikanMobil,KepemilikanProperti,JmlAnak,Pendapatan,TipePendapatan,TingkatPendidikan,StatusKeluarga,TipeRumah,FlagMobile,FlagWorkPhone,FlagPhone,Email,Pekerjaan,JmlAnggotaKeluarga,Age,Experience,Overdue
0,5008804,Laki-laki,Ya,Ya,0,427500.0,Bekerja,Postgraduate,Menikah,Sewa Apartemen,1,1,0,0,Managers,2.0,33.0,12.0,Bad Borrower
1,5008805,Laki-laki,Ya,Ya,0,427500.0,Bekerja,Postgraduate,Menikah,Sewa Apartemen,1,1,0,0,Managers,2.0,33.0,12.0,Bad Borrower
3,5008806,Laki-laki,Ya,Ya,0,112500.0,Bekerja,Graduate,Menikah,Rumah pribadi,1,0,0,0,Security staff,2.0,59.0,3.0,Good Borrower
4,5008808,Perempuan,Tidak,Ya,0,270000.0,Asosiasi komersial,Graduate,Belum Menikah,Rumah pribadi,1,0,1,1,Sales staff,1.0,52.0,8.0,Good Borrower
6,5008810,Perempuan,Tidak,Ya,0,270000.0,Asosiasi komersial,Graduate,Belum Menikah,Rumah pribadi,1,0,1,1,Sales staff,1.0,52.0,8.0,Good Borrower


In [None]:
df.to_csv('Datasets_clean.csv')

In [None]:
#files.download('Datasets_clean.csv')