# Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
import statsmodels.api as sm
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Import Data

In [8]:
kolom = pd.read_csv('/kaggle/input/kolom-satria-data/kolom.csv',names=['Variable','Position','Label'])
nama_kolom = kolom.iloc[2:28,0]
df = pd.read_csv('/kaggle/input/data-train-satria-data-2022/train_fktp.txt',header=None,names=nama_kolom)
df.head()

Unnamed: 0,PSTV01,PSTV02,PSTV15,FKP02,FKP03,FKP04,FKP05,FKP06,FKP07,FKP08,...,FKP15,FKP15A,FKP16,FKP17,FKP18,FKP19,FKP20,FKP21,FKP22,FKP13Class
0,157978748.0,50765420,20.064983,958073632,2019-08-01,2019-08-01,61,6108,3,1,...,9999,9999,98,9998,98,98,98,98,2,Sehat
1,95085112.0,26931775,2.626307,449590620P000054,2020-06-17,2020-06-17,51,5171,9,3,...,K297,"Gastritis, unspecified",98,9998,98,98,98,98,1,Belum_Sehat
2,224470578.0,227368233,1.05183,467490619P000001,2019-06-01,2019-06-01,35,3516,9,2,...,Z309,"Contraceptive management, unspecified",98,9998,98,98,98,98,1,Belum_Sehat
3,63871289.0,62313678,364.741455,254321219Y002368,2019-12-21,2019-12-21,34,3402,3,1,...,I10,Essential (primary) hypertension,98,9998,98,98,98,98,1,Belum_Sehat
4,29915626.0,113446188,1.050523,88681119P000020,2019-11-04,2019-11-04,35,3509,3,1,...,I110,Hypertensive heart disease with (congestive) h...,98,9998,98,98,98,98,1,Belum_Sehat


In [16]:
print(df['FKP13Class'].value_counts(normalize=True))
df['FKP13Class'].value_counts()

0    0.64939

1    0.35061

Name: FKP13Class, dtype: float64


0    2634511
1    1422387
Name: FKP13Class, dtype: int64

# Feature Engineering & Preprocessing Data

In [9]:
df['FKP13Class'].replace(['Sehat','Belum_Sehat'],[1,0],inplace=True)
df['FKP03'] = pd.to_datetime(df['FKP03'])
df['FKP04'] = pd.to_datetime(df['FKP04'])
df['lama_rawat'] = (df['FKP04']-df['FKP03']).dt.days
df['year'] = df['FKP03'].dt.year
df['month'] = df['FKP03'].dt.month
df['day_of_week'] = df['FKP03'].dt.day_of_week
df['week_of_year'] = df['FKP03'].dt.week
df['quarter_of_year'] = df['FKP03'].dt.quarter

df["diag_icd_0"] = df["FKP14A"].str[0]
df["diag_icd_1"] = df["FKP14A"].str[1:]
df["diag_icd_2"] = df["FKP15"].apply(lambda x: str(x)[-1] if str(x) != "9999" else str(x))

df["diag_icd_0"].fillna("9999", inplace=True)
df["diag_icd_1"].fillna("9999", inplace=True)
df["FKP11"].fillna(df["FKP11"].mode()[0], inplace=True)

df = df.drop(["PSTV01", "PSTV02", "FKP02","FKP03", "FKP04","FKP14A", "FKP15", "FKP15A"],axis=1)
df.info()

  df['week_of_year'] = df['FKP03'].dt.week


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4056898 entries, 0 to 4056897
Data columns (total 27 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PSTV15           float64
 1   FKP05            int64  
 2   FKP06            int64  
 3   FKP07            int64  
 4   FKP08            int64  
 5   FKP09            int64  
 6   FKP10            int64  
 7   FKP11            float64
 8   FKP12            int64  
 9   FKP14            int64  
 10  FKP16            int64  
 11  FKP17            int64  
 12  FKP18            int64  
 13  FKP19            int64  
 14  FKP20            int64  
 15  FKP21            int64  
 16  FKP22            int64  
 17  FKP13Class       int64  
 18  lama_rawat       int64  
 19  year             int64  
 20  month            int64  
 21  day_of_week      int64  
 22  week_of_year     int64  
 23  quarter_of_year  int64  
 24  diag_icd_0       object 
 25  diag_icd_1       object 
 26  diag_icd_2       object 
dtypes: float64(2

In [10]:
le = LabelEncoder()
obj_cols = list(df.select_dtypes(include="object").columns)
for col in obj_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])

# Handling Missing Value

In [11]:
df.isnull().sum()

PSTV15             0
FKP05              0
FKP06              0
FKP07              0
FKP08              0
FKP09              0
FKP10              0
FKP11              0
FKP12              0
FKP14              0
FKP16              0
FKP17              0
FKP18              0
FKP19              0
FKP20              0
FKP21              0
FKP22              0
FKP13Class         0
lama_rawat         0
year               0
month              0
day_of_week        0
week_of_year       0
quarter_of_year    0
diag_icd_0         0
diag_icd_1         0
diag_icd_2         0
dtype: int64

# Split Data

In [12]:
X = df.drop(['FKP13Class'], axis=1)
y = df['FKP13Class']

In [13]:
print(X.shape)
print(y.shape)

(4056898, 26)
(4056898,)


In [14]:
from sklearn.model_selection import train_test_split
# Separate train and test set for modelling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

# Train and test set dimension
print('Shape of X_train', X_train.shape)
print('Shape of y_train', y_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_test', y_test.shape)

Shape of X_train (3245518, 26)
Shape of y_train (3245518,)
Shape of X_test (811380, 26)
Shape of y_test (811380,)


## Percobaan 1 (XGB)

In [15]:
# Fit classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# predict test set
y_pred_def = xgb.predict(X_test)

# Calculate accuracy, precision, recall, and f1-score
train_score_def =xgb.score(X_train, y_train) 
test_score_def = xgb.score(X_test, y_test) 
prec_score_def = precision_score(y_test, y_pred_def)
recall_score_def = recall_score(y_test, y_pred_def)
f1_def = f1_score(y_test, y_pred_def)

print('Training Accuracy : {}%'.format(train_score_def))
print('Test Accuracy : {}%'.format(test_score_def))
print('Precision Score : {}%'.format(prec_score_def))
print('Recall Score : {}%'.format(recall_score_def))
print('F1 Score : {}%'.format(f1_def))

Training Accuracy : 0.9995461433275058%
Test Accuracy : 0.9993406295447262%
Precision Score : 0.9988720417449971%
Recall Score : 0.999247744992583%
F1 Score : 0.9990598580473515%


In [17]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred_xgb= xgb.predict(X_test)
print(classification_report(y_test, y_pred_def,digits=5))

              precision    recall  f1-score   support

           0    0.99959   0.99939   0.99949    526902
           1    0.99887   0.99925   0.99906    284478

    accuracy                        0.99934    811380
   macro avg    0.99923   0.99932   0.99928    811380
weighted avg    0.99934   0.99934   0.99934    811380



In [18]:
positive_records = y_train.sum()
negative_records = len(y_train) - positive_records
spw = negative_records / positive_records
xgb = XGBClassifier(scale_pos_weight=spw)
xgb.fit(X_train, y_train)

# predict test set
y_pred_def = xgb.predict(X_test)

# Calculate accuracy, precision, recall, and f1-score
train_score_def =xgb.score(X_train, y_train) 
test_score_def = xgb.score(X_test, y_test) 
prec_score_def = precision_score(y_test, y_pred_def)
recall_score_def = recall_score(y_test, y_pred_def)
f1_def = f1_score(y_test, y_pred_def)

print('Training Accuracy : {}%'.format(train_score_def))
print('Test Accuracy : {}%'.format(test_score_def))
print('Precision Score : {}%'.format(prec_score_def))
print('Recall Score : {}%'.format(recall_score_def))
print('F1 Score : {}%'.format(f1_def))

Training Accuracy : 0.999514715370551%
Test Accuracy : 0.9993554191624147%
Precision Score : 0.9984797469287728%
Recall Score : 0.999683631071647%
F1 Score : 0.9990813263329112%


In [19]:
y_pred_xgb= xgb.predict(X_test)
print(classification_report(y_test, y_pred_def,digits=5))

              precision    recall  f1-score   support

           0    0.99983   0.99918   0.99950    526902
           1    0.99848   0.99968   0.99908    284478

    accuracy                        0.99936    811380
   macro avg    0.99915   0.99943   0.99929    811380
weighted avg    0.99936   0.99936   0.99936    811380



## Percobaan 2 (Random Forest)

In [20]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
rfc = RandomForestClassifier()

# Fit classifier
rfc.fit(X_train, y_train)

# predict test set
y_pred_def = rfc.predict(X_test)

# Calculate accuracy, precision, recall, and f1-score
train_score_def =rfc.score(X_train, y_train) 
test_score_def = rfc.score(X_test, y_test) 
prec_score_def = precision_score(y_test, y_pred_def)
recall_score_def = recall_score(y_test, y_pred_def)
f1_def = f1_score(y_test, y_pred_def)

print('Training Accuracy : {}%'.format(train_score_def))
print('Test Accuracy : {}%'.format(test_score_def))
print('Precision Score : {}%'.format(prec_score_def))
print('Recall Score : {}%'.format(recall_score_def))
print('F1 Score : {}%'.format(f1_def))

Training Accuracy : 0.9999987675310998%
Test Accuracy : 0.9994515516773892%
Precision Score : 0.9990933020323389%
Recall Score : 0.9993426556710888%
F1 Score : 0.999217963295239%


In [21]:
y_pred_xgb= rfc.predict(X_test)
print(classification_report(y_test, y_pred_def,digits=5))

              precision    recall  f1-score   support

           0    0.99965   0.99951   0.99958    526902
           1    0.99909   0.99934   0.99922    284478

    accuracy                        0.99945    811380
   macro avg    0.99937   0.99943   0.99940    811380
weighted avg    0.99945   0.99945   0.99945    811380



## Percobaan 3 (Logistic Regression)

In [22]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

# Fit classifier
lg.fit(X_train, y_train)

# predict test set
y_pred_def = lg.predict(X_test)

# Calculate accuracy, precision, recall, and f1-score
train_score_def =lg.score(X_train, y_train) 
test_score_def = lg.score(X_test, y_test) 
prec_score_def = precision_score(y_test, y_pred_def)
recall_score_def = recall_score(y_test, y_pred_def)
f1_def = f1_score(y_test, y_pred_def)

print('Training Accuracy : {}%'.format(train_score_def))
print('Test Accuracy : {}%'.format(test_score_def))
print('Precision Score : {}%'.format(prec_score_def))
print('Recall Score : {}%'.format(recall_score_def))
print('F1 Score : {}%'.format(f1_def))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy : 0.9891915558625772%
Test Accuracy : 0.9892491804086865%
Precision Score : 0.9987556205076634%
Recall Score : 0.9705460527703372%
F1 Score : 0.9844487904713854%


In [23]:
y_pred_xgb= lg.predict(X_test)
print(classification_report(y_test, y_pred_def,digits=5))

              precision    recall  f1-score   support

           0    0.98434   0.99935   0.99179    526902
           1    0.99876   0.97055   0.98445    284478

    accuracy                        0.98925    811380
   macro avg    0.99155   0.98495   0.98812    811380
weighted avg    0.98939   0.98925   0.98921    811380



percobaan 3 (XGB with Standarization)

In [24]:
from mlxtend.classifier import StackingClassifier
from sklearn.svm import SVC
xgb = XGBClassifier()
xgb2 = XGBClassifier(scale_pos_weight=spw)
clf2 =StackingClassifier(classifiers =[xgb, xgb2], meta_classifier = rfc ,use_probas = True)
clf2.fit(X_train, y_train)

# predict test set
y_pred_def = clf2.predict(X_test)

# Calculate accuracy, precision, recall, and f1-score
train_score_def =clf2.score(X_train, y_train)*100
test_score_def = clf2.score(X_test, y_test) *100
prec_score_def = precision_score(y_test, y_pred_def)*100
recall_score_def = recall_score(y_test, y_pred_def)*100
f1_def = f1_score(y_test, y_pred_def)*100

print('Training Accuracy : {}'.format(train_score_def))
print('Test Accuracy : {}'.format(test_score_def))
print('Precision Score : {}'.format(prec_score_def))
print('Recall Score : {}'.format(recall_score_def))
print('F1 Score : {}'.format(f1_def))

Training Accuracy : 99.99953782416243
Test Accuracy : 99.93689763119623
Precision Score : 99.90194148824001
Recall Score : 99.91809559965972
F1 Score : 99.91001789097402


In [32]:
data1 = 'test_fktp.txt'
df1 = pd.read_csv(data1,header=None,names=nama_kolom)
df1


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,PSTV01,PSTV02,PSTV15,FKP02,FKP03,FKP04,FKP05,FKP06,FKP07,FKP08,...,FKP15,FKP15A,FKP16,FKP17,FKP18,FKP19,FKP20,FKP21,FKP22,FKP13Class
0,187883292.0,98048784,240.884842,954386050,2019-08-21,2019-08-21,31,3172,9,2,...,9999,9999,98,9998,98,98,98,98,2,
1,123307139.0,20969999,47.378571,774246805,2019-02-12,2019-02-12,17,1705,3,1,...,9999,9999,98,9998,98,98,98,98,2,
2,21416273.0,21416273,5.882927,1202010273,2020-03-27,2020-03-27,64,6402,3,1,...,9999,9999,98,9998,98,98,98,98,2,
3,97630052.0,97630052,172.916031,1286640248,2020-07-30,2020-07-30,18,1872,9,2,...,9999,9999,98,9998,98,98,98,98,2,
4,124531163.0,124531163,187.203140,1285179242,2020-07-28,2020-07-28,18,1809,9,2,...,9999,9999,98,9998,98,98,98,98,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014220,51801010.0,51801010,7.563763,248870919P000406,2019-09-08,2019-09-08,36,3601,9,2,...,Z992,Dependence on renal dialysis,36,3602,3,1,27,99,1,
1014221,339397889.0,339397889,11.063568,116000620P000034,2020-06-02,2020-06-02,33,3373,9,2,...,Z999,Dependence on unspecified enabling machine and...,98,9998,98,98,98,98,1,
1014222,76213152.0,76213152,286.792694,160421019P000631,2019-10-15,2019-10-15,32,3276,3,1,...,Z992,Dependence on renal dialysis,32,3276,9,1,33,99,1,
1014223,101797983.0,101797983,30.222155,332670919P000083,2019-09-06,2019-09-06,16,1606,3,1,...,Z992,Dependence on renal dialysis,16,1606,3,1,27,99,1,


In [33]:
df1.FKP03 = pd.to_datetime(df1.FKP03)
df1.FKP04 = pd.to_datetime(df1.FKP04)
df1['days'] = df1.FKP04 - df1.FKP03
df1.days = df1.days.astype('str')
df1['days'] = df1['days'].str.replace(r'\D', '').astype(int)
df1.days = df1.days.astype('int64')


  df1['days'] = df1['days'].str.replace(r'\D', '').astype(int)


In [None]:
X_predict = df1.drop(['PSTV01','FKP13Class','PSTV02','FKP15A','FKP14A','FKP02','FKP03','FKP04'],axis=1)

In [None]:
lr = LinearRegression()
imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=15, verbose=2, imputation_order='roman',random_state=0)
X_predict=imp.fit_transform(X_predict)

In [None]:
final = rfc.predict(X_predict)
final

In [None]:
unique, counts = np.unique(final, return_counts=True)
print(np.asarray((unique, counts)).T)

[[     0 657581]

 [     1 356644]]


In [None]:
unique, counts = np.unique(final, return_counts=True)
print(np.asarray((unique, counts)).T)

[[     0 659351]

 [     1 354874]]


In [None]:
!head -n20 'submission.csv'

'head' is not recognized as an internal or external command,

operable program or batch file.


In [None]:
fix = pd.read_csv('submission.csv')
fix


  fix = pd.read_csv('submission.csv')


Unnamed: 0.1,Unnamed: 0,FKP02,Status
0,1,954386050,
1,2,774246805,
2,3,1202010273,
3,4,1286640248,
4,5,1285179242,
...,...,...,...
1014220,1014221,248870919P000406,
1014221,1014222,116000620P000034,
1014222,1014223,160421019P000631,
1014223,1014224,332670919P000083,


In [None]:
fix.Status = final

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    1
40    1
41    1
42    1
43    1
44    1
45    1
46    1
47    1
48    1
49    1
Name: Status, dtype: int64

In [None]:
fix.iloc[:,:2] = fix.iloc[:,:2].astype('str')
fix.Status = fix.Status.astype('int64')

In [None]:
fix.to_csv('submission1.csv', index=False)

In [None]:
fix['nyoba'] = num_features_predict['187883292']
fix.sort_values(by=['PSTV01','nyoba'],ascending=False, inplace=True)
fix['hasil'] = np.where(fix['PSTV01'] == fix['nyoba'], 'True', 'False')
fix

Unnamed: 0.1,Unnamed: 0,PSTV01,Status,nyoba,hasil
771367,771368,455377139.0,,55262812.0,False
710032,710033,455372039.0,,121443360.0,False
848212,848213,455372039.0,,23932356.0,False
749253,749254,455369713.0,,77573364.0,False
317706,317707,455362964.0,,165653256.0,False
...,...,...,...,...,...
141,142,218.0,,87175694.0,False
267353,267354,218.0,,82003963.0,False
137202,137203,218.0,,11485277.0,False
236540,236541,101.0,,189473.0,False


In [None]:
fix[fix['hasil'] == "True"]

Unnamed: 0.1,Unnamed: 0,PSTV01,Status,nyoba,hasil
895999,896000,445337215.0,,445337215.0,True
819775,819776,443842043.0,,443842043.0,True
896069,896070,440312877.0,,440312877.0,True
388888,388889,439347517.0,,439347517.0,True
980287,980288,425220988.0,,425220988.0,True
...,...,...,...,...,...
962287,962288,525990.0,,525990.0,True
368138,368139,443490.0,,443490.0,True
685645,685646,436504.0,,436504.0,True
979794,979795,281764.0,,281764.0,True


In [None]:
fix['Status'] = final.tolist()
fix

In [None]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models