In [91]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor

In [92]:
df = pd.read_csv('datasets/dataset_p1.csv')
df

Unnamed: 0,R1301A,R1301B,R1301C,R1303A,R1303B,R1303C,R1305A,R1305B,R1305C,R1305D,...,R1206B,R1207,R1208,R1209,R1210A,R1210B,R1210C,R1210D,R1210E,index_k
0,1,1,1,4,1,4,1,2,2,3,...,2,1,2,3,2,2,1,1,2,62.31
1,1,2,2,4,1,2,4,2,2,3,...,3,1,2,4,1,2,1,1,2,72.21
2,1,1,2,1,1,4,4,1,2,2,...,2,1,2,4,2,2,1,2,2,70.80
3,1,1,2,2,1,2,2,2,2,3,...,3,2,4,2,2,2,2,1,2,72.89
4,1,1,2,4,1,2,2,2,2,3,...,2,4,2,3,1,2,2,1,2,72.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74679,1,1,3,1,1,1,1,1,1,1,...,2,1,3,4,1,1,1,1,1,64.53
74680,1,1,3,1,1,1,1,1,1,1,...,2,1,3,4,1,1,1,1,1,72.00
74681,1,1,2,1,1,1,1,1,1,3,...,2,1,2,4,1,1,1,1,1,61.60
74682,1,1,3,1,1,1,1,1,1,3,...,2,1,3,1,1,2,1,1,2,61.48


# EDA

In [93]:
def natural_sort_key(column_name):
    # Menggunakan regex untuk memisahkan string menjadi bagian-bagian angka (\d+) dan non-angka (\D+)
    # Filter(None, ...) akan menghilangkan string kosong yang mungkin dihasilkan oleh split
    parts = [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', column_name)]
    return parts

# Mengambil daftar kolom dan mengurutkannya menggunakan kunci kustom
sorted_columns_natural_order = sorted(df.columns, key=natural_sort_key)

# Menerapkan urutan kolom yang baru ke DataFrame
df = df.reindex(sorted_columns_natural_order, axis=1)

## Kolom String

In [94]:

kolom_string = df.select_dtypes(include='object').columns.tolist()
if kolom_string:
    print(f"Kolom dengan tipe data 'object': {kolom_string}")
else:
    print("Tidak ada kolom dengan tipe data 'object' ditemukan.")


Kolom dengan tipe data 'object': ['R702B', 'R1607B', 'R1608C', 'R1609C']


In [95]:
df[['R1607B', 'R1608C', 'R1609C', 'R702B']]

Unnamed: 0,R1607B,R1608C,R1609C,R702B
0,,,,
1,,,,
2,,,,
3,,,,
4,D,,,
...,...,...,...,...
74679,,D,,
74680,BC,F,,
74681,,D,,
74682,D,A,,


In [96]:
data_bukan_nan = df[['R1607B', 'R1608C', 'R1609C', 'R702B']].dropna()
data_bukan_nan

Unnamed: 0,R1607B,R1608C,R1609C,R702B
280,X,A,A,Jantung
335,C,D,A,Diabetes
577,B,B,A,USUS BUNTU
586,C,A,A,Darah Tinggi
788,X,E,A,Diabetes
...,...,...,...,...
73625,C,A,A,Asma
73807,X,A,A,Asam lambung
73811,X,AF,A,Asma
74597,BX,A,C,Penyakit tahunan


## Info Data

In [97]:
kolom_numeric = df.drop(columns=['R1607B', 'R1608C', 'R1609C', 'R702B'])
kolom_numeric

Unnamed: 0,index_k,R501,R502,R503,R504A,R504B,R504C,R601A,R601B1,R602A1,...,R1610,R1611,R1612,R1613A,R1613B,R1613C,R1614,R1615,R1616,WEIGHT
0,62.31,2,2,1,1,1,1,2,3.0,,...,3,1,4,2,,2.0,2,4,1,210.870377
1,72.21,3,2,1,1,1,1,1,,1.0,...,4,1,4,2,,2.0,2,4,2,210.870377
2,70.80,3,2,1,1,1,1,2,1.0,,...,3,2,3,2,,2.0,2,4,2,210.870377
3,72.89,3,2,1,1,1,1,1,,1.0,...,4,1,4,2,,2.0,2,4,2,210.870377
4,72.34,5,2,1,1,2,2,1,,1.0,...,3,1,4,2,,1.0,2,4,2,210.870377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74679,64.53,9,2,2,1,2,2,1,,14.0,...,2,2,1,1,2.0,,1,4,1,544.102112
74680,72.00,9,1,2,1,2,2,1,,6.0,...,2,2,1,1,2.0,,1,4,1,544.102112
74681,61.60,9,2,2,1,1,1,1,,14.0,...,1,2,1,1,2.0,,1,4,1,544.102112
74682,61.48,5,2,1,1,1,1,1,,14.0,...,3,2,1,2,,2.0,1,4,3,544.102112


In [98]:
kolom_numeric.describe()

Unnamed: 0,index_k,R501,R502,R503,R504A,R504B,R504C,R601A,R601B1,R602A1,...,R1610,R1611,R1612,R1613A,R1613B,R1613C,R1614,R1615,R1616,WEIGHT
count,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,21301.0,53383.0,...,74684.0,74684.0,74684.0,74684.0,22804.0,47270.0,74684.0,74684.0,74684.0,74684.0
mean,66.410889,4.006615,1.856288,1.537317,1.111831,1.933774,1.626105,1.285215,1.178583,5.439765,...,3.293088,2.011421,3.070028,1.756387,1.884406,1.811064,1.772722,3.49684,1.503254,934.529876
std,6.940839,1.956597,0.3508,0.498609,0.393634,0.932984,0.903802,0.45152,0.560209,5.412735,...,1.009745,1.032286,1.091356,0.554728,0.319744,0.659539,0.881138,0.872296,0.986134,903.367661
min,9.48,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,17.859985
25%,62.16,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.0,1.0,2.0,1.0,2.0,1.0,1.0,3.0,1.0,342.553009
50%,66.19,4.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,3.0,...,4.0,2.0,3.0,2.0,2.0,2.0,1.0,4.0,1.0,624.806641
75%,70.5,5.0,2.0,2.0,1.0,3.0,2.0,2.0,1.0,8.0,...,4.0,2.0,4.0,2.0,2.0,2.0,3.0,4.0,2.0,1274.261597
max,100.0,10.0,2.0,2.0,4.0,4.0,4.0,2.0,3.0,17.0,...,4.0,4.0,4.0,3.0,2.0,3.0,4.0,4.0,4.0,14644.427734


# Prepocessing

Proses ini menangani data hilang karena sparse dan encoding data kategorical

## Total Mising Value

In [99]:
df['R1103BK2']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
74679   NaN
74680   NaN
74681   NaN
74682   NaN
74683   NaN
Name: R1103BK2, Length: 74684, dtype: float64

In [100]:
missing = df.isnull().sum()
print("\nJumlah Missing Values:")
print(missing[missing > 0])


Jumlah Missing Values:
R601B1     53383
R602A1     21301
R602A2     21301
R602B      21301
R603       21301
           ...  
R1609B5    25207
R1609B6    25207
R1609C     48158
R1613B     51880
R1613C     27414
Length: 99, dtype: int64


## handling data kategorical

### kolom R1607B, R1608C, R1609C

In [101]:
def encodeData(df, column,  references):
    # Loop melalui setiap huruf dalam daftar referensi
    for i, letter in enumerate(references):
        # Nama kolom baru akan menjadi R1607B_Huruf (misal: R1607B_A, R1607B_B)
        # Menggunakan f-string untuk membuat nama kolom dinamis
        # Menggunakan underscore untuk keterbacaan
        column_name = f'{column}_{letter}'
        df[column_name] = df[column].apply(
            lambda x: 0 if pd.isna(x) else (1 if letter in str(x) else 0)
        )

In [102]:
# Referensi huruf
references = ['A', 'B', 'C', 'D', 'E', 'F', 'X']
references2 = ['A', 'B', 'C', 'D', 'E', 'F']

encodeData(df, 'R1607B', references)
encodeData(df, 'R1608C', references2)
encodeData(df, 'R1609C', references2)

In [103]:
df.loc[df['R1607A'].isin([4,5]), ['R1607B','R1607B_A', 'R1607B_B', 'R1607B_C', 'R1607B_D', 'R1607B_E', 'R1607B_F', 'R1607B_X']].head()

Unnamed: 0,R1607B,R1607B_A,R1607B_B,R1607B_C,R1607B_D,R1607B_E,R1607B_F,R1607B_X
4,D,0,0,0,1,0,0,0
5,B,0,1,0,0,0,0,0
6,C,0,0,1,0,0,0,0
7,D,0,0,0,1,0,0,0
8,D,0,0,0,1,0,0,0


In [104]:
df.loc[df['R1608A'] == 1, ['R1608C','R1608C_A', 'R1608C_B', 'R1608C_C', 'R1608C_D', 'R1608C_E', 'R1608C_F']].head()

Unnamed: 0,R1608C,R1608C_A,R1608C_B,R1608C_C,R1608C_D,R1608C_E,R1608C_F
9,A,1,0,0,0,0,0
10,E,0,0,0,0,1,0
12,E,0,0,0,0,1,0
13,E,0,0,0,0,1,0
15,E,0,0,0,0,1,0


In [105]:
df.loc[df['R1609B1'] == 1, ['R1609C','R1609C_A', 'R1609C_B', 'R1609C_C', 'R1609C_D', 'R1609C_E', 'R1609C_F']].head()

Unnamed: 0,R1609C,R1609C_A,R1609C_B,R1609C_C,R1609C_D,R1609C_E,R1609C_F
23,A,1,0,0,0,0,0
28,BF,0,1,0,0,0,1
50,A,1,0,0,0,0,0
70,A,1,0,0,0,0,0
71,AB,1,1,0,0,0,0


In [106]:
df[['R1607B', 'R1607B_A', 'R1607B_B', 'R1607B_C', 'R1607B_D', 'R1607B_E', 'R1607B_F', 'R1607B_X','R1608C','R1608C_A', 'R1608C_B', 'R1608C_C', 'R1608C_D', 'R1608C_E', 'R1608C_F', 'R1609C', 'R1609C_A', 'R1609C_B', 'R1609C_C', 'R1609C_D', 'R1609C_E', 'R1609C_F']].isnull().sum()[lambda x: x > 0]

R1607B    16669
R1608C    21659
R1609C    48158
dtype: int64

In [107]:
#  drop column yang tidak terpakai
df.drop(columns=['R1607B', 'R1608C', 'R1609C'], inplace=True)

### kolom R702B (data penyakit)

In [108]:
import pandas as pd

# Kategori penyakit yang sudah disiapkan
categories = {
    "Penyakit Jantung dan Pembuluh Darah": ["sering pingsan tiba-tiba","pembekakan pembuluh darah","penyumbatan pembuluh darah","tensi","darah rendah","drop darah","penyempitan pembuluh darah","tensi tinggi","kurtosis aorta", "jantung", "koroner", "hipertensi", "tekanan darah tinggi","darah tinggi", "aritmia", "gagal jantung","penyumbatan darah"],
    "Penyakit Neurodegeneratif": ["alzheimer", "amyotrophic lateral sclerosis", "dimensia", "huntington", "parkinson"],
    "Penyakit Sistem Pernafasan": ["paru","afses paru","paru-paru","rinitis","faringitis","meniere syndrome","ispa","radang nasofaring","osa","nafas pendek","alergi dingin","tb","infeksi hidung","alergi asap","radang tenggorokan","covid","saluran pernafasan","peradangan paru","alergi debu","penyempitan pernafasan","tbc","sinusitis","polip", "amandel","step","tipes","sesak","sinositis","sesak nafas","asma", "bronkitis", "pneumonia", "copd"],
    "Penyakit Metabolik": ["kekurangan kalium","anemia","kencing manis","lemak darah","diabetes", "diabetes melitus", "kolesterol", "asam urat", "gout", "hiperlipidemia","gula darah","gula"],
    "Penyakit Tulang, Otot, dan Sendi": ["bengkak otot","kaki","avn","hnp","luka di kaki","kaki  terluka","pincang","otot bahu","kaki terluka","kolagen","cacat kaki","radang otot kaki","osteoarthrotis","punggung","kaki mudah merasa lemas","kaki selalu terasa lemas","hernea","sendi","gangguan persendian","kelelahan otot", "di bagin kaki","kesulitan untuk berjalan", "cacat pincang", "osteoritis","radang sendi","lutut","pengapuran","pinggang", "tulang", "hernia","patah kaki","osteoporosis", "artritis", "skoliosis", "fibromyalgia", "rematik"],
    "Penyakit Kanker dan Tumor": ["pembengkakan payudara","leukimia","kangker payudara","kista","kanker", "melanoma", "tumor","kangker perut"],
    "Penyakit Sistem Pencernaan": ["bagian perut","perut bengkak ( ada genangan air dlm perut )","pencernaan","berak darah","muntah darah","gangguan pencernaan","infeksi saluran pencernaan","empedu","refluks gastroesofagus","usus turung","pencernaan usus","gangguan empedu","kram usus", "liver","susah buang air besar", "radang usus", "empedu mengandung lemak dan lumpur","alergi saluran pencernaan","infeksi usus","tifus","peti liver","kuning","ambeyen", "hati","asam lambung","lambung", "maag", "gastritis", "batu empedu", "hepatitis", "ibs","usus buntu","gangguan usus"],
    "Penyakit Mata": ["ablosio retina", "tuna netra", "mata","katarak", "glaukoma", "konjungtivitis", "degenerasi makula", "buta","rabun"],
    "Penyakit Ginjal dan Saluran Kemih": ["kencing tidak lancar","infeksi saluran kemih","radang kencing","penyempitan saluran ginjal","gangguan saluran kencing","kencing batu","kencing frostat","ginjal", "batu ginjal", "isk", "gagal ginjal", "nefritis","infeksi saluran kencing"],
    "Penyakit Sistem Saraf dan Kejiwaan": ["pencairan di otak","jiwa","meningitis","tremor","gangguan jiwa","bisu","kejang","saraf terjepit","tekanan jiwa","kepala", "migrain","insomnia", "encephalitis","susah tidur", "vertigo","saraf","penyempitan saraf","epilepsi", "skizofrenia", "stroke", "depresi","mental","lumpuh","kejiwaan"],
    "Lainnya": ["kurang kalsium", "theroid","tiroid","teroid","hiperteroid","kelenjar","gondok","gondog","komplikasi","malaria","prostat","tiroit","autoimun","infeksi kulit yg parah","ada miom di rahim","keguguran","kelainan fisik","telinga pokak","kurang pendengaran","kelenjar leher","kaki bengkak spt kaki gajah  hanya saja bukan  kaki gajah","radang payudara","kulit","trauma menyeluruh","endometriosis","beri-beri","endometriosis","benjolan di leher","benjolan dileher","batuk","endrometriosis","prostat","porstat","tahunan","sel darah putih","miom habis operasi 6 bulan yllu","exim","maaf kronis","eksim","demam panas","dalam","terahum","pengangkatan rahim","kehamilan abdomen","kusta basah","gatal-gatal yang tidak bisa sembuh","virus tokso dinyatakan oleh dokter bahaya jika kalau hamil","hidroteroit","bulanan tidak normal","miyoma","miyoum","mioma","hypertirod","alergi semen","alergi","takanan","kronis","typoid","miom","kelebihan hormon","tekanan","pengobatan 6 bulan","payudara","gatal gatal","tidak bisa jalan","tua tdk bs jalan","dikarenakan umur yg sudah tua"],
}

# Fungsi memuat peta kesalahan ketik (typo_map)
def load_typo_map(file_path):
    typo_map = {}
    with open(file_path, 'r') as file:
        for line in file:
            typo, correct = line.strip().split('|')
            typo_map[typo] = correct
    return typo_map

# Muat peta kesalahan ketik
typo_map = load_typo_map('datasets/typo_map.txt')

# Fungsi normalisasi teks dengan memperbaiki typo menggunakan typo_map
def normalize_text(text, typo_map):
    # Pertama, ganti typo berdasarkan typo_map
    text = text.lower()
    for typo, correct in typo_map.items():
        text = text.replace(typo, correct)
    
    # Lakukan normalisasi lainnya
    return text.replace(" dan ", ",").replace("&", ",").replace("/", ",").replace(".", " ").strip()

# Fungsi pemrosesan baris data untuk kategorisasi penyakit
def process_row(entry, typo_map, categories):
    # Split berdasarkan koma
    # diseases = entry.split(",")
    
    # Normalisasi tiap penyakit
    # normalized_diseases = [normalize_text(disease.strip(), typo_map) for disease in diseases]
    normalized_entry = normalize_text(entry, typo_map)
    normalized_diseases = [disease.strip() for disease in normalized_entry.split(",")]

    
    # Pemetaan kategori untuk setiap penyakit
    result = {category: 0 for category in categories.keys()}  # Default 0 (tidak)
    result['normalize_R702B'] = normalized_entry
    for disease in normalized_diseases:
        for category, keywords in categories.items():
            if any(keyword in disease for keyword in keywords):
                result[category] = 1  # 1 (ya)
    
    return result

# Fungsi utama untuk memproses dan menambahkan hasil ke df_combined
def process_and_add_to_df(df_combined, typo_map, categories):
    results = []

    # Proses setiap baris pada df_combined berdasarkan 'R702A' == "1"
    for entry in df_combined.loc[df_combined['R702A'] == "1", 'R702B']:
        result = process_row(entry, typo_map, categories)
        results.append(result)

    # Buat DataFrame hasil
    df_results = pd.DataFrame(results)
    # print(df_results.isna().sum())

    # Kolom yang akan ditambahkan ke df_combined adalah hasil kategori R702A1 hingga R702A11
    df_combined_results = df_combined.loc[df_combined['R702A'] == "1"].copy()

    # Menambahkan hasil kategori ke df_combined_results
    print(df_results.isna().sum())
    df_combined_results = df_combined.loc[df_combined['R702A'] == "1"].copy().reset_index(drop=True)
    df_results = df_results.reset_index(drop=True)
    for i, column in enumerate(df_results.columns):
        new_column_name = f"R702B{i+1}"
        df_combined_results[new_column_name] = df_results[column]

    print(df_combined_results.isna().sum())
    # Menambahkan kolom R702B1 hingga R702B11 dengan nilai default 2 untuk baris lainnya
    for i in range(1, len(categories.keys()) + 1):
        column_name = f"R702B{i}"
        if column_name not in df_combined.columns:
            df_combined[column_name] = 0

    # Gabungkan kembali dengan data df_combined
    df_combined_final = pd.concat([df_combined[df_combined['R702A'] != "1"], df_combined_results])

    # Pastikan tidak ada NaN dengan menggantinya menjadi nilai default
    # df_combined_final.fillna(2, inplace=True)

    return df_combined_final



# Proses dan tambahkan ke df_combined
df = process_and_add_to_df(df, typo_map, categories)

Series([], dtype: float64)
index_k     0
R501        0
R502        0
R503        0
R504A       0
           ..
R1609C_B    0
R1609C_C    0
R1609C_D    0
R1609C_E    0
R1609C_F    0
Length: 272, dtype: int64


In [109]:
df[['R702B', 'R702B1', 'R702B2', 'R702B3', 'R702B4', 'R702B5', 'R702B6', 'R702B7', 'R702B8', 'R702B9', 'R702B10', 'R702B11']].head()


Unnamed: 0,R702B,R702B1,R702B2,R702B3,R702B4,R702B5,R702B6,R702B7,R702B8,R702B9,R702B10,R702B11
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
missing_values = df[['R702B1', 'R702B2', 'R702B3', 'R702B4', 'R702B5', 'R702B6', 'R702B7', 'R702B8', 'R702B9', 'R702B10', 'R702B11']].isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values

Series([], dtype: int64)

In [111]:
# drop column yang tidak terpakai
df.drop(columns=['R702B'], inplace=True)

## handling data kategorical numerical

isi semua nilai kosong karena sparse dengan nilai -1

### R601B1, R601B2 => Jika R601A = 1

R601B2 tidak perlu di perbaiki karena sudah dipakai untuk perhitungan target dan tidak di gunakan sebagai feature

In [112]:
print("Jumlah nilai NaN di R601B1:", df['R601B1'].isna().sum())

Jumlah nilai NaN di R601B1: 53383


In [113]:
#  Mengisi nilai kolom R601B1 jika R601A bernilai 4 (Berkerja)
df.loc[df['R601A'] == 1, ['R601B1']] = [4] 

In [114]:
df[['R601A','R601B1']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R602A1,R602A2,R602B,R603,R604A,R604B,R604C,R604D,R604E,R604F,R604G,R605 => Jika R601A = 2

Jika R601A bernilai 2 (Tidak), "R602A1","R602A2","R602B","R603","R604A","R604B","R604C","R604D","R604E","R604F","R604G","R605" Tidak Ada Nilai-nya

R605 tidak perlu di perbaiki karena sudah dipakai untuk perhitungan target dan tidak di gunakan sebagai feature

Isi nilai "R602A1","R602A2","R602B","R603","R604A","R604B","R604C","R604D","R604E","R604F","R604G" dengan -1 (tidak berlaku karena kondisi)


In [115]:
df.loc[df['R601A'] == 2, ['R602A1','R602A2','R602B','R603','R604A','R604B','R604C','R604D','R604E','R604F','R604G']] = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]

In [116]:
df[['R602A1','R602A2','R602B','R603','R604A','R604B','R604C','R604D','R604E','R604F','R604G']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R609C => Jika R609B != 3

In [117]:
df.loc[df['R609B'] != 3, ['R609C']] = [-1]
df[['R609C']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R701B => Jika R701A = 1

R701B diisi dengan nilai 1, karena R701A 1 = tidak ada keluhan maka R701B 1 = tidak terganggu karena  [???]

In [118]:
df.loc[df['R701A'] == 1, ['R701B']] = [1]
df[['R701B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R703A1, R703A2 => Jika R701A berkode 1 dan R702A berkode 2

R703A1 di isi 2, karena tidak pernah rawat inap karena tidak pernah sakit atau ada keluah [???]

R703A1 di isi 4, karena tidak pernah rawat jalan karena tidak pernah sakit atau ada keluah [???]

In [119]:
df[['R703A1','R703A2']].isnull().sum()[lambda x: x > 0]

R703A1    24978
R703A2    24978
dtype: int64

In [120]:
df.loc[(df['R701A']==1) & (df['R702A']==2), ['R701A', 'R702A', 'R703A1','R703A2']].shape

(24978, 4)

In [121]:
df.loc[(df['R701A']==1) & (df['R702A']==2), ['R703A1','R703A2','R703B']] = [2,4,-1]

In [122]:
df[['R703A1','R703A2']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R703B => Jika R701A berkode 2, 3, 4 atau R702A berkode 1, atau R703A1 = 2 dan R703A2 = 2

In [123]:
df[['R703B']].isnull().sum()[lambda x: x > 0]

R703B    16297
dtype: int64

In [124]:
df.loc[(df['R703A1'] == 1) | (df['R703A2'] == 3) , ['R703A1','R703A2','R703B']]

Unnamed: 0,R703A1,R703A2,R703B
26,2.0,3.0,
28,1.0,4.0,
30,1.0,3.0,
32,2.0,3.0,
50,2.0,3.0,
...,...,...,...
74669,2.0,3.0,
74670,1.0,3.0,
74672,2.0,3.0,
74677,1.0,3.0,


In [125]:
df.loc[(df['R703A1'] == 1) | (df['R703A2'] == 3), ['R703B']] = [-1]

In [126]:
df.loc[df['R703B'].isnull(), ['R701A', 'R702A','R703A1', 'R703A2', 'R703B']]

Unnamed: 0,R701A,R702A,R703A1,R703A2,R703B
36362,3,2,2.0,4.0,
72943,2,2,2.0,4.0,


In [127]:
# isi 1 mising value pada R703B tersisa dengan nilai terbanyak
df['R703B'].fillna(df['R703B'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['R703B'].fillna(df['R703B'].mode()[0], inplace=True)


In [128]:
df[['R703B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R705B1, R705B2, R705B3, R705B4, R705B5, R705B6 => Jika R705A = 2

In [129]:
df.loc[df['R705A'] == 2, ['R705B1', 'R705B2', 'R705B3', 'R705B4', 'R705B5', 'R705B6']] = [-1,-1,-1,-1,-1,-1]

In [130]:
df[['R705B1', 'R705B2', 'R705B3', 'R705B4', 'R705B5', 'R705B6']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R902B1, R902B2, R902B3, R902B4, R902B5, R902C => Jika R902A = 00

In [131]:
df.loc[df['R902A'] == 00, ['R902B1', 'R902B2', 'R902B3', 'R902B4', 'R902B5', 'R902C']] = [-1,-1,-1,-1,-1,-1]

In [132]:
df[['R902B1', 'R902B2', 'R902B3', 'R902B4', 'R902B5', 'R902C']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1601A, R1601B => Jika R1206A = 3

In [133]:
df[['R1601A','R1601B']].isnull().sum()[lambda x: x > 0]

R1601A    1067
R1601B    1067
dtype: int64

In [134]:
df[['R1601A','R1601B']].describe()

Unnamed: 0,R1601A,R1601B
count,73617.0,73617.0
mean,6.732793,5.146746
std,3.536346,3.945203
min,1.0,0.0
25%,4.0,2.0
50%,6.0,5.0
75%,8.0,7.0
max,50.0,49.0


In [135]:
df.loc[df['R1206A'] == 3, ['R1601A','R1601B']] = [0,0]

In [136]:
df[['R1601A','R1601B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1604B => Jika R1604A = 2

In [137]:
df[['R1604B']].isnull().sum()[lambda x: x > 0]

R1604B    13133
dtype: int64

In [138]:
df.loc[df['R1604A'] == 2, ['R1604B']] = [1]

In [139]:
df[['R1604B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1608B => Jika R1608A = 1

In [140]:
df[['R1608B']].isnull().sum()[lambda x: x > 0]

R1608B    53025
dtype: int64

In [141]:
df.loc[df['R1608A'] == 1, ['R1608B']] = [-11]

In [142]:
df[['R1608B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1609B1, R1609B2, R1609B3, R1609B4, R1609B5, R1609B6 => Jika R1609A = 2

In [143]:
df[['R1609A','R1609B1','R1609B2','R1609B3','R1609B4','R1609B5','R1609B6']].isnull().sum()[lambda x: x > 0]

R1609B1    25207
R1609B2    25207
R1609B3    25207
R1609B4    25207
R1609B5    25207
R1609B6    25207
dtype: int64

In [144]:
df.loc[df['R1609A'] == 2, ['R1609A','R1609B1','R1609B2','R1609B3','R1609B4','R1609B5','R1609B6']] = [-1,-1,-1,-1,-1,-1,-1]

In [145]:
df[['R1609A','R1609B1','R1609B2','R1609B3','R1609B4','R1609B5','R1609B6']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1613B => Jika R1613A != 1

In [146]:
df[['R1613B']].isnull().sum()[lambda x: x > 0]

R1613B    51880
dtype: int64

In [147]:
df.loc[df['R1613A'].isin([2,3]), ['R1613B']] = [-1]

In [148]:
df[['R1613B']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

### R1613C => Jika R1613A != 2

In [149]:
df[['R1613C']].isnull().sum()[lambda x: x > 0]

R1613C    27414
dtype: int64

In [150]:
df.loc[df['R1613A'].isin([1,3]), ['R1613C']] = [-1]

In [151]:
df[['R1613C']].isnull().sum()[lambda x: x > 0]

Series([], dtype: int64)

## Sisa Missing Value

In [152]:
df['R1103BK2']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
74679   NaN
74680   NaN
74681   NaN
74682   NaN
74683   NaN
Name: R1103BK2, Length: 74684, dtype: float64

In [153]:
missing = df.isnull().sum()
print("\nJumlah Missing Values:")
print(missing[missing > 0])


Jumlah Missing Values:
R1103BK2    74499
R1103BK3    74653
R1103BK4    74493
R1103BK5    74478
R1103BK6    73415
R1103BK7    74230
R1103CK2    74499
R1103CK3    74653
R1103CK4    74493
R1103CK5    74478
R1103CK6    73415
R1103CK7    74230
R1103DK2    74655
R1103DK3    74681
R1103DK4    74656
R1103DK5    74668
R1103DK6    74608
R1103DK7    74653
R1103EK2    74499
R1103EK3    74653
R1103EK4    74493
R1103EK5    74478
R1103EK6    73415
R1103EK7    74230
R1103FK2    74499
R1103FK3    74653
R1103FK4    74493
R1103FK5    74478
R1103FK6    73415
R1103FK7    74230
R1103GK2    74499
R1103GK3    74653
R1103GK4    74493
R1103GK5    74478
R1103GK6    73415
R1103GK7    74230
R1103HK2    74499
R1103HK3    74653
R1103HK4    74493
R1103HK5    74478
R1103HK6    73415
R1103HK7    74230
R1103IK2    74499
R1103IK3    74653
R1103IK4    74493
R1103IK5    74478
R1103IK6    73415
R1103IK7    74230
R1103JK2    74546
R1103JK3    74656
R1103JK4    74516
R1103JK5    74606
R1103JK6    73537
R1103JK7    74292
dtyp

In [154]:
# missing values yang tersisa di drop kolumnya karena hampie 100% dari total data hilang
df.drop(columns=['R1103AK2','R1103BK2','R1103CK2','R1103DK2','R1103EK2','R1103FK2','R1103GK2','R1103HK2','R1103IK2','R1103JK2','R1103AK3','R1103BK3','R1103CK3','R1103DK3','R1103EK3','R1103FK3','R1103GK3','R1103HK3','R1103IK3','R1103JK3','R1103AK4','R1103BK4','R1103CK4','R1103DK4','R1103EK4','R1103FK4','R1103GK4','R1103HK4','R1103IK4','R1103JK4','R1103AK5','R1103BK5','R1103CK5','R1103DK5','R1103EK5','R1103FK5','R1103GK5','R1103HK5','R1103IK5','R1103JK5','R1103AK6','R1103BK6','R1103CK6','R1103DK6','R1103EK6','R1103FK6','R1103GK6','R1103HK6','R1103IK6','R1103JK6','R1103AK7','R1103BK7','R1103CK7','R1103DK7','R1103EK7','R1103FK7','R1103GK7','R1103HK7','R1103IK7','R1103JK7'], inplace=True)

In [155]:
missing = df.isnull().sum()
print("\nJumlah Missing Values:")
print(missing[missing > 0])


Jumlah Missing Values:
Series([], dtype: int64)


## Standarisasi nilai Biner

Standarisasi untuk nilai biner yang 1 = ya dan 2 = tidak, menjadi 0 = tidak 

In [156]:
def get_columns_with_specific_values(df):
    valid_columns = []
    allowed_values = {1, 2, -1}

    for column in df.columns:
        unique_values = set(df[column].unique())
        if unique_values.issubset(allowed_values):
            valid_columns.append(column)
    return valid_columns

# Contoh penggunaan:
# Buat DataFrame contoh
data = {'kolomA': [1, 2, 1, -1],
        'kolomB': [1, 2, 3, 4],
        'kolomC': [-1, 2, 2, 1],
        'kolomD': [1, 1, 1, 1],
        'kolomE': [2, 1, 2, 1]}
df_contoh = pd.DataFrame(data)

# Dapatkan kolom yang memenuhi kriteria
kolom_hasil = get_columns_with_specific_values(df)
print(f"Kolom yang hanya berisi nilai 1, 2, dan/atau -1 adalah: {kolom_hasil}")

Kolom yang hanya berisi nilai 1, 2, dan/atau -1 adalah: ['R502', 'R503', 'R601A', 'R602B', 'R604A', 'R604B', 'R604C', 'R604D', 'R604E', 'R604F', 'R604G', 'R608', 'R702A', 'R703A1', 'R705A', 'R705B1', 'R705B2', 'R705B3', 'R705B4', 'R705B5', 'R705B6', 'R902B1', 'R902B2', 'R902B3', 'R902B4', 'R902B5', 'R1101B1', 'R1101B2', 'R1101B3', 'R1210A', 'R1210B', 'R1210C', 'R1210D', 'R1210E', 'R1409B', 'R1604A', 'R1608A', 'R1609A', 'R1609B1', 'R1609B2', 'R1609B3', 'R1609B4', 'R1609B5', 'R1609B6', 'R1613B']


In [157]:
# df["R1101A1"].describe()
for kolom in df[["R1101A1","R1101A2", "R1101A3", "R1101A4", "R1101A5"]].columns:
    print(f"\nKolom: {kolom}")
    print(df[kolom].value_counts())
    print("---")


Kolom: R1101A1
R1101A1
2    64768
1     6959
9     2957
Name: count, dtype: int64
---

Kolom: R1101A2
R1101A2
2    65587
1     6153
9     2944
Name: count, dtype: int64
---

Kolom: R1101A3
R1101A3
2    66626
1     4887
9     3171
Name: count, dtype: int64
---

Kolom: R1101A4
R1101A4
2    70415
9     3131
1     1138
Name: count, dtype: int64
---

Kolom: R1101A5
R1101A5
2    67990
1     3669
9     3025
Name: count, dtype: int64
---


In [158]:
# Kolom yang ingin diproses
cols = ["R1101A1", "R1101A2", "R1101A3", "R1101A4", "R1101A5"]

# Fungsi untuk mengganti nilai 9 dengan modus pada tiap kolom
for col in cols:
    modus = df.loc[df[col] != 9, col].mode()[0]
    df[col] = df[col].replace(9, modus)

In [159]:
# df["R1101A1"].describe()
for kolom in df[["R1101A1","R1101A2", "R1101A3", "R1101A4", "R1101A5"]].columns:
    print(f"\nKolom: {kolom}")
    print(df[kolom].value_counts())
    print("---")


Kolom: R1101A1
R1101A1
2    67725
1     6959
Name: count, dtype: int64
---

Kolom: R1101A2
R1101A2
2    68531
1     6153
Name: count, dtype: int64
---

Kolom: R1101A3
R1101A3
2    69797
1     4887
Name: count, dtype: int64
---

Kolom: R1101A4
R1101A4
2    73546
1     1138
Name: count, dtype: int64
---

Kolom: R1101A5
R1101A5
2    71015
1     3669
Name: count, dtype: int64
---


In [160]:
# pada kolom dibawah nilai 2 diganti dengan 0
kolom_terdampak = [
  "R502", "R503", "R601A", "R602B", "R604A", "R604B", "R604C", "R604D", "R604E", "R604F", "R604G", "R608",
  "R702A", "R703A1", "R703A2", "R705A", "R705B1", "R705B2", "R705B3", "R705B4", "R705B5", "R705B6", "R902B1", "R902B2", "R902B3", "R902B4", "R902B5", "R1101A1","R1101A2", "R1101A3", "R1101A4", "R1101A5", "R1101B1","R1101B2", "R1101B3", "R1210A", "R1210B", "R1210C", "R1210D", "R1210E", "R1409B", "R1604A", "R1608A","R1609A", "R1609B1", "R1609B2", "R1609B3", "R1609B4", "R1609B5", "R1609B6", "R1613B"
]
for kolom in kolom_terdampak:
    df[kolom] = df[kolom].replace(2, 0)


In [161]:
for col in kolom_terdampak:
    print(f"\nKolom: {col}")
    print(df[col].value_counts(dropna=False))


Kolom: R502
R502
0    63951
1    10733
Name: count, dtype: int64

Kolom: R503
R503
0    40129
1    34555
Name: count, dtype: int64

Kolom: R601A
R601A
1    53383
0    21301
Name: count, dtype: int64

Kolom: R602B
R602B
 0.0    38709
-1.0    21301
 1.0    14674
Name: count, dtype: int64

Kolom: R604A
R604A
 1.0    41777
-1.0    21301
 0.0    11606
Name: count, dtype: int64

Kolom: R604B
R604B
 1.0    40560
-1.0    21301
 0.0    12823
Name: count, dtype: int64

Kolom: R604C
R604C
 1.0    37056
-1.0    21301
 0.0    16327
Name: count, dtype: int64

Kolom: R604D
R604D
 1.0    48652
-1.0    21301
 0.0     4731
Name: count, dtype: int64

Kolom: R604E
R604E
 1.0    39119
-1.0    21301
 0.0    14264
Name: count, dtype: int64

Kolom: R604F
R604F
 1.0    35115
-1.0    21301
 0.0    18268
Name: count, dtype: int64

Kolom: R604G
R604G
 1.0    48986
-1.0    21301
 0.0     4397
Name: count, dtype: int64

Kolom: R608
R608
0    63688
1    10996
Name: count, dtype: int64

Kolom: R702A
R702A
0    68457

## 

## ...

# Modeling and Feature Importance

In [169]:
df.describe()

Unnamed: 0,index_k,R501,R502,R503,R504A,R504B,R504C,R601A,R601B1,R602A1,...,R702B2,R702B3,R702B4,R702B5,R702B6,R702B7,R702B8,R702B9,R702B10,R702B11
count,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,...,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0,74684.0
mean,66.410889,4.006615,0.143712,0.462683,1.111831,1.933774,1.626105,0.714785,3.195289,3.603048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,6.940839,1.956597,0.3508,0.498609,0.393634,0.932984,0.903802,0.45152,1.308585,5.421813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,9.48,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,62.16,3.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,66.19,4.0,0.0,0.0,1.0,2.0,1.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,70.5,5.0,0.0,1.0,1.0,3.0,2.0,1.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,100.0,10.0,1.0,1.0,4.0,4.0,4.0,1.0,4.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
# Fitur dan target
y = df['index_k']
sample_weight = df['WEIGHT']
sample_weight = sample_weight / np.mean(sample_weight)
X = df.drop(columns=['index_k', 'WEIGHT'])

In [164]:
X

Unnamed: 0,R501,R502,R503,R504A,R504B,R504C,R601A,R601B1,R602A1,R602A2,...,R702B2,R702B3,R702B4,R702B5,R702B6,R702B7,R702B8,R702B9,R702B10,R702B11
0,2,0,1,1,1,1,0,3.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0,1,1,1,1,1,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,1,1,1,1,0,1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,1,1,1,1,1,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,1,1,2,2,1,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74679,9,0,0,1,2,2,1,4.0,14.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74680,9,1,0,1,2,2,1,4.0,6.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74681,9,0,0,1,1,1,1,4.0,14.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74682,5,0,1,1,1,1,1,4.0,14.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Random Forest

In [165]:
rf_X_train, rf_X_test, rf_y_train, rf_y_test, weight_train, weight_test = train_test_split(
    X, y, sample_weight, test_size=0.3, random_state=42
)

In [166]:
rfr = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, min_samples_leaf=5)
rfr.fit(rf_X_train, rf_y_train, sample_weight=weight_train)

In [167]:
# --- 4. Prediksi pada Data Testing ---
rf_y_pred = rfr.predict(rf_X_test)

# --- 5. Evaluasi Model ---
mse = mean_squared_error(rf_y_test, rf_y_pred)
rmse = np.sqrt(mse) # Root Mean Squared Error
r2 = r2_score(rf_y_test, rf_y_pred)

print(f"\nMetrik Evaluasi Model:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


Metrik Evaluasi Model:
Mean Squared Error (MSE): 30.02
Root Mean Squared Error (RMSE): 5.48
R-squared (R2): 0.37


In [168]:
feature_importances = rfr.feature_importances_
feature_names = X.columns
# Pastikan feature_names sesuai dengan jumlah fitur di X
if len(feature_names) != len(feature_importances):
    # Jika Anda memiliki data riil, pastikan kolom X_train memiliki nama kolom
    # Atau gunakan indeks fitur sebagai nama sementara
    feature_names_display = [f'fitur_{i}' for i in range(len(feature_importances))]
else:
    feature_names_display = feature_names

importance_df = pd.DataFrame({'Feature': feature_names_display, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\n--- 50 Fitur Teratas Berdasarkan Importansi ---")
print(importance_df.head(50).to_string(index=False)) # .to_string(index=False) untuk membersihkan output




--- 50 Fitur Teratas Berdasarkan Importansi ---
 Feature  Importance
  R1003F    0.198249
  R1002A    0.062580
  R1002C    0.045190
  R1003D    0.036347
  R1003H    0.030066
  R1003G    0.026375
  R1002D    0.024373
  R1002E    0.020047
    R501    0.018515
  R1002J    0.018105
  R1409B    0.017126
  R1002F    0.017060
  R1003J    0.016230
   R609A    0.015983
  R1004C    0.014628
  R1003A    0.014251
  R1003I    0.013710
   R1202    0.013665
  R1004B    0.013121
    R606    0.012574
  R1003C    0.012533
  R1004F    0.011931
  R1002B    0.011118
  R1003E    0.010950
  R1605E    0.009865
   R902A    0.009399
  R1003B    0.007789
    R901    0.007620
  R1002I    0.007306
   R609C    0.006886
   R504C    0.006542
  R1004A    0.006470
  R1405C    0.005895
R1609C_C    0.005504
  R1002H    0.005415
    R603    0.005106
  R1004E    0.004984
   R1208    0.004949
  R1601A    0.004863
  R1004D    0.004853
  R1002G    0.004781
  R1605D    0.004625
  R1601B    0.004512
  R1607A    0.004340
   R60

## XGB