In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import f_regression, r_regression, mutual_info_regression

## Data Loading

In [2]:
# Load the dataset and discard the columns that are not used in this notebook.
df = (
    pd.read_parquet("../dataset/etl/L2.regression_inliers.parquet")
    .drop(columns=[
        "id", "installment", "description", "url",
        "last_modified_at", "scraped_at", "district", "city",
    ])
)
df.head()

Unnamed: 0,price,kamar_tidur,kamar_mandi,carport,sertifikat,kamar_pembantu,kamar_mandi_pembantu,dapur,jumlah_lantai,hadap,...,facility_mezzanine,facility_musholla,facility_one_gate_system,facility_parkir,facility_playground,facility_shed,facility_taman,facility_wastafel,facility_water_heater,facility_water_tank
0,5300.0,4.0,4.0,2.0,SHM - Sertifikat Hak Milik,1.0,1.0,2.0,2.0,Selatan,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,1200.0,4.0,4.0,2.0,HGB - Hak Guna Bangunan,0.0,0.0,1.0,2.0,Utara,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,4500.0,3.0,3.0,2.0,SHM - Sertifikat Hak Milik,3.0,1.0,1.0,3.0,Timur,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3190.0,4.0,4.0,2.0,SHM - Sertifikat Hak Milik,1.0,1.0,2.0,2.0,Timur,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6,380.0,2.0,2.0,1.0,HGB - Hak Guna Bangunan,0.0,0.0,1.0,1.0,Selatan,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [3]:
# Correlation table stored on disk: one row per variable with the method used,
# the coefficient `r` and its p-value.
df_corr = pd.read_csv(
    "../dataset/etl/L3.correlations_features.csv",
)
df_corr.head(5)

Unnamed: 0,variable,method,r,pvalue
0,tags_bisa_nego,pointbiser,-0.014723,0.04677036
1,tags_cash_bertahap,pointbiser,-0.236766,8.352566000000001e-231
2,tags_cash_keras,pointbiser,-0.14675,2.423117e-88
3,tags_dijual_cepat,pointbiser,0.016894,0.02250845
4,tags_komplek,pointbiser,-0.072232,1.553275e-22


In [4]:
# Target is the price column; predictors are every other numeric column.
y = df["price"]
X = df.select_dtypes(include="number").drop(columns="price")

## Correlation and Statistical Significance

In [5]:
def formatter_pvalue(x, alpha=0.05):
    """Return the CSS used to highlight a statistically significant p-value.

    Written as a cell-wise callback for ``Styler.map``; extra keyword
    arguments given to ``Styler.map`` (such as ``alpha``) are forwarded here.

    Parameters
    ----------
    x : float
        The p-value shown in the cell.
    alpha : float, default 0.05
        Significance level; values strictly below it are highlighted.

    Returns
    -------
    str or None
        ``"color: red"`` when ``x < alpha``, otherwise ``None`` (no styling).
        NaN compares false against any threshold, so it is left unstyled.
    """
    return "color: red" if x < alpha else None

In [6]:
# mutual_info_regression is stochastic (its result depends on `random_state`),
# so the seed is fixed to make the scores reproducible across kernel restarts.
RANDOM_STATE = 42

# f_regression returns a pair: (F-statistics, p-values), one entry per column of X.
freg = f_regression(X, y)
mutinfo = mutual_info_regression(X, y, random_state=RANDOM_STATE)

# One row per predictor, in the same order as X.columns.
df_corr_fmut = pd.DataFrame({
    "variable": X.columns,
    "f": freg[0],
    "f_pvalue": freg[1],
    "mutual_info": mutinfo,
})

df_corr_fmut.head()

Unnamed: 0,variable,f,f_pvalue,mutual_info
0,kamar_tidur,5476.104658,0.0,0.382793
1,kamar_mandi,8454.642064,0.0,0.434398
2,carport,713.79624,2.7559219999999998e-154,0.134977
3,kamar_pembantu,7069.68516,0.0,0.19903
4,kamar_mandi_pembantu,7006.238872,0.0,0.167048


In [7]:
# Combine the stored correlations with the F-test / mutual-information scores.
# The merge is an inner join on "variable"; |r| is appended as the last column.
df_corr_all = (
    df_corr
    .rename(columns={"pvalue": "r_pvalue"})
    .merge(df_corr_fmut, on="variable")
    .assign(r_abs=lambda frame: frame["r"].abs())
)

df_corr_all.head()

Unnamed: 0,variable,method,r,r_pvalue,r_abs,f,f_pvalue,mutual_info
0,tags_bisa_nego,pointbiser,-0.014723,0.04677036,0.014723,3.954168,0.04677036,0.03484
1,tags_cash_bertahap,pointbiser,-0.236766,8.352566000000001e-231,0.236766,1083.108155,8.352566000000001e-231,0.117955
2,tags_cash_keras,pointbiser,-0.14675,2.423117e-88,0.14675,401.408728,2.423117e-88,0.052534
3,tags_dijual_cepat,pointbiser,0.016894,0.02250845,0.016894,5.206934,0.02250845,0.0
4,tags_komplek,pointbiser,-0.072232,1.553275e-22,0.072232,95.65547,1.553275e-22,0.035749


In [8]:
# Rank by |r|, shade the three score columns, and colour the p-value cells
# through formatter_pvalue.
(
    df_corr_all
    .sort_values("r_abs", ascending=False)
    .style
    .background_gradient(subset=["r_abs", "f", "mutual_info"])
    .map(formatter_pvalue, subset=["r_pvalue", "f_pvalue"])
)

Unnamed: 0,variable,method,r,r_pvalue,r_abs,f,f_pvalue,mutual_info
71,luas_bangunan,pearson,0.623242,0.0,0.623242,11583.620629,0.0,1.134753
61,kamar_mandi,pearson,0.562797,0.0,0.562797,8454.642064,0.0,0.434398
63,kamar_pembantu,pearson,0.528535,0.0,0.528535,7069.68516,0.0,0.19903
64,kamar_mandi_pembantu,pearson,0.526819,0.0,0.526819,7006.238872,0.0,0.167048
72,daya_listrik,pearson,0.498559,0.0,0.498559,6032.771511,0.0,0.335311
60,kamar_tidur,pearson,0.480543,0.0,0.480543,5476.104658,0.0,0.382793
70,luas_tanah,pearson,0.44012,0.0,0.44012,4381.521157,0.0,1.173259
66,jumlah_lantai,pearson,0.38976,0.0,0.38976,3266.869641,0.0,0.165894
47,house_mat_bata_merah,pointbiser,0.32863,0.0,0.32863,2208.130967,0.0,0.14582
46,house_mat_bata_hebel,pointbiser,-0.280326,0.0,0.280326,1555.422586,0.0,0.135502


## Feature Cardinality

In [13]:
def cardinality_measure(series):
    """Return the Euclidean norm of the relative value frequencies of `series`.

    The relative frequency of every distinct value is taken from
    ``series.value_counts(normalize=True)`` and the result is
    ``sqrt(sum(p_i ** 2))``. It equals 1.0 when a single value accounts for
    every counted entry and shrinks as the entries spread over more distinct
    values.
    """
    proportions = series.value_counts(normalize=True).values
    squared_total = np.square(proportions).sum()
    return np.sqrt(squared_total)

In [14]:
# For every column of df: the share held by its most frequent value and the
# cardinality_measure score, collected into one row per column.
df_cat = (
    pd.DataFrame({
        "variable": df.columns,
        "max_category": df.apply(lambda col: col.value_counts(normalize=True).max()),
        "cardinality_measure": df.apply(cardinality_measure),
    })
    .reset_index(drop=True)
)

print("Total selected columns:", len(df_cat))
df_cat.sort_values("max_category").style.background_gradient()

Total selected columns: 81


Unnamed: 0,variable,max_category,cardinality_measure
0,price,0.024452,0.087199
17,luas_bangunan,0.083607,0.149537
16,luas_tanah,0.10795,0.176137
2,kamar_mandi,0.40011,0.542039
1,kamar_tidur,0.404715,0.540589
18,daya_listrik,0.4625,0.572439
19,lebar_jalan,0.466502,0.60871
67,facility_laundry,0.476809,0.632951
63,facility_keamanan,0.485088,0.615707
3,carport,0.494189,0.620037


## Select by R score and Cardinality

In [21]:
# Selection thresholds: keep features whose |r| exceeds R_ABS_MIN, plus features
# whose most frequent value has a relative frequency below MAX_CATEGORY_SHARE.
R_ABS_MIN = 0.2
MAX_CATEGORY_SHARE = 0.6

# The name column is selected by label rather than by position, so the result
# does not depend on how the columns of each frame happen to be ordered.
candidate_corr = df_corr_all.loc[df_corr_all["r_abs"] > R_ABS_MIN, "variable"].tolist()
# NOTE: the spelling `canidate_card` is kept because the following cell refers to it.
canidate_card = df_cat.loc[df_cat["max_category"] < MAX_CATEGORY_SHARE, "variable"].tolist()

In [27]:
# Union of both candidate lists, with the target column removed.
set(candidate_corr).union(canidate_card) - {"price"}

{'carport',
 'dapur',
 'daya_listrik',
 'facility_ac',
 'facility_keamanan',
 'facility_laundry',
 'facility_masjid',
 'house_mat_bata_hebel',
 'house_mat_bata_merah',
 'jumlah_lantai',
 'kamar_mandi',
 'kamar_mandi_pembantu',
 'kamar_pembantu',
 'kamar_tidur',
 'lebar_jalan',
 'luas_bangunan',
 'luas_tanah',
 'ruang_makan',
 'ruang_tamu',
 'tags_cash_bertahap',
 'tags_komplek',
 'tags_kpr',
 'tags_perumahan',
 'tahun_dibangun'}