## Import & Load Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew

df = pd.read_csv("train.csv")
dft = pd.read_csv("test.csv")

## EDA pre-Feature Engineering

In [None]:
#Pengaturan Tampilan Data
pd.set_option('display.max_columns', None)

In [48]:
#Cek Data yang Kosong
def missing(df) :
    missing = df.isnull().sum()
    percent = (df.isnull().sum()/len(df))*100
    missing_table = pd.DataFrame({
        'MISSING':missing,
        '%':percent
    })
    missing_table = missing_table[missing_table['MISSING'] > 0]
    missing_table['%'] = missing_table['%'].apply(lambda x: f"{x:.2f}%")
    missing_table = missing_table.sort_values(by='%', ascending=False)
    print(missing_table.round(2))
    print("\n")
    cols_with_missing = missing_table.index.tolist()
    print("Kolom yang punya missing values:", cols_with_missing)
missing(df)
missing(dft)
    


Empty DataFrame
Columns: [MISSING, %]
Index: []


Kolom yang punya missing values: []
              MISSING      %
Utilities           2  0.14%
BsmtFullBath        2  0.14%
BsmtHalfBath        2  0.14%
Functional          2  0.14%
Exterior1st         1  0.07%
Exterior2nd         1  0.07%
BsmtFinSF1          1  0.07%
BsmtFinSF2          1  0.07%
BsmtUnfSF           1  0.07%
TotalBsmtSF         1  0.07%
KitchenQual         1  0.07%
GarageCars          1  0.07%
SaleType            1  0.07%


Kolom yang punya missing values: ['Utilities', 'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'KitchenQual', 'GarageCars', 'SaleType']


In [49]:
#control panel
dft[dft['Utilities'].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
455,1916,30,RL,109.0,21780,Grvl,0,3,Lvl,,Inside,2,IDOTRR,Norm,Norm,1Fam,1Story,2,4,1910,1950,Gable,CompShg,Wd Sdng,Wd Sdng,BrkFace,0.0,2,2,CBlock,0,0,0,0,0.0,0,0.0,0.0,0.0,GasA,3,N,FuseA,810,0,0,810,0.0,0.0,1,0,1,1,3.0,4,6.0,0,0,Detchd,1975.0,1,1.0,280.0,3,3,0,119,24,0,0,0,0,0,0,,0,3,2009,ConLD,Normal
485,1946,20,RL,73.0,31220,Pave,0,2,Bnk,,FR2,2,Gilbert,Feedr,Norm,1Fam,1Story,6,2,1952,1952,Hip,CompShg,BrkFace,BrkFace,BrkFace,0.0,3,3,CBlock,3,3,1,1,0.0,1,0.0,1632.0,1632.0,GasA,3,Y,FuseA,1474,0,0,1474,0.0,0.0,1,0,3,1,3.0,7,5.0,2,4,Attchd,1952.0,1,2.0,495.0,3,3,2,0,0,144,0,0,0,0,0,Shed,750,5,2008,WD,Normal


In [None]:
#Reset Pengaturan Tampilan Data
pd.reset_option('display.max_rows')

In [None]:
#Visualisasi Outlier (LotFrontage, MasVnrArea)
sns.boxplot(y='LotFrontage', data=df)
plt.show()
sns.boxplot(y='MasVnrArea', data=df)
plt.show()

In [None]:
#Visualisasi Outlier (GrLivArea vs SalePrice)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['GrLivArea'], y=df['SalePrice'])
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.title('Deteksi Outlier GrLivArea vs SalePrice')
plt.show()

## Hapus Data Ga Guna dan Outlier

In [None]:

outlier_indices = df[(df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)].index
print(f"Dropping {len(outlier_indices)} outlier(s)...")
df = df.drop(outlier_indices)
df = df.reset_index(drop=True)

## Pengisian Data yang Kosong

In [None]:
#isi dengan modus
cols_fillna_modus = ['MasVnrType', 'Electrical', 'MSZoning']
for col in cols_fillna_modus:
    df[col].fillna(df[col].mode()[0], inplace=True)
    dft[col].fillna(df[col].mode()[0], inplace=True)

#isi dengan median
median_masvnrarea = df['MasVnrArea'].median()
df['MasVnrArea'] = df['MasVnrArea'].fillna(median_masvnrarea)
dft['MasVnrArea'] = dft['MasVnrArea'].fillna(median_masvnrarea)

#isi dengan none
cols_fill_none = [
    'PoolQC', 'GarageQual', 'GarageCond', 'GarageType',
    'GarageFinish', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'MiscFeature', 'FireplaceQu'
]
df[cols_fill_none] = df[cols_fill_none].fillna('None')
dft[cols_fill_none] = dft[cols_fill_none].fillna('None')

#isi dengan 0
cols_fill_0 = ['GarageYrBlt', 'GarageArea', 'PoolArea','MasVnrType', 'MasVnrArea', 'Utilities']
df[cols_fill_0] = df[cols_fill_0].fillna('0')
dft[cols_fill_0] = dft[cols_fill_0].fillna('0')

#isi dengan median pergrup
LotFrontage_median = df.groupby('Neighborhood')['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(df['Neighborhood'].map(LotFrontage_median))
dft['LotFrontage'] = dft['LotFrontage'].fillna(df['Neighborhood'].map(LotFrontage_median))




## Encoding

In [None]:
#Ordinal
qual_cond_map = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'None': 0 
}
qual_cond_cols = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 
    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
]
for col in qual_cond_cols:
    df[col] = df[col].map(qual_cond_map)
    dft[col] = dft[col].map(qual_cond_map)

bsmt_exp_map = {
    'Gd': 4,
    'Av': 3,
    'Mn': 2,
    'No': 1,
    'None': 0
}
df['BsmtExposure'] = df['BsmtExposure'].map(bsmt_exp_map)
dft['BsmtExposure'] = dft['BsmtExposure'].map(bsmt_exp_map)

bsmt_fin_map = {
    'GLQ': 6,
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1,
    'None': 0
}
df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmt_fin_map)
dft['BsmtFinType1'] = dft['BsmtFinType1'].map(bsmt_fin_map)
df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmt_fin_map)
dft['BsmtFinType2'] = dft['BsmtFinType2'].map(bsmt_fin_map)

garage_fin_map = {
    'Fin': 3,
    'RFn': 2,
    'Unf': 1,
    'None': 0
}
df['GarageFinish'] = df['GarageFinish'].map(garage_fin_map)
dft['GarageFinish'] = dft['GarageFinish'].map(garage_fin_map)

lotshape_fin_map = {
    'IR3' : 0,
    'IR2' : 1,
    'IR1' : 2,
    'Reg' : 3
}
df['LotShape'] = df['LotShape'].map(lotshape_fin_map)
dft['LotShape'] = dft['LotShape'].map(lotshape_fin_map)

landslope_fin_map = {
    'Gtl' : 2,
    'Mod' : 1,
    'Sev' : 0
}
df['LandSlope'] = df['LandSlope'].map(landslope_fin_map)
dft['LandSlope'] = dft['LandSlope'].map(landslope_fin_map)

utilities_map = {
    'AllPub': 3,
    'NoSewr': 2,
    'NoSeWa': 1,
    'ELO': 0
}
df['Utilities'] = df['Utilities'].map(utilities_map)
dft['Utilities'] = dft['Utilities'].map(utilities_map)

paveddrive_map = {
    'Y': 2,
    'P': 1,
    'N': 0
}
df['PavedDrive'] = df['PavedDrive'].map(paveddrive_map)
dft['PavedDrive'] = dft['PavedDrive'].map(paveddrive_map)

functional_map = {
    'Typ': 7,
    'Min1': 6,
    'Min2': 5,
    'Mod': 4,
    'Maj1': 3,
    'Maj2': 2,
    'Sev': 1, 
    'Sal': 0
}
df['Functional'] = df['Functional'].map(functional_map)
dft['Functional'] = dft['Functional'].map(functional_map)

fence_map = {
    'GdPrv': 4,
    'MnPrv': 3,
    'GdWo': 2,
    'MnWw': 1,
    0: 0
}
df['Fence'] = df['Fence'].fillna(0)
dft['Fence'] = dft['Fence'].fillna(0)
df['Fence'] = df['Fence'].map(fence_map)
dft['Fence'] = dft['Fence'].map(fence_map)

alley_map = {
    'Grvl': 1,
    'Pave': 2,
    0: 0
}
df['Alley'] = df['Alley'].fillna(0)
dft['Alley'] = dft['Alley'].fillna(0)
df['Alley'] = df['Alley'].map(alley_map)
dft['Alley'] = dft['Alley'].map(alley_map)


In [None]:
#Categorical
nominal_cols = [
    'MSZoning', 'Alley', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
    'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition'
]
df = pd.get_dummies(df, columns=nominal_cols)
dft = pd.get_dummies(dft, columns=nominal_cols)

## EDA

In [None]:
# HeatMap
df_num = df.select_dtypes(include=['int64', 'float64'])
corr = df_num.corr()
plt.figure(figsize=(21, 22))
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
#Visualisasi Hubungan GarageCars, GarageArea, dan SalePrice dalam Scatterplot Matrix.
sns.pairplot(df[['GarageCars', 'GarageArea', 'SalePrice']], hue='SalePrice',palette="viridis")
plt.show()

## Transformasi dan Scaling

In [None]:
#Log Transform SalePrice
df['SalePrice'] = np.log1p(df['SalePrice'])

#Ambil semua kolom numerik dan log yang skewed
numeric_cols = df.select_dtypes(include = [np.number]).columns
numeric_cols = numeric_cols.drop(['Id', 'SalePrice'])
skewed_feats = df[numeric_cols].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("Skewness fitur numerik:")
print(skewed_feats)
skewness_threshold = 0.75
highly_skewed_cols = skewed_feats[abs(skewed_feats) > skewness_threshold].index
print(f"Menerapkan log-transform ke {len(highly_skewed_cols)} fitur...")
for col in highly_skewed_cols:
    df[col] = np.log1p(df[col])
    dft[col] = np.log1p(dft[col])

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
dft[numeric_cols] = scaler.transform(dft[numeric_cols])

