In [1]:
# Load necessary packages and the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression

# Load dataset
df = pd.read_csv(r'house_price.csv')
df.head()

Unnamed: 0,Id,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,GrLivArea,FullBath,...,Heating,Electrical,Foundation,Exterior1st,Exterior2nd,RoofStyle,SaleCondition,MoSold,YrSold,SalePrice
0,1,Richmond,Duplex,4.0,5.0,1938,2007,1418.54211,1334.672273,2.0,...,Electric,Mix,BrkTil,Wood,Wood,Cedar,Normal,4.0,2024.0,165523.67039
1,2,Delta,Duplex,7.0,1.0,1919,-9223372036854775808,570.622787,844.691015,1.0,...,HeatPump,FuseF,BrkTil,Vinyl,Vinyl,Cedar,Abnorml,,2021.0,165344.625514
2,3,Burnaby,BasementSuite,4.0,6.0,1981,1970,816.080798,2030.68508,1.0,...,Radiant,FuseA,CBlock,Stucco,Stucco,Asphalt,Abnorml,6.0,2022.0,262609.986627
3,4,Surrey,BasementSuite,5.0,5.0,1932,-9223372036854775808,589.70459,625.410833,2.0,...,Radiant,SBrkr,PConc,Vinyl,Vinyl,TorchOn,Partial,4.0,2021.0,134246.920554
4,5,Surrey,Detached,6.0,,1957,1994,1052.067733,1427.795371,3.0,...,Gas,FuseA,Stone,Stucco,Stucco,Asphalt,Normal,9.0,2020.0,167186.521789


In [2]:
df = df.ffill()

In [3]:
# Remove outliers using IQR method
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# Create masks for lower and upper bounds, applying only to numeric columns
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter only numeric columns to avoid alignment issues
numeric_columns = df.select_dtypes(include=['number']).columns
mask = ~((df[numeric_columns] < lower_bound) | (df[numeric_columns] > upper_bound)).any(axis=1)

# Apply the mask to the original dataframe
df_clean = df[mask]
df_clean.reset_index(drop=True, inplace=True)
df_clean.describe()

Unnamed: 0,Id,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,...,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,LotFrontage,LotArea,MasVnrArea,MoSold,YrSold,SalePrice
count,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2129.0,...,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0,2130.0
mean,2508.384507,5.950235,4.961972,1969.604225,1989.79108,1052.314281,1483.950674,1.995305,1.0,2.996712,...,5.904695,1.089202,1.961502,493.117808,70.695587,10046.518273,110.822811,5.992019,2020.976526,181638.614996
std,1461.476349,1.534437,1.487685,29.481979,24.20701,338.795672,477.336525,1.032573,0.0,0.985081,...,1.884357,0.889649,1.031865,199.090706,23.790041,2867.275097,85.081592,2.804317,1.534147,48427.665221
min,1.0,2.0,1.0,1887.0,1921.0,114.338365,400.0,0.0,1.0,1.0,...,2.0,0.0,0.0,0.0,20.0,2086.39781,0.0,1.0,2017.0,48403.452665
25%,1210.25,5.0,4.0,1948.0,1973.0,809.638555,1156.028381,1.0,1.0,2.0,...,5.0,0.0,1.0,354.33127,53.324244,8070.839675,35.715898,4.0,2020.0,148694.962679
50%,2529.5,6.0,5.0,1970.0,1990.0,1062.676026,1488.888716,2.0,1.0,3.0,...,6.0,1.0,2.0,491.148432,69.981067,10101.07978,104.549096,6.0,2021.0,182014.067677
75%,3786.75,7.0,6.0,1991.0,2006.0,1286.444133,1809.571293,3.0,1.0,4.0,...,7.0,2.0,3.0,633.370422,87.118281,12039.373913,171.827963,8.0,2022.0,215321.790365
max,4996.0,10.0,9.0,2052.0,2056.0,1979.854922,2821.167255,4.0,1.0,5.0,...,10.0,3.0,4.0,1032.767709,136.684541,17957.572944,374.056412,12.0,2024.0,315785.745471
