In [1]:
import datetime
import os
import sys

# Get the absolute path of the scripts directory
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the scripts directory to Python's module search path
sys.path.append(root_dir)

# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import AddMissingIndicator, CategoricalImputer

from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, OneHotEncoder

from feature_engine.transformation import YeoJohnsonTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# from scripts
from scripts.preprocessors import IdealistaDataLoader

# display all the columns of the dataframe in the notebook
# (set once, through the public `pd.set_option` entry point)
pd.set_option("display.max_columns", None)

# to display the plots in a nice way
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

In [2]:
# Configure the loader for the unioned Lisbon rental snapshots, including the
# geodata columns, indexed by propertyCode.
data_loader = IdealistaDataLoader(
    read_path="../data/idealista",  # plain literal: the f-prefix had no placeholders
    city="lisbon",
    operation="rent",
    date_or_unioned="unioned",
    include_geodata=True,
    index_col="propertyCode",
)

data = data_loader.load_data()

In [3]:
# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(2963, 60)


Unnamed: 0_level_0,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,rooms,bathrooms,address,province,municipality,district,country,latitude,longitude,showAddress,url,distance,description,hasVideo,status,newDevelopment,hasLift,priceByArea,detailedType,suggestedTexts,hasPlan,has3DTour,has360,hasStaging,topNewDevelopment,topPlus,neighborhood,newDevelopmentFinished,snapshotDate,formerPrice,priceChange,isPriceLowered,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,totalPrice,distanceToNearestMall,nearestMallName,distanceToNearestTrain,nearestTrainName,distanceToNearestFair,nearestFairName,distanceToNearestPark,nearestParkName,distanceToNearestMarket,nearestMarketName,distanceToNearestMetro,nearestMetroName,distanceToNearestViewpoint,nearestViewpointName,distanceToNearestPlayground,nearestPlaygroundName
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
33829652,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-009130,33,1,2475.0,flat,rent,179.0,3,3,São Sebastião da Pedreira,Lisboa,Avenidas Novas,São Sebastião da Pedreira,pt,38.733429,-9.145003,False,https://www.idealista.pt/imovel/33829652/,439.0,Um apartamento nas Avenidas Novas? Conte-me ma...,False,good,False,True,14.0,{'typology': 'flat'},"{'subtitle': 'Avenidas Novas, Lisboa', 'title'...",True,False,False,False,False,False,,False,2025-01-13,2475.0,0.0,False,False,False,0.0,2475.0,83.378815,Dolce Vita Monumental,1626.342524,Entrecampos,1337.410328,Feira São João de Deus,466.306263,Jardim do Arco do Cego,1012.055636,Mercado Arco do Cego,250.26182,Saldanha,1142.440298,Miradouro do Parque Eduardo VII,641.621174,Parque Infantil do Jardim Gomes Amorim
33596155,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,KWPT-004534,33,bj,2150.0,flat,rent,170.0,3,3,rua das Picoas,Lisboa,Avenidas Novas,São Sebastião da Pedreira,pt,38.733908,-9.144531,False,https://www.idealista.pt/imovel/33596155/,373.0,Um apartamento para arrendar? Conte-me mais! E...,True,good,False,True,13.0,{'typology': 'flat'},"{'subtitle': 'Avenidas Novas, Lisboa', 'title'...",True,False,False,False,False,False,,False,2025-01-13,2150.0,0.0,False,False,False,0.0,2150.0,150.274403,Centro Comercial Atrium Saldanha,1576.198925,Entrecampos,1251.392666,Feira São João de Deus,380.225877,Jardim do Arco do Cego,929.438169,Mercado Arco do Cego,198.676223,Saldanha,1217.962245,Miradouro do Parque Eduardo VII,562.731979,Parque Infantil do Jardim Gomes Amorim
33896876,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,Match_01.216,21,8,1950.0,flat,rent,135.0,2,2,"rua Jorge Castilho, 8",Lisboa,Areeiro,Casal Vistoso,pt,38.740173,-9.128233,True,https://www.idealista.pt/imovel/33896876/,1303.0,Apartamento T2 Com Varanda e Estacionamento Lo...,False,good,False,True,14.0,{'typology': 'flat'},"{'subtitle': 'Areeiro, Lisboa', 'title': 'Apar...",True,False,False,False,False,False,,False,2025-01-13,1950.0,0.0,False,True,True,0.0,1950.0,342.638864,Olaias Plaza,1031.109077,Roma Areeiro,1133.090394,Feira São João de Deus,448.850431,Jardim sobre a Fonte Monumental - Alameda Dom ...,696.420381,Mercado da Picheleira - Mercado Alfacinha,582.276537,Olaias,1410.666684,Miradouro da Penha de França,557.606783,Parque Infantil da Rua Aquiles Machado
31196503,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,CF119,9,1,700.0,studio,rent,25.0,0,1,beco dos Paus,Lisboa,Santa Maria Maior,Alfama - Sé,pt,38.712857,-9.125107,False,https://www.idealista.pt/imovel/31196503/,3082.0,"Excelente apartamento T0, disponível com todo ...",False,good,False,False,28.0,"{'typology': 'flat', 'subTypology': 'studio'}","{'subtitle': 'Santa Maria Maior, Lisboa', 'tit...",False,False,False,False,False,False,,False,2025-01-13,700.0,0.0,False,False,False,0.0,700.0,1254.908858,Centro Comercial Mouraria,477.276828,Santa Apolónia,370.139507,Feira da Ladra,451.116633,Jardim Botto Machado / Jardim de Santa Clara,2455.76625,Mercado da Ribeira,344.207875,Santa Apolónia,304.879765,Miradouro de Santa Clara,203.098162,Parque Infantil da Rua do Vigário
33896540,https://img4.idealista.pt/blur/WEB_LISTING/0/i...,,24,1,3000.0,flat,rent,130.0,2,3,"rua Sousa Pinto, 11",Lisboa,Santo António,Rato - Amoreiras,pt,38.723469,-9.158115,True,https://www.idealista.pt/imovel/33896540/,2009.0,AMOREIRAS Apartamento T2 de construção recente...,True,good,False,True,23.0,{'typology': 'flat'},"{'subtitle': 'Santo António, Lisboa', 'title':...",True,False,False,False,False,False,,False,2025-01-13,3000.0,0.0,False,True,True,0.0,3000.0,316.578453,Amoreiras Plaza,1672.220918,Campolide,1364.683979,Feira Biológica do Príncipe Real,286.221116,Jardim das Amoreiras (Jardim Marcelino de Mesq...,1426.327747,Mercado Campo de Ourique,607.249371,Rato,1062.957225,Miradouro do Parque Eduardo VII,238.793997,Parque Infantil do Jardim Marcelino Mesquita


In [None]:
# Columns used as model inputs; selected as data[FEATURES] when the
# train/test split is made.
FEATURES = [
    "floor",
    "propertyType",
    "size",
    "rooms",
    "bathrooms",
    "municipality",
    "district",
    "latitude",
    "longitude",
    "showAddress",
    "hasVideo",
    "status",
    "newDevelopment",
    "hasLift",
    "hasPlan",
    "has3DTour",
    "has360",
    "hasStaging",
    "topNewDevelopment",
    "topPlus",
    "newDevelopmentFinished",
    "isPriceLowered",
    "hasParkingSpace",
    "isParkingSpaceIncludedInPrice",
    "distanceToNearestMall",
    "nearestMallName",
    "distanceToNearestTrain",
    "nearestTrainName",
    "distanceToNearestFair",
    "nearestFairName",
    "distanceToNearestPark",
    "nearestParkName",
    "distanceToNearestMarket",
    "nearestMarketName",
    "distanceToNearestMetro",
    "nearestMetroName",
    "distanceToNearestViewpoint",
    "nearestViewpointName",
    "distanceToNearestPlayground",
    "nearestPlaygroundName",
]

# Target column, kept as a one-element list so that data[TARGET] selects a
# single-column DataFrame.
TARGET = ["totalPrice"]

# Subset of FEATURES holding string labels.
# NOTE(review): not referenced by any later cell visible in this notebook.
CATEGORICAL_FEATURES = [
    "floor",
    "propertyType",
    "municipality",
    "district",
    "status",
    "nearestMallName",
    "nearestTrainName",
    "nearestFairName",
    "nearestParkName",
    "nearestMarketName",
    "nearestMetroName",
    "nearestViewpointName",
    "nearestPlaygroundName",
]

# Subset of FEATURES holding numeric values (size, counts, coordinates, distances).
# NOTE(review): the boolean flags in FEATURES appear in neither this list nor
# CATEGORICAL_FEATURES — confirm that is intended.
NUMERICAL_FEATURES = [
    "size",
    "rooms",
    "bathrooms",
    "latitude",
    "longitude",
    "distanceToNearestMall",
    "distanceToNearestTrain",
    "distanceToNearestFair",
    "distanceToNearestPark",
    "distanceToNearestMarket",
    "distanceToNearestMetro",
    "distanceToNearestViewpoint",
    "distanceToNearestPlayground",
]

# Columns removed from the raw frame by DropFeatures before the train/test split.
FEATURES_TO_DROP = [
    "thumbnail",
    "externalReference",
    "numPhotos",
    "price",
    "operation",
    "address",
    "province",
    "country",
    "url",
    "distance",
    "description",
    "priceByArea",
    "detailedType",
    "suggestedTexts",
    "neighborhood",
    "snapshotDate",
    "formerPrice",
    "priceChange",
    "parkingSpacePrice",
]

# NOTE(review): the two imputation lists below are identical, so the same three
# columns are marked for both "missing"-label and mode imputation — confirm
# which strategy each column should actually get.
CATEGORICAL_FEATURES_TO_IMPUTE_WITH_MISSING = ["floor", "district", "status"]

CATEGORICAL_FEATURES_TO_IMPUTE_WITH_MODE = ["floor", "district", "status"]

# Numerical columns that have an entry in NUMERICAL_FEATURES_CLEANING_OPERATIONS.
NUMERICAL_FEATURES_TO_CLEAN = ["size", "rooms", "bathrooms"]

NUMERICAL_FEATURES_TO_TRANSFORM_WITH_YEO_JOHNSON = ["size"]

# Declarative description of the cleaning applied to each numerical column.
# "size" lists an ordered sequence of steps; "rooms" and "bathrooms" each use a
# single method. The method names and thresholds mirror the manual steps in the
# exploratory cells further down (frequent values > 5% set to NaN, missing
# indicator, KNN imputation, |z| <= 3 filter; IQR x 1.5; values rarer than 1%).
# NOTE(review): no visible cell consumes this dictionary — presumably it is
# read by a pipeline step defined elsewhere; confirm.
NUMERICAL_FEATURES_CLEANING_OPERATIONS = {
    "size": {
        "sequence": [
            {
                "method": "frequent_value_removal",
                "threshold": 0.05,
            },
            {
                "method": "missing_indicator",
            },
            {
                "method": "knn_imputation",
            },
            {
                "method": "z_score",
                "threshold": 3,
            },
        ]
    },
    "rooms": {
        "method": "iqr",
        "threshold": 1.5,
    },
    "bathrooms": {
        "method": "rare_value_removal",
        "threshold": 0.01,
    },
}

In [5]:
# Remove the columns listed in FEATURES_TO_DROP from the raw frame.
# NOTE(review): `data` is overwritten, so re-running this cell operates on the
# already reduced frame and will presumably fail on the missing columns — confirm.
drop_features = DropFeatures(FEATURES_TO_DROP)
data = drop_features.fit_transform(data)

In [6]:
# Let's separate into train and test set
# (train_test_split is already imported in the first cell).
# 80/20 split with a fixed seed, stratified on the municipality column.
X_train, X_test, y_train, y_test = train_test_split(
    data[FEATURES],
    data[TARGET],
    test_size=0.2,
    random_state=42,
    stratify=data["municipality"],
)

In [None]:
X_train.shape, X_test.shape

((2370, 40), (593, 40))

In [None]:
# Fit the Yeo-Johnson transformer on the training target only, then apply the
# same fitted transformer to both the train and the test target.
# NOTE(review): y_train / y_test are overwritten, so re-running this cell would
# transform values that are already transformed.
yeo_johnson_transformer_target = YeoJohnsonTransformer()
yeo_johnson_transformer_target.fit(y_train)

y_train = yeo_johnson_transformer_target.transform(y_train)
y_test = yeo_johnson_transformer_target.transform(y_test)

In [None]:
# clean outliers in the target

## Number of labels: cardinality

Let's evaluate how many different categories are present in each of the variables.


In [None]:
# we count unique categories with pandas nunique()
# and then plot them in descending order
# NOTE(review): `data_train` and `cat_vars` are not assigned in any cell shown
# above (the earlier cells define `data`, `X_train`/`y_train` and
# CATEGORICAL_FEATURES) — confirm where they are meant to come from; as far as
# this notebook shows, the cell fails with a NameError on a fresh kernel.

data_train[cat_vars].nunique().sort_values(ascending=False).plot.bar(figsize=(12, 5))

In some variables, the average rent price of listings where the information is missing differs from the average rent price of listings where this information exists. This suggests that data being missing could be a good predictor of rent price.


In [None]:
# Impute the `with_mode` variables with their most frequent category.
# CategoricalImputer is already imported in the first cell, so no import here.
ci = CategoricalImputer(imputation_method="frequent", variables=with_mode)
data_train = ci.fit_transform(data_train)

In [None]:
# Impute the `with_missing` variables with the explicit label "Missing".
# CategoricalImputer is already imported in the first cell, so no import here.
ci = CategoricalImputer(
    imputation_method="missing", fill_value="Missing", variables=with_missing
)
data_train = ci.fit_transform(data_train)

## Rare labels

Let's go ahead and investigate whether there are labels that are present only in a small number of listings:


In [None]:
def analyse_rare_labels(df, var, rare_perc):
    """Return the categories of ``var`` whose share of rows is below ``rare_perc``.

    Parameters:
    df (pd.DataFrame): The input DataFrame (not modified).
    var (str): Name of the categorical column to analyse.
    rare_perc (float): Share threshold (e.g. 0.01 for 1%); categories with a
        strictly smaller share are returned.

    Returns:
    pd.Series: Share of rows per rare category, indexed by category label.
    """
    # Rows per category divided by the total number of rows. Counting rows with
    # size() removes the dependency on a module-level `target` column, so the
    # function works on any frame; the shares are identical whenever that
    # column has no missing values.
    category_share = df.groupby(var).size() / len(df)

    # keep only the categories that are rare
    return category_share[category_share < rare_perc]

# print categories that are present in less than
# 1% of the observations, one blank line between variables
for var in cat_vars:
    print(analyse_rare_labels(data_train, var, 0.01), end="\n\n")

In [None]:
# let's group the categories that appear in less than 1% of the data
# (RareLabelEncoder is already imported in the first cell)
rle = RareLabelEncoder(n_categories=1, tol=0.01, variables=cat_vars)
data_train = rle.fit_transform(data_train)

In [None]:
# we count unique categories with pandas nunique()
# and then plot them in descending order
# (same plot as before, now after the rare labels have been grouped)

data_train[cat_vars].nunique().sort_values(ascending=False).plot.bar(figsize=(12, 5))

In [None]:
cardinality = data_train[cat_vars].nunique().sort_values(ascending=False)

In [None]:
# Split the categorical variables by cardinality: variables with at most
# 5 distinct labels go to `one_hot_vars`, those with more than 5 go to
# `ordinal_vars`.
one_hot_vars = cardinality[cardinality <= 5].index.tolist()
ordinal_vars = cardinality[cardinality > 5].index.tolist()

Some of the categorical variables show multiple labels that are present in less than 1% of the listings.

Labels that are under-represented in the dataset tend to cause over-fitting of machine learning models.

That is why we want to group them.

Finally, we want to explore the relationship between the categories of the different variables and the listing price:


In [None]:
for var in cat_vars:
    # order the categories by their median target value
    order_idx = data_train.groupby(var)[target[0]].median().sort_values().index

    # make boxplot with catplot
    grid = sns.catplot(
        x=var, y=target[0], data=data_train, kind="box", height=4, aspect=1.5, order=order_idx
    )
    # add data points to the boxplot with stripplot, drawn explicitly on the
    # catplot's axes instead of relying on the implicit "current axes"
    strip_plot = sns.stripplot(
        x=var,
        y=target[0],
        data=data_train,
        jitter=0.1,
        alpha=0.3,
        color="k",
        order=order_idx,
        ax=grid.ax,
    )
    # Rotate the existing tick labels. The previous version rebuilt the ticks
    # from `unique()`, which counts NaN as a category while `order_idx` does
    # not, so ticks and labels could get out of step.
    strip_plot.tick_params(axis="x", labelrotation=90)

    plt.show()

In [None]:
data_train.shape

In [None]:
len(features)

Clearly, the categories give information on the price (`totalPrice`), as different categories show different median prices.


In [None]:
# set up the encoder (OrdinalEncoder is already imported in the first cell)
oe = OrdinalEncoder(encoding_method="ordered", variables=ordinal_vars)

# create the mappings; the target is passed as a Series because the
# "ordered" method ranks the categories by their target mean
oe.fit(data_train[features], data_train[target].squeeze())
data_train[features] = oe.transform(data_train[features])

# mappings are stored and the class can be saved
oe.encoder_dict_

In [None]:
data_train.shape

In [None]:
# set up the encoder (OneHotEncoder is already imported in the first cell)
ohe = OneHotEncoder(drop_last=True, variables=one_hot_vars)

ohe.fit(data_train[features])
# Materialise the output names as a plain list: a later cell grows `features`
# with list operations.
new_features = list(ohe.get_feature_names_out())
data_train[new_features] = ohe.transform(data_train[features])
data_train = data_train.drop(columns=one_hot_vars)
features = new_features

In [None]:
data_train

# Numerical variables

Let's go ahead and find out what numerical variables we have in the dataset.


In [None]:
print("Number of numerical variables:", len(num_vars))

# visualise the numerical variables
data_train[num_vars].head()

## Discrete variables

Let's go ahead and find which variables are discrete, i.e., show a finite number of values.


In [None]:
# let's make a list of discrete variables:
# numerical variables with fewer than 20 distinct values
discrete_vars = [var for var in num_vars if len(data_train[var].unique()) < 20]


print("Number of discrete variables:", len(discrete_vars))

In [None]:
# let'svisualisethediscretevariables

data_train[discrete_vars].head()

These discrete variables refer to the number of rooms and bathrooms.

We expect higher prices with bigger numbers.

Let's go ahead and analyse their contribution to the listing price.


In [None]:
for var in discrete_vars:
    # arguments shared by both plot calls
    plot_args = dict(x=var, y=target[0], data=data_train)
    # make boxplot with catplot
    sns.catplot(kind="box", height=4, aspect=1.5, **plot_args)
    # add data points to the boxplot with stripplot
    sns.stripplot(jitter=0.1, alpha=0.3, color="k", **plot_args)
    plt.show()

Clearly, the price does increase with an increasing number of rooms and bathrooms. There seem to be a few outliers in both variables, so we'll have to deal with those in a bit.


## Continuous variables

Let's go ahead and find the distribution of the continuous variables. We will consider as continuous all the variables that are neither temporal nor discrete.


In [None]:
# make list of continuous variables: numerical variables that are not discrete
cont_vars = [var for var in num_vars if var not in discrete_vars]

print("Number of continuous variables:", len(cont_vars))

In [None]:
# let'svisualisethecontinuousvariables

data_train[cont_vars].head()

In [None]:
data_train[cont_vars].describe()

In [None]:
# let's plot histograms for all continuous variables

data_train[cont_vars].hist(bins=30, figsize=(15, 15))
plt.show()

The variables are not normally distributed, and most of them are left skewed.

Sometimes, transforming the variables to improve the value spread improves the model performance. But it is unlikely that a transformation will dramatically change the distribution of the super-skewed variables.

We can apply a Yeo-Johnson transformation; let's go ahead and do that.


In [None]:
# Identify columns with non-finite values (NaN or +/- infinity)
for var in cont_vars:
    if not np.isfinite(data_train[var]).all():
        print(f"Non-finite values found in column: {var}")

### Yeo-Johnson transformation


In [None]:
# Let's go ahead and analyse the distributions of the variables
# after applying a Yeo-Johnson transformation

# temporary copy of the data, so data_train itself stays untouched
tmp = data_train.copy()

for var in cont_vars + ["totalPrice"]:
    # Apply PowerTransformer (imported in the first cell); one transformer is
    # fitted per column. The (n, 1) result is flattened before being stored
    # back into a single column.
    pt = PowerTransformer()
    tmp[var] = pt.fit_transform(data_train[[var]]).ravel()


# plot the histograms of the transformed variables
tmp[cont_vars].hist(bins=30, figsize=(15, 15))
plt.show()

For latitude and longitude, the transformation didn't do anything useful, as the values are too small.

For the others, the values seem to be spread more evenly in the range and more normally distributed. Whether this helps improve the predictive power remains to be seen. To determine if this is the case, we should train one model with the original values and one with the transformed values, and compare model performance and feature importance.


In [None]:
# continuous variables to transform: `size` and every distance column
yeo_johnson_vars = [var for var in cont_vars if "distance" in var or var == "size"]

In [None]:
yeo_johnson_vars

# Outliers

We're interested in removing the outliers for a few key features: size, rooms, and bathrooms. Additionally, we recognized that there are a few recurring values in the size feature that are likely data entry errors and/or data inconsistencies.

We have different strategies for each of them, so we'll tackle those in a bit. First, we'll tackle the overly frequent values in the size feature.


In [None]:
#Identifyoverlyfrequentvaluesdynamically
def flag_frequent_values(df, column, threshold=0.05):
    """
    Find the values of `column` whose relative frequency is strictly greater
    than `threshold` (default: 5%).

    Frequencies come from `value_counts(normalize=True)`. Returns the matching
    values as a list, most frequent first.
    """
    shares = df[column].value_counts(normalize=True)
    is_frequent = shares.gt(threshold)
    return shares.index[is_frequent].tolist()

In [None]:
# KNNImputer and MissingIndicator are imported in the first cell.

# Find overly common values in the 'size' column
frequent_sizes = flag_frequent_values(data_train, "size", threshold=0.05)

# Replace overly common values with NaN
data_train.loc[data_train["size"].isin(frequent_sizes), "size"] = np.nan

# Apply MissingIndicator
missing_indicator = MissingIndicator()
missing_indicator.fit(data_train[features])
new_features = missing_indicator.get_feature_names_out().tolist()
data_train[new_features] = missing_indicator.transform(data_train[features])

# Rebuild `features` instead of extending it in place: works whatever sequence
# type `features` currently is, and re-running the cell does not append the
# indicator columns a second time.
features = list(features)
features = features + [name for name in new_features if name not in features]

In [None]:
data_train

In [None]:
len(features)

In [None]:
# Apply KNNImputer to the numerical variables (5 neighbours, distance-weighted)
# and write the imputed values back under the imputer's output column names.
# NOTE(review): `new_features` is reused here for the imputer's output names,
# replacing the missing-indicator names stored under the same variable earlier.
size_imputer = KNNImputer(n_neighbors=5, weights="distance")
size_imputer.fit(data_train[num_vars])
new_features = size_imputer.get_feature_names_out().tolist()
data_train[new_features] = size_imputer.transform(data_train[num_vars])

In [None]:
# Transform with the Yeo-Johnson transformation
# (PowerTransformer is imported in the first cell)
pt_features = PowerTransformer()
data_train[yeo_johnson_vars] = pt_features.fit_transform(data_train[yeo_johnson_vars])

pt_target = PowerTransformer()
data_train[target] = pt_target.fit_transform(data_train[target])

# Filter out extreme values: keep rows whose transformed size and target are
# within 3 units of zero. The final copy gives later cells an independent
# frame to assign into.
data_train = data_train[data_train["size"].abs() <= 3]
data_train = data_train[data_train[target[0]].abs() <= 3].copy()

# Plot (density=True normalises the histogram, so the y axis is a density,
# not a count)
data_train["size"].hist(bins=30, density=True)
plt.ylabel("Density")
plt.xlabel("Standardized Yeo-Johnson of size")
plt.show()

In [None]:
# Plot transformed size against transformed price
plt.scatter(x="size", y="totalPrice", data=data_train, alpha=0.1)
plt.ylabel("Yeo-Johnson of price")
plt.xlabel("Standardized Yeo-Johnson of size")
plt.show()

In [None]:
data_train

In [None]:
# Q-Q plot of the transformed size against a normal distribution
# (`stats` is scipy.stats, imported in the first cell)
fig, ax = plt.subplots(figsize=(6, 6))
stats.probplot(data_train["size"], dist="norm", plot=ax)
ax.set_title("QQ Plot of Standardized Yeo-Johnson Size")
plt.show()

In [None]:
data_train["size"].describe()

In [None]:
sns.displot(data_train, x="rooms", kind="hist", bins=30)
plt.show()

In [None]:
data_train["rooms"].describe()

In [None]:
def remove_outliers_iqr(df, column, threshold=1.5):
    """
    Removes outliers from a DataFrame column using the IQR method.

    Rows are kept when the column value lies inside
    [Q1 - threshold * IQR, Q3 + threshold * IQR] (bounds inclusive). Rows whose
    value is missing fail the range test and are dropped as well.

    Parameters:
    df (pd.DataFrame): The input DataFrame (not modified).
    column (str): The name of the column to check for outliers.
    threshold (float, optional): The IQR multiplier (default is 1.5).

    Returns:
    pd.DataFrame: A new DataFrame with outliers removed.
    """
    # first and third quartile (25th / 75th percentile) in one pass
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1  # interquartile range

    within_bounds = df[column].between(q1 - threshold * iqr, q3 + threshold * iqr)

    # Copy only the surviving rows (the whole frame used to be copied before
    # filtering), so the result is independent of the input.
    return df.loc[within_bounds].copy()

In [None]:
data_train = remove_outliers_iqr(data_train, "rooms")

In [None]:
sns.displot(data_train, x="rooms", kind="hist", bins=30)
plt.show()

In [None]:
data_train["rooms"].describe()

In [None]:
sns.displot(data_train, x="bathrooms", kind="hist", bins=30)
plt.show()

In [None]:
data_train["bathrooms"].describe()

In [None]:
# Count occurrences of each value (as a share of the rows)
value_counts = data_train["bathrooms"].value_counts(normalize=True)

# Define a threshold (e.g., values appearing in less than 1% of listings are outliers)
threshold = 0.01
outlier_values = value_counts[value_counts < threshold].index

# Remove the rows whose bathroom count is one of the rare values
data_train = data_train[~data_train["bathrooms"].isin(outlier_values)]

In [None]:
# Plot the cleaned frame. `tmp` is the scratch copy made for the Yeo-Johnson
# preview and does not reflect the outlier removal done above.
sns.displot(data_train, x="bathrooms", kind="hist", bins=30)
plt.show()

In [None]:
data_train["bathrooms"].describe()

# Data Preparation


In [None]:
# columns stored with boolean dtype
boolean_vars = data_train.select_dtypes(include="bool").columns.tolist()
# binary = boolean dtype, or exactly two distinct values
binary_vars = [
    var
    for var in data_train.columns
    if var in boolean_vars or data_train[var].nunique() == 2
]
# everything else, excluding the target
non_binary_vars = [
    var
    for var in data_train.columns
    if not (var in binary_vars or var in target)
]

In [None]:
binary_vars

In [None]:
non_binary_vars

In [None]:
data_train = data_train.astype(float)

In [None]:
data_train.info()

In [None]:
# Standardise the non-binary variables
# (StandardScaler is already imported in the first cell).
# NOTE(review): the scaler is fitted on all of data_train, before the
# train/validation split made further down, so validation rows contribute to
# the scaling statistics — confirm this is acceptable.
ss = StandardScaler()
ss.fit(data_train[non_binary_vars])
data_train[non_binary_vars] = ss.transform(data_train[non_binary_vars])

In [None]:
data_train

**Disclaimer:**

There is certainly more that can be done to understand the nature of this data and the relationship of these variables with the target, `totalPrice`, and also about the distribution of the variables themselves.

However, we hope that through this notebook we gave you a flavour of what data analysis looks like.


# Exploratory Data Analysis


In [None]:
# Map-like scatter of the listings: longitude on x, latitude on y,
# points coloured by totalPrice using the "jet" colormap.
data_train.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    grid=True,
    c="totalPrice",
    cmap="jet",
    colorbar=True,
    legend=True,
    sharex=False,
    figsize=(10, 7),
)
plt.show()

In [None]:
# Correlation heatmap of the non-binary variables together with the target;
# colour scale fixed to [-1, 1] and centred on 0.
plt.figure(figsize=(16, 8))
sns.heatmap(
    data_train[non_binary_vars + target].corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmax=1,
    vmin=-1,
)
plt.show()

In [None]:
# Correlation heatmap of the binary variables together with the target;
# colour scale fixed to [-1, 1] and centred on 0.
plt.figure(figsize=(16, 8))
sns.heatmap(
    data_train[binary_vars + target].corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmax=1,
    vmin=-1,
)
plt.show()

In [None]:
# Check that the non-target columns of data_train are exactly `features`, in
# the same order. Comparing plain lists yields False on a length mismatch,
# whereas comparing an Index with a list of a different length raises.
temp = data_train.drop(columns=target)
list(temp.columns) == list(features)

In [None]:
X_train = data_train[features]
y_train = data_train[target]

In [None]:
X_train_dev, X_val, y_train_dev, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, shuffle=True
)

In [None]:
# set up the model (LinearRegression is imported in the first cell)
lin_model = LinearRegression()

# train the model
lin_model.fit(X_train_dev, y_train_dev)

# predictions on the development and validation splits
y_pred_dev = lin_model.predict(X_train_dev)
y_pred_val = lin_model.predict(X_val)

In [None]:
# evaluate the model on the development split: (RMSE, R²)
# (root_mean_squared_error and r2_score are imported in the first cell)
root_mean_squared_error(y_train_dev, y_pred_dev), r2_score(y_train_dev, y_pred_dev)

In [None]:
root_mean_squared_error(y_val, y_pred_val), r2_score(y_val, y_pred_val)

In [None]:
def plot_performance(model, X_train, y_train, X_test, y_test):
    """Plot actual vs. predicted target values for a train and a test split.

    Draws two side-by-side scatter panels sharing the same axis range, each
    with the identity line and the RMSE and R² of that split written in the
    upper-left corner, then shows the figure.

    Parameters:
    model: Fitted estimator exposing ``predict``.
    X_train, X_test: Feature sets passed to ``model.predict``.
    y_train, y_test: Actual target values; must expose ``.values``
        (pandas Series or DataFrame).

    Returns:
    None
    """
    # Common range of the dependent variable across both splits
    lower = min(y_train.values.min(), y_test.values.min())
    upper = max(y_train.values.max(), y_test.values.max())
    var_range = [lower, upper]

    # (features, actual values, colour, legend label) for each panel
    panels = [
        (X_train, y_train, "royalblue", "Train data"),
        (X_test, y_test, "lightskyblue", "Test data"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for ax, (X, y_true, color, label) in zip(axes, panels):
        y_pred = model.predict(X)

        # actual vs predicted, with the identity line as reference
        ax.scatter(y_true, y_pred, color=color, alpha=0.5, label=label)
        ax.plot(var_range, var_range, "k--", lw=2)

        ax.set_xlabel("Actual price")
        ax.set_xlim(lower - 0.5, upper + 0.5)
        ax.set_ylabel("Predicted price")
        ax.set_ylim(lower - 0.5, upper + 0.5)

        # metrics placed just below the top-left corner of the data range
        ax.text(lower, upper - 0.5, "RMSE = {:.3f}".format(root_mean_squared_error(y_true, y_pred)))
        ax.text(lower, upper - 1, "R² = {:.3f}".format(r2_score(y_true, y_pred)))
        # "lower right" is the valid matplotlib location string
        ax.legend(loc="lower right")

    plt.show()

In [None]:
plot_performance(lin_model, X_train_dev, y_train_dev, X_val, y_val)

In [None]:
# set up the model (ElasticNet is imported in the first cell)
en_model = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42)

# train the model
en_model.fit(X_train_dev, y_train_dev)

# predictions on the development and validation splits
y_pred_dev = en_model.predict(X_train_dev)
y_pred_val = en_model.predict(X_val)

In [None]:
root_mean_squared_error(y_train_dev, y_pred_dev), r2_score(y_train_dev, y_pred_dev)

In [None]:
root_mean_squared_error(y_val, y_pred_val), r2_score(y_val, y_pred_val)

In [None]:
plot_performance(en_model, X_train_dev, y_train_dev, X_val, y_val)

# Additional Resources

- [Feature Engineering for Machine Learning](https://www.trainindata.com/p/feature-engineering-for-machine-learning) - Online Course
- [Packt Feature Engineering Cookbook](https://www.amazon.com/Python-Feature-Engineering-Cookbook-transforming-dp-1804611301/dp/1804611301) - Book
- [Predict house price with Feature-engine](https://www.kaggle.com/solegalli/predict-house-price-with-feature-engine) - Kaggle kernel
- [Comprehensive data exploration with Python](https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python) - Kaggle kernel
- [How I made top 0.3% on a Kaggle competition](https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition) - Kaggle kernel
