In [2]:
import numpy as np
import pandas as pd
from  sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
import json

## data cleaning

In [5]:
data_file = 'immo.csv'
df = pd.read_csv(data_file)

# drop column has nan missing values for more than 60%
df = pd.DataFrame(df)
df = df.drop(['ID','PostCode','HasLift','FloodZoneType','HasBalcony','HasGarden','KitchekType', 'Sub type', 'Floor', 'IsIsolated', 'HasSeaView', 'TotalRoomCount', 'HasAttic',  'HasDiningRoom',    'GardenArea', 'LivingRoomArea', 'NetHabitableSurface(msq)', 'SchoolDistance', 'ShopDistance', 'TransportDistance', 'BuildingCondition','RegionCode','street', 'HasBasement'], axis=1)

# age of building
list_years = []
for year in df['ConstructionYear']:
    years = 2022 - year
    list_years.append(years)
df = df.assign(ConstructionYear = list_years)

# replace & drop nan values
df[['NetHabitableSurface', 'ConstructionYear']] = df[['NetHabitableSurface','ConstructionYear']].fillna(df[['NetHabitableSurface', 'ConstructionYear']].mean())
columns_to_n = ['Type', 'locality']
df[columns_to_n] = df[columns_to_n].replace(np.nan, 'No Value')
df = df.drop_duplicates()
df = df.dropna(subset=['BedroomCount', 'Price', 'Province'])

# replace nan value in heating type with most frequent value
heating_type = df['HeatingType'].mode()[0]
facade_count = df['FacadeCount'].mode()[0]


df = df.fillna({'IsDoubleGlaze': False, 'HeatingType':heating_type, 'FacadeCount': facade_count})

#turn data type to int
df['NetHabitableSurface'] = df['NetHabitableSurface'].astype('int')
df['Price'] = df['Price'].astype('int')
df['BedroomCount'] = df['BedroomCount'].astype('int')
df['ConstructionYear'] = df['ConstructionYear'].astype('int')
df['FacadeCount'] = df['FacadeCount'].astype('int')

df = df.dropna(subset=['Price', 'NetHabitableSurface', 'BedroomCount', 'Province', 'Region'])
df

Unnamed: 0,Type,Price,BedroomCount,Province,locality,Region,NetHabitableSurface,ConstructionYear,FacadeCount,HeatingType,IsDoubleGlaze
0,HOUSE,328330,3,Limburg,Zonhoven,Flanders,148,37,3,GAS,True
2,HOUSE,327829,3,Limburg,Zonhoven,Flanders,148,37,3,GAS,True
4,HOUSE,378242,3,Limburg,Zonhoven,Flanders,148,37,4,GAS,True
5,HOUSE,1295000,5,Antwerp,Berlaar,Flanders,650,37,3,GAS,True
6,HOUSE,442000,4,Antwerp,Merksplas,Flanders,221,33,4,ELECTRIC,False
...,...,...,...,...,...,...,...,...,...,...,...
21042,HOUSE,398000,3,Flemish Brabant,Kessel-Lo,Flanders,145,60,2,FUELOIL,False
21043,APARTMENT,219000,2,West Flanders,Brugge,Flanders,95,37,2,GAS,True
21044,HOUSE,345000,3,Antwerp,Schriek,Flanders,166,56,2,GAS,True
21045,HOUSE,375000,3,West Flanders,Oostkamp,Flanders,160,51,2,GAS,False


## one hot encoder

In [4]:
ohe = OneHotEncoder(handle_unknown='ignore')

ohe_df = pd.DataFrame(ohe.fit_transform(df[['Type', 'locality', 'Province', 'Region', 'HeatingType','IsDoubleGlaze']]).toarray())

df = pd.concat([df, ohe_df], axis=1).drop(['Type', 'locality', 'Province', 'Region', 'HeatingType', 'IsDoubleGlaze'], axis=1)
df = df.dropna()

df

Unnamed: 0,Price,BedroomCount,NetHabitableSurface,ConstructionYear,FacadeCount,0,1,2,3,4,...,2226,2227,2228,2229,2230,2231,2232,2233,2234,2235
0,328330.0,3.0,148.0,37.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,327829.0,3.0,148.0,37.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,378242.0,3.0,148.0,37.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,1295000.0,5.0,650.0,37.0,3.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6,442000.0,4.0,221.0,33.0,4.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16036,369000.0,3.0,249.0,59.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
16037,249000.0,2.0,166.0,37.0,2.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
16038,570000.0,3.0,170.0,34.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
16039,649000.0,2.0,117.0,102.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## split data

In [4]:
X = df.drop('Price', axis=1)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0,  test_size=0.2)


## train model

In [None]:
# random forest
rf = RandomForestRegressor()

rf.fit(X_train, y_train)
y_predict = rf.predict(X_train)

## save model

In [98]:
# save model
joblib.dump(rf,open('model.pkl','wb'))

In [118]:
model = joblib.load('rt_model.pkl')

## dict1

In [9]:
df.iloc[0].to_dict()

{'Type': 'HOUSE',
 'Price': 328330,
 'BedroomCount': 3,
 'Province': 'Limburg',
 'locality': 'Zonhoven',
 'Region': 'Flanders',
 'NetHabitableSurface': 148,
 'ConstructionYear': 37,
 'FacadeCount': 3,
 'HeatingType': 'GAS',
 'IsDoubleGlaze': True}