In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [2]:
df=pd.read_csv('cleaned_dataset.csv')

In [3]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'country',
       'district', 'fireplace', 'floodingzone', 'furnished', 'garden',
       'kitchen', 'livingarea', 'locality', 'monthlycharges',
       'numberoffacades', 'peb', 'postalcode', 'price', 'propertyid',
       'province', 'region', 'roomcount', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'typeofsale'],
      dtype='object')

In [4]:
df['new_roomcount'] = (
    df['bathroomcount'] +
    df['bedroomcount'] +
    df['kitchen'] +
    df['showercount'] +
    df['toiletcount']
)

In [5]:
df.drop(['country','monthlycharges','propertyid','typeofsale','roomcount'], axis=1, inplace=True)


In [6]:
df.floodingzone.unique()

array(['NON_FLOOD_ZONE', 'POSSIBLE_FLOOD_ZONE', 'RECOGNIZED_FLOOD_ZONE',
       'CIRCUMSCRIBED_WATERSIDE_ZONE',
       'POSSIBLE_N_CIRCUMSCRIBED_FLOOD_ZONE',
       'RECOGNIZED_N_CIRCUMSCRIBED_WATERSIDE_FLOOD_ZONE',
       'CIRCUMSCRIBED_FLOOD_ZONE',
       'RECOGNIZED_N_CIRCUMSCRIBED_FLOOD_ZONE',
       'POSSIBLE_N_CIRCUMSCRIBED_WATERSIDE_ZONE'], dtype=object)

In [7]:
category_to_number = {
    'NON_FLOOD_ZONE': 0,
    'POSSIBLE_FLOOD_ZONE': 1,
    'CIRCUMSCRIBED_WATERSIDE_ZONE': 2,
    'CIRCUMSCRIBED_FLOOD_ZONE': 3,
    'POSSIBLE_N_CIRCUMSCRIBED_FLOOD_ZONE': 4,
    'RECOGNIZED_FLOOD_ZONE': 5,
    'RECOGNIZED_N_CIRCUMSCRIBED_FLOOD_ZONE': 6,
    'RECOGNIZED_N_CIRCUMSCRIBED_WATERSIDE_FLOOD_ZONE': 7,
    'POSSIBLE_N_CIRCUMSCRIBED_WATERSIDE_ZONE': 8
}
df['floodingzone'] = df['floodingzone'].replace(category_to_number)


  df['floodingzone'] = df['floodingzone'].replace(category_to_number)


In [8]:
df.floodingzone.unique()

array([0, 1, 5, 2, 4, 7, 3, 6, 8], dtype=int64)

In [9]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'district',
       'fireplace', 'floodingzone', 'furnished', 'garden', 'kitchen',
       'livingarea', 'locality', 'numberoffacades', 'peb', 'postalcode',
       'price', 'province', 'region', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'new_roomcount'],
      dtype='object')

In [10]:
df.peb.unique()

array(['B', 'D', 'F', 'E', 'A', 'C', 'G', 'A++', 'A+', 'B_A', 'A_A+',
       'E_D', 'E_C', 'F_C', 'F_D', 'G_C', 'F_E'], dtype=object)

In [11]:
category_to_number = {peb: idx for idx, peb in enumerate(df['peb'].unique())}
df['peb'] = df['peb'].replace(category_to_number)

  df['peb'] = df['peb'].replace(category_to_number)


In [12]:
df.peb.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int64)

In [13]:
"""category_to_number = {district: idx for idx, district in enumerate(df['district'].unique())}
df['district'] = df['district'].replace(category_to_number)"""

"category_to_number = {district: idx for idx, district in enumerate(df['district'].unique())}\ndf['district'] = df['district'].replace(category_to_number)"

In [14]:
df.district.unique()    

array(['Brugge', 'Tournai', 'Veurne', 'Hasselt', 'Brussels', 'Mechelen',
       'Halle-Vilvoorde', 'Sint-Niklaas', 'Oostend', 'Ieper', 'Mons',
       'Namur', 'Leuven', 'Antwerp', 'Nivelles', 'Charleroi', 'Liège',
       'Maaseik', 'Verviers', 'Aalst', 'Soignies', 'Tongeren',
       'Marche-en-Famenne', 'Kortrijk', 'Gent', 'Eeklo', 'Diksmuide',
       'Dendermonde', 'Waremme', 'Philippeville', 'Huy', 'Dinant',
       'Neufchâteau', 'Mouscron', 'Tielt', 'Roeselare', 'Turnhout',
       'Oudenaarde', 'Thuin', 'Arlon', 'Virton', 'Ath', 'Bastogne'],
      dtype=object)

In [15]:
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'district',
       'fireplace', 'floodingzone', 'furnished', 'garden', 'kitchen',
       'livingarea', 'locality', 'numberoffacades', 'peb', 'postalcode',
       'price', 'province', 'region', 'showercount', 'stateofbuilding',
       'subtypeofproperty', 'surfaceofplot', 'swimmingpool', 'terrace',
       'toiletcount', 'typeofproperty', 'new_roomcount'],
      dtype='object')

In [16]:
df.region.unique()

array(['Flanders', 'Wallonie', 'Brussels'], dtype=object)

In [17]:
columns_to_encode = ['province', 'region']
columns_to_encode = [col for col in columns_to_encode if col in df.columns]
df = pd.get_dummies(df, columns=columns_to_encode)
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'district',
       'fireplace', 'floodingzone', 'furnished', 'garden', 'kitchen',
       'livingarea', 'locality', 'numberoffacades', 'peb', 'postalcode',
       'price', 'showercount', 'stateofbuilding', 'subtypeofproperty',
       'surfaceofplot', 'swimmingpool', 'terrace', 'toiletcount',
       'typeofproperty', 'new_roomcount', 'province_Antwerp',
       'province_Brussels', 'province_East Flanders',
       'province_Flemish Brabant', 'province_Hainaut', 'province_Limburg',
       'province_Liège', 'province_Luxembourg', 'province_Namur',
       'province_Walloon Brabant', 'province_West Flanders', 'region_Brussels',
       'region_Flanders', 'region_Wallonie'],
      dtype='object')

In [18]:
columns_to_encode = ['locality', 'postalcode']
columns_to_encode = [col for col in columns_to_encode if col in df.columns]
df = pd.get_dummies(df, columns=columns_to_encode)
df.columns

Index(['bathroomcount', 'bedroomcount', 'constructionyear', 'district',
       'fireplace', 'floodingzone', 'furnished', 'garden', 'kitchen',
       'livingarea',
       ...
       'postalcode_9968', 'postalcode_9970', 'postalcode_9971',
       'postalcode_9980', 'postalcode_9981', 'postalcode_9982',
       'postalcode_9988', 'postalcode_9990', 'postalcode_9991',
       'postalcode_9992'],
      dtype='object', length=6324)

In [19]:
df.subtypeofproperty.unique()

array(['flat_studio', 'apartment_block', 'house', 'apartment', 'kot',
       'ground_floor', 'mixed_use_building', 'penthouse', 'loft',
       'duplex', 'town_house', 'villa', 'mansion', 'triplex',
       'service_flat', 'bungalow', 'country_cottage', 'farmhouse',
       'exceptional_property', 'chalet', 'manor_house', 'other_property',
       'castle', 'pavilion'], dtype=object)

In [20]:
sorted_subtypeofproperty = {
    'flat_studio': 1, 
    'apartment': 2, 
    'service_flat': 3,
    'kot': 4,
    'ground_floor': 5,
    'house': 6,
    'loft': 7,
    'duplex': 8, 
    'triplex': 9, 
    'town_house': 10, 
    'bungalow': 11, 
    'apartment_block': 12, 
    'mixed_use_building': 13, 
    'penthouse': 14, 
    'chalet': 15, 
    'country_cottage': 16, 
    'farmhouse': 17, 
    'villa': 18, 
    'manor_house': 19, 
    'mansion': 20, 
    'castle': 21, 
    'pavilion': 22,
    'exceptional_property': 23,
    'other_property': 24


}
df['subtypeofproperty'] = df['subtypeofproperty'].replace(sorted_subtypeofproperty)

  df['subtypeofproperty'] = df['subtypeofproperty'].replace(sorted_subtypeofproperty)


In [21]:
df.subtypeofproperty.unique()

array([ 1, 12,  6,  2,  4,  5, 13, 14,  7,  8, 10, 18, 20,  9,  3, 11, 16,
       17, 23, 15, 19, 24, 21, 22], dtype=int64)

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['district'] = label_encoder.fit_transform(df['district'])


In [23]:
df.to_csv('last_dataset.csv', index=False)