In [41]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
properties = pd.read_csv('..\data\cleaned_output.csv', skip_blank_lines=True)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Read dataframe

# Data Cleaning 2.0
After analysing my data, I want to use it for predicts. Before building my model I need to further clean my data of
* Any NaN values
* Any string values
* Any duplicates

I have already confirmed that there are no duplicates, so the first thing to deal with will be strings, then NaNs.

Columns `id` and `type of sale`  need dropping since these contain useless data. The `state of the building` column will be converted to binary with fully renovated properties processed as 1s, and properties that need any sort of renovation processed as 0s.

In [43]:
columns = ['id','type_of-sale']
properties = cleanup.drop_column(properties, columns)

Now clean the renovation data:

In [44]:
# Replace specific value in a given column with new_value
def replace_col_value(df, column, value, new_value):
    for index, row in df.iterrows():
        if row[column] == value:
            df.at[index, column] = new_value
    return df

In [45]:
properties = cleanup.replace_col_value(properties, 'state_of_the_building', 'needs renovating', 0)
properties = cleanup.replace_col_value(properties, 'state_of_the_building', '0', 0)

renovated = ['just_renovated', 'as_new', 'good']
for i in renovated:
    properties = cleanup.replace_col_value(properties, 'state_of_the_building', i, 1)

cleanup.unique_df_values(properties)




state_of_the_building
1    8278
0    3714
Name: count, dtype: int64 



Checking for NaNs:


In [46]:
properties.head()

Unnamed: 0,id,type_of_property,subtype_of_property,province,locality,postalCode,price,number_of_bedrooms,living_area,fully_equipped_kitchen,...,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
0,10492614,HOUSE,HOUSE,Liège,Saint-Nicolas,4420.0,20000.0,3.0,38.0,1.0,...,0.0,1.0,,,,206.0,194.0,,0.0,1
1,10664145,HOUSE,CHALET,Luxembourg,DURBUY,6940.0,22000.0,2.0,0.0,,...,0.0,1.0,15.0,,,40.0,200.0,4.0,0.0,0
2,10578200,APARTMENT,APARTMENT,Hainaut,La Louviere,7110.0,39000.0,1.0,0.0,0.0,...,0.0,,,,,44.0,0.0,2.0,0.0,0
3,10248737,HOUSE,HOUSE,Liège,Flémalle,4400.0,40000.0,2.0,21.0,0.0,...,0.0,,,,,106.0,110.0,3.0,0.0,0
4,10576318,HOUSE,HOUSE,Liège,Liège Wandre,4020.0,59000.0,4.0,29.0,0.0,...,0.0,,,,,144.0,80.0,2.0,0.0,0


Replace NaNs in terrace/garden with 1s if terrace area exists:

In [47]:
properties = cleanup.check_for_adjascent_data(properties, 'terrace_area', 'terrace', 1)
properties = cleanup.check_for_adjascent_data(properties, 'garden_area', 'garden', 1)
0

0

Replace other Garden and Terrace Nans with zeroes:

In [48]:
properties.head()
properties = cleanup.sort_by_column(properties, 'living_area')
properties.head()

Unnamed: 0,id,type_of_property,subtype_of_property,province,locality,postalCode,price,number_of_bedrooms,living_area,fully_equipped_kitchen,...,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
5995,10559233,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3.0,0.0,1.0,...,1.0,1.0,25.0,1.0,,140.0,2590.0,4.0,,0
6689,10673158,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3.0,0.0,1.0,...,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,,1
6678,10385356,APARTMENT,APARTMENT,West Flanders,Nieuwpoort,8620.0,389000.0,2.0,0.0,1.0,...,0.0,,,1.0,75.0,100.0,,4.0,0.0,1
6668,10678328,HOUSE,HOUSE,Antwerp,Heist-op-den-Berg,2220.0,389000.0,3.0,0.0,,...,1.0,1.0,,1.0,,151.0,792.0,4.0,,0
6667,10470833,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3.0,0.0,1.0,...,1.0,,,,,133.0,347.0,3.0,,1


Replace `furnished`, `open fire`, `terrace`, `terrace area`, `garden`, `garden area` and `swimming pool` nans with 0s.

In [49]:
columns = ['furnished', 'open_fire', 'terrace', 'terrace_area', 'garden', 'garden_area', 'swimming_pool']

for i in columns:
    properties = cleanup.replace_nan(properties, i)
properties.head()

Unnamed: 0,id,type_of_property,subtype_of_property,province,locality,postalCode,price,number_of_bedrooms,living_area,fully_equipped_kitchen,...,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
5995,10559233,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3.0,0.0,1.0,...,1.0,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0.0,0
6689,10673158,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3.0,0.0,1.0,...,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0.0,1
6678,10385356,APARTMENT,APARTMENT,West Flanders,Nieuwpoort,8620.0,389000.0,2.0,0.0,1.0,...,0.0,0.0,0.0,1.0,75.0,100.0,,4.0,0.0,1
6668,10678328,HOUSE,HOUSE,Antwerp,Heist-op-den-Berg,2220.0,389000.0,3.0,0.0,,...,1.0,1.0,0.0,1.0,0.0,151.0,792.0,4.0,0.0,0
6667,10470833,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0.0,1


Delete rows that are missing Kitchen Data

In [50]:
display(properties.shape)
properties = properties.dropna(subset=['fully_equipped_kitchen', 'total_land_area', 'total_property_area', 'number_of_facades'])

(11992, 21)

In [51]:
nan_counts = properties.isnull().sum()
print(nan_counts)

id                        0
type_of_property          0
subtype_of_property       0
province                  0
locality                  0
postalCode                0
price                     0
number_of_bedrooms        0
living_area               0
fully_equipped_kitchen    0
furnished                 0
open_fire                 0
terrace                   0
terrace_area              0
garden                    0
garden_area               0
total_property_area       0
total_land_area           0
number_of_facades         0
swimming_pool             0
state_of_the_building     0
dtype: int64


In [52]:
cleanup.column_type_float(properties)

Unnamed: 0,id,type_of_property,subtype_of_property,province,locality,postalCode,price,number_of_bedrooms,living_area,fully_equipped_kitchen,...,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
5995,10559233.0,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3.0,0.0,1.0,...,1.0,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0.0,0
6689,10673158.0,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3.0,0.0,1.0,...,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0.0,1
6667,10470833.0,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0.0,1
6666,10560493.0,HOUSE,HOUSE,West Flanders,Brugge,8000.0,389000.0,6.0,0.0,1.0,...,0.0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0.0,0
6665,10452028.0,HOUSE,HOUSE,West Flanders,Blankenberge,8370.0,389000.0,3.0,0.0,1.0,...,1.0,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9291,10395316.0,APARTMENT,LOFT,Liège,Liege,4020.0,599000.0,3.0,180.0,1.0,...,0.0,1.0,0.0,1.0,107.0,333.0,0.0,2.0,0.0,1
11102,9553014.0,APARTMENT,APARTMENT,Brussels,Forest,1190.0,1200000.0,3.0,210.0,1.0,...,0.0,0.0,0.0,0.0,0.0,315.0,0.0,2.0,0.0,1
3299,10337514.0,HOUSE,HOUSE,Hainaut,Luttre,6238.0,249900.0,3.0,378.0,1.0,...,0.0,1.0,40.0,1.0,50.0,260.0,180.0,3.0,0.0,1
11317,10666309.0,HOUSE,HOUSE,Brussels,Schaerbeek,1030.0,1400000.0,8.0,389.0,1.0,...,0.0,1.0,0.0,1.0,128.0,488.0,305.0,2.0,0.0,1


In [53]:
properties.to_csv('..\data\cleaned_output_modeling.csv', index=False)