In [1]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
# Load the output of the first cleaning pass. Forward slashes resolve on every
# OS (Windows included) and keep backslash escape sequences out of the literal.
properties = pd.read_csv('../data/cleaned_output.csv', skip_blank_lines=True)


In [2]:
# Read dataframe

# Data Cleaning 2.0
After analysing my data, I want to use it for predictions. Before building my model, I need to further clean my data of:
* Any NaN values
* Any string values
* Any duplicates

I have already confirmed that there are no duplicates, so the first thing to deal with will be strings, then NaNs.

Columns `id` and `type of sale` need dropping since they contain no useful data for the model. The `state of the building` column will be converted to binary: properties that are renovated or in good condition (`just_renovated`, `as_new`, `good`) are encoded as 1s, and properties that need any sort of renovation are encoded as 0s.

In [3]:
# Columns described above as carrying no useful data; removed before modelling.
# NOTE(review): cleanup.drop_column lives in src/cleanup.py — presumably it
# returns the frame without the listed columns; confirm there.
columns = ['id','type of sale']
properties = cleanup.drop_column(properties, columns)

Now clean the renovation data:

In [4]:
# Replace specific value in a given column with new_value
def replace_col_value(df, column, value, new_value):
    """Replace every occurrence of ``value`` in ``df[column]`` with ``new_value``.

    The frame is modified in place and also returned, so the call works both
    for its side effect and in an assignment (``df = replace_col_value(...)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update.
    column : str
        Label of the column to scan.
    value : object
        Value to look for. Compared with ``==``, so NaN never matches.
    new_value : object
        Replacement written into the matching rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # A positional boolean mask touches only the rows that actually match,
    # even when index labels are duplicated (label-based ``df.at`` would
    # overwrite every row sharing the label), and avoids a Python-level loop
    # over all rows.
    matches = (df[column] == value).to_numpy()
    df.loc[matches, column] = new_value
    return df

In [5]:
# Binary-encode 'state of the building': labels meaning "needs work" become 0,
# labels meaning "renovated / good condition" become 1.
renovated = ['just_renovated', 'as_new', 'good']
state_encoding = {'needs renovating': 0, '0': 0, **dict.fromkeys(renovated, 1)}

# Dict order is insertion order, so the helper is called in the same sequence
# as listing the replacements one by one.
for raw_state, flag in state_encoding.items():
    properties = cleanup.replace_col_value(properties, 'state of the building', raw_state, flag)

# Last expression of the cell: show the remaining distinct values.
cleanup.unique_df_values(properties)




state of the building
1    8278
0    3714
Name: count, dtype: int64 



Checking for NaNs:


In [6]:
# Preview the first rows to see which columns still hold NaNs.
properties.head()

Unnamed: 0,type of property,subtype of property,province,locality,postalCode,price,number of bedrooms,living area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building
0,HOUSE,HOUSE,Liège,Saint-Nicolas,4420.0,20000.0,3,38,1.0,0.0,0,1.0,,,,206.0,194.0,,0,1
1,HOUSE,CHALET,Luxembourg,DURBUY,6940.0,22000.0,2,0,,0.0,0,1.0,15.0,,,40.0,200.0,4.0,0,0
2,APARTMENT,APARTMENT,Hainaut,La Louviere,7110.0,39000.0,1,0,0.0,0.0,0,,,,,44.0,0.0,2.0,0,0
3,HOUSE,HOUSE,Liège,Flémalle,4400.0,40000.0,2,21,0.0,0.0,0,,,,,106.0,110.0,3.0,0,0
4,HOUSE,HOUSE,Liège,Liège Wandre,4020.0,59000.0,4,29,0.0,0.0,0,,,,,144.0,80.0,2.0,0,0


Replace NaNs in `terrace`/`garden` with 1s if the corresponding area value (`terrace area`/`garden area`) exists:

In [7]:
# Each pair is (area column, flag column). The helper is asked to write 1 into
# the flag column; per the note above this happens where the flag is NaN but
# an area value exists — presumably implemented that way in src/cleanup.py.
for area_column, flag_column in (('terrace area', 'terrace'), ('garden area', 'garden')):
    properties = cleanup.check_for_adjascent_data(properties, area_column, flag_column, 1)


Before replacing the other garden and terrace NaNs with zeroes, sort the rows by living area:

In [8]:
# Order the rows by 'living area' using the project's cleanup helper, then
# preview the result. (A bare `properties.head()` before the sort was dropped:
# only the last expression of a cell is displayed, so it had no effect.)
properties = cleanup.sort_by_column(properties, 'living area')
properties.head()

Unnamed: 0,type of property,subtype of property,province,locality,postalCode,price,number of bedrooms,living area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building
5995,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3,0,1.0,,1,1.0,25.0,1.0,,140.0,2590.0,4.0,0,0
6689,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3,0,1.0,0.0,0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0,1
6678,APARTMENT,APARTMENT,West Flanders,Nieuwpoort,8620.0,389000.0,2,0,1.0,0.0,0,,,1.0,75.0,100.0,,4.0,0,1
6668,HOUSE,HOUSE,Antwerp,Heist-op-den-Berg,2220.0,389000.0,3,0,,,1,1.0,,1.0,,151.0,792.0,4.0,0,0
6667,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3,0,1.0,,1,,,,,133.0,347.0,3.0,0,1


Replace `furnished`, `open fire`, `terrace`, `terrace area`, `garden` and `garden area` NaNs with 0s.

In [9]:
# Columns whose NaNs are to be replaced (with 0s, per the note above).
columns = ['furnished', 'open fire', 'terrace', 'terrace area', 'garden', 'garden area']

# Apply the project's NaN-replacement helper one column at a time.
for column_name in columns:
    properties = cleanup.replace_nan(properties, column_name)

# Preview the frame after the fill.
properties.head()

Unnamed: 0,type of property,subtype of property,province,locality,postalCode,price,number of bedrooms,living area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building
5995,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3,0,1.0,0.0,1,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0,0
6689,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3,0,1.0,0.0,0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0,1
6678,APARTMENT,APARTMENT,West Flanders,Nieuwpoort,8620.0,389000.0,2,0,1.0,0.0,0,0.0,0.0,1.0,75.0,100.0,,4.0,0,1
6668,HOUSE,HOUSE,Antwerp,Heist-op-den-Berg,2220.0,389000.0,3,0,,0.0,1,1.0,0.0,1.0,0.0,151.0,792.0,4.0,0,0
6667,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3,0,1.0,0.0,1,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0,1


Delete rows that are missing kitchen, total land area, total property area, or number-of-facades data:

In [10]:
# Show the shape before dropping rows, for reference.
display(properties.shape)

# A row is discarded if any one of these columns is NaN.
required_columns = [
    'fully equipped kitchen',
    'total land area',
    'total property area',
    'number of facades',
]
properties = properties.dropna(subset=required_columns)

(11992, 20)

In [11]:
# Count the remaining missing values per column and print the totals.
nan_counts = properties.isnull().sum()
print(nan_counts)

type of property          0
subtype of property       0
province                  0
locality                  0
postalCode                0
price                     0
number of bedrooms        0
living area               0
fully equipped kitchen    0
furnished                 0
open fire                 0
terrace                   0
terrace area              0
garden                    0
garden area               0
total property area       0
total land area           0
number of facades         0
swimming pool             0
state of the building     0
dtype: int64


In [12]:
# NOTE(review): the return value is displayed but not assigned back to
# `properties`. The conversion only reaches the CSV written below if
# column_type_float modifies the frame in place — confirm in src/cleanup.py.
cleanup.column_type_float(properties)

Unnamed: 0,type of property,subtype of property,province,locality,postalCode,price,number of bedrooms,living area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building
5995,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3,0,1.0,0.0,1,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0,0
6689,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3,0,1.0,0.0,0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0,1
6667,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3,0,1.0,0.0,1,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0,1
6666,HOUSE,HOUSE,West Flanders,Brugge,8000.0,389000.0,6,0,1.0,0.0,0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0,0
6665,HOUSE,HOUSE,West Flanders,Blankenberge,8370.0,389000.0,3,0,1.0,1.0,1,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9291,APARTMENT,LOFT,Liège,Liege,4020.0,599000.0,3,180,1.0,0.0,0,1.0,0.0,1.0,107.0,333.0,0.0,2.0,0,1
11102,APARTMENT,APARTMENT,Brussels,Forest,1190.0,1200000.0,3,210,1.0,0.0,0,0.0,0.0,0.0,0.0,315.0,0.0,2.0,0,1
3299,HOUSE,HOUSE,Hainaut,Luttre,6238.0,249900.0,3,378,1.0,0.0,0,1.0,40.0,1.0,50.0,260.0,180.0,3.0,0,1
11317,HOUSE,HOUSE,Brussels,Schaerbeek,1030.0,1400000.0,8,389,1.0,0.0,0,1.0,0.0,1.0,128.0,488.0,305.0,2.0,0,1


In [13]:
# Write the cleaned frame without the index column. Forward slashes resolve on
# every OS (Windows included) and keep backslash escape sequences out of the
# literal.
properties.to_csv('../data/cleaned_output_2.csv', index=False)