In [971]:
import pandas as pd
import numpy as np
import re

In [972]:
df = pd.read_csv('../csv_files/properties_correlated.csv', low_memory=False)
df.head()

Unnamed: 0,source,hyperlink,locality,postcode,house_is,property_subtype,price,sale,rooms_number,area,...,open_fire,terrace,terrace_area,garden,garden_area,land_surface,land_plot_surface,facades_number,swimming_pool_has,building_state
0,6,8901695,4180,4180.0,True,MIXED_USE_BUILDING,295000,,3,242,...,False,True,36,True,1000,1403,1403,0,False,GOOD
1,6,8747010,8730,8730.0,True,VILLA,675000,,4,349,...,False,False,0,True,977,1526,1526,0,False,AS_NEW
2,6,8775843,4020,4020.0,True,APARTMENT_BLOCK,250000,,5,303,...,False,False,0,False,0,760,760,0,False,TO_RENOVATE
3,6,8910441,1200,1200.0,True,HOUSE,545000,,4,235,...,False,False,0,False,0,63,63,0,False,JUST_RENOVATED
4,6,8758672,1190,1190.0,True,MIXED_USE_BUILDING,500000,,2,220,...,False,False,0,True,60,193,193,0,False,AS_NEW


In [973]:
df.shape

(93068, 22)

# Remove leading and trailing spaces from column names

In [974]:
df.columns = [x.strip(' ') for x in df.columns.values]

# Remove leading and trailing spaces of every element

In [975]:
# remove leading and trailing spaces and newline characters from values if they are a string
df = df.applymap(lambda x: x.strip() if type(x)==str else x)

#### 1. PostCode

In [976]:
# 1 Converts postCode into int64

df['postcode'] = df['postcode'].astype('Int64')
df['postcode'].dtypes

Int64Dtype()

#### 2. Price

In [977]:
# 2 Converting price
def grabs_strips(x):
    if type(x) == str:
        # return x.str.extract('(\d*\.?\d*)', expand=False).astype(float)
        return re.match(r'(\d*(,\d{3})*\.?\d*)', x).group()
    return x


df['price'] = df['price'].apply(grabs_strips)

In [978]:
# Conversion into float
df['price'] = pd.to_numeric(df['price'], errors='coerce')

In [979]:
df['price'].shape

(93068,)

In [980]:
df.dropna(subset=['price'], inplace=True)
df.shape

(75802, 22)

## 3.house_is

In [1033]:
# fonction to Update the most relevant value of proprety_subtype

def updates_house_is(row):
    
    house_sub_type = ['HOUSE','house','VILLA','EXCEPTIONAL_PROPERTY', 'MANSION', 'villa', 'House', 'TOWN_HOUSE'
                      , 'Villa', 'COUNTRY-COTTAGE' ]
    
    app_sub_type = ['APARTEMENT', 'APARTEMENT','apartment','MIXED_USE_BUILDING','Apartment','DUPLEX','PENTHOUSE',
                   'APARTMENT_BLOCK','GROUND_FLOOR', 'duplex', 'ground-floor', 'Loft/Attic', 'APARTMENT_GROUP'
                    , 'Penthouse', 'penthouse', 'flat-studio', 'APARTMENT', 'apartement', 'Apartement']
    
    if row['property_subtype'] in house_sub_type:
        return True
    elif row['property_subtype'] in app_sub_type:
        return False
    return np.nan

# Storing in house_is prop of the df
df['house_is'] = df.apply(updates_house_is, axis=1)

df['house_is'].replace({
    "True": True,
    "TRUE": True,
    "Yes": True,
    "False": False,
    "FALSE": False,
    "No": False,
    "NaN": 'unknown'
}, inplace=True)


df.house_is = df.house_is.astype('float64')

## 4.Sale

In [982]:
df.sale = df['sale'].replace({
    'Wohnung': "Unknown",
    'Appartement': "Unknown",
    'Apartamento': "Unknown",
    '': "Unknown",
    'None': "Unknown",
    "unknown":"Unknown",
    "Maison":"Unknown",
    "Huis":"Unknown",
    "House":"Unknown"
})

df.sale = df['sale'].fillna('Unknown')

## 5.property_subtype

In [1030]:
df.property_subtype = df['property_subtype'].replace({
    'house': "HOUSE",
    'House': "HOUSE",
    'apartment': "APARTMENT",
    '': "unknown",
    'villa': "VILLA",
    'Villa': "VILLA",
    "duplex":"DUPLEX",
    "Huis":"HOUSE",
    "Maison":"HOUSE",
    'penthouse':'PENTHOUSE',
    'flat-studio':'FLAT_STUDIO',
    'ground-floor':'GROUND_FLOOR',
    'loft':'LOFT',
    'castle':'CASTLE',
    'unkonwn':'unknown',
    'mansion':'MANSION',
    'bungalow':'BUNGALOW'
    
})

df.property_subtype = df.property_subtype.replace(r'^\d+', 'unkonwn', regex=True)

df.property_subtype = df['property_subtype'].fillna('unknown')

In [984]:
pd.options.display.max_rows = 4000
df.property_subtype.value_counts()

HOUSE                                  25441
APARTMENT                              20658
unknown                                 6024
VILLA                                   4891
APARTMENT_BLOCK                         3709
MIXED_USE_BUILDING                      3322
DUPLEX                                  1820
PENTHOUSE                               1646
GROUND_FLOOR                            1320
Flat                                    1099
FLAT_STUDIO                              991
EXCEPTIONAL_PROPERTY                     905
MANSION                                  810
TOWN_HOUSE                               403
COUNTRY_COTTAGE                          346
SERVICE_FLAT                             304
LOFT                                     289
BUNGALOW                                 227
apartment-block                          167
mixed-use-building                       146
FARMHOUSE                                117
MANOR_HOUSE                              113
mansion   

## 9.room_number

In [985]:
# replace None to np.nan
df.rooms_number.fillna(value=np.nan, inplace=True)
df.rooms_number[df.rooms_number==None]

Series([], Name: rooms_number, dtype: object)

In [986]:
# replace 'None' to np.nan
df.rooms_number = df.rooms_number.apply(lambda x : np.nan if x=='None' else x)
df.rooms_number[df.rooms_number=='None']

Series([], Name: rooms_number, dtype: object)

In [987]:
df.rooms_number = df.rooms_number.replace('Not specified', np.nan)

In [988]:
# change data type of rooms_number from object to float64
df.rooms_number = df.rooms_number.astype('float64')

## 10.area

In [989]:
# remove 'm2' in value of area
df.area = df.area.replace("[^0-9.-]", "", regex=True)

In [990]:
# replace empty in value of area
df.area = df.area.replace('', np.nan)

In [991]:
# replace None to np.nan
df.area.fillna(value=np.nan, inplace=True)

In [992]:
# replace 'None' to np.nan
df.area = df.area.apply(lambda x : np.nan if x=='None' else x)

In [993]:
# change data type from object to float64
df.area = df.area.astype('float64')

In [994]:
# replace zero in value of area
df.area = df.area.replace(0, np.nan)

## 11.kitchen_has

In [995]:
df.kitchen_has.value_counts(dropna=False)

True     53991
False    15764
NaN       6047
Name: kitchen_has, dtype: int64

In [996]:
# change data type from object to float64
df.kitchen_has = df.kitchen_has.astype('float64')

In [997]:
df.kitchen_has.value_counts(dropna=False)

1.0    53991
0.0    15764
NaN     6047
Name: kitchen_has, dtype: int64

## 12.furnished

In [998]:
# change data type from object to float64
df.furnished = df.furnished.astype('float64')

## 13.open_fire

In [999]:
# change data type from object to float64
df.open_fire = df.open_fire.astype('float64')

## 14.terrace

In [1000]:
# change numerical data to np.nan
df.terrace = df.terrace.replace(r'\d\.?\d?', True, regex=True)

In [1001]:
# replace string False to False
df.terrace = df.terrace.replace('False', False)

In [1002]:
# replace string False to False
df.terrace = df.terrace.replace('TRUE', True)

In [1003]:
# replace string False to False
df.terrace = df.terrace.replace('True', True)

In [1004]:
# change data type from object to bool
df.terrace = df.terrace.astype('float64')

In [1005]:
df.terrace.value_counts(dropna=False)

1.0    34219
0.0    25809
NaN    15774
Name: terrace, dtype: int64

## 15.terrace_area

In [1006]:
# replace 'None' to np.nan
df.terrace_area = df.terrace_area.apply(lambda x : np.nan if x=='None' else x)
df.terrace_area[df.terrace_area=='None']

Series([], Name: terrace_area, dtype: object)

In [1007]:
df.terrace_area = df.terrace_area.replace(True, np.nan)
df.terrace_area = df.terrace_area.replace('TRUE', np.nan)

In [1008]:
# change data type from object to float64
df.terrace_area = df.terrace_area.astype('float64')

# replace zero in value of area
df.terrace_area = df.terrace_area.replace(0, np.nan)

# replace '1' in value of area
df.terrace_area = df.terrace_area.replace(1, np.nan)

## 16.Garden

In [1009]:
# replace string False to False
df.garden = df.garden.replace('False', False)

In [1010]:
# replace string False to False
df.garden = df.garden.replace('True', True)

In [1011]:
# change data type from object to bool
df.garden = df.garden.astype('float64')

In [1012]:
df.garden = df.garden.apply(lambda x:1 if x>1 else x)

## 17.Garden Area

In [1013]:
# replace None to np.nan
df.garden_area.fillna(value=np.NaN, inplace=True)
df.garden_area[df.garden_area==None]

# replace 'None' to np.nan
df.garden_area = df.garden_area.apply(lambda x : np.nan if x=='None' else x)
df.garden_area[df.garden_area=='None']

# change data type of rooms_number from object to float64
df.garden_area = df.garden_area.astype('float64')

# replace zero in value of area
df.garden_area = df.garden_area.replace(0, np.nan)

# replace '1' in value of area
df.garden_area = df.garden_area.replace(1, np.nan)

## 18.land_surface

In [1014]:
# replace None to np.nan
df.land_surface.fillna(value=np.NaN, inplace=True)

# replace np.nan TO 0
df.land_surface = df.land_surface.replace(np.nan, 0)

# replace None to np.nan
df.land_surface.fillna(value=np.nan, inplace=True)

# replace 'None' to np.nan
df.land_surface = df.land_surface.apply(lambda x : np.nan if x=='None' else x)

# change data type of rooms_number from object to float64
df.land_surface = df.land_surface.astype('float64')

# replace zero in value of area
df.land_surface = df.land_surface.replace(0, np.nan)

# replace '1' in value of area
df.land_surface = df.land_surface.replace(1, np.nan)

In [1015]:
df.land_surface.value_counts()

100.0        280
150.0        268
200.0        220
300.0        217
120.0        215
250.0        184
1000.0       175
400.0        174
110.0        173
50.0         162
70.0         160
500.0        158
160.0        157
90.0         147
60.0         144
170.0        143
600.0        142
180.0        141
130.0        139
220.0        137
140.0        136
800.0        127
80.0         125
1200.0       125
700.0        124
260.0        111
75.0         107
1500.0       102
550.0         97
2000.0        96
350.0         95
280.0         93
330.0         91
240.0         90
230.0         87
105.0         85
450.0         82
40.0          79
900.0         78
270.0         77
210.0         75
125.0         74
1300.0        74
135.0         71
165.0         68
190.0         68
360.0         66
1800.0        66
85.0          66
340.0         65
185.0         62
290.0         60
320.0         60
225.0         59
45.0          59
145.0         59
115.0         59
750.0         57
155.0         

## 19.land_plot_surface

In [1016]:
# replace 'yes' from value to 0
df.land_plot_surface = df.land_plot_surface.replace("[^0-9.-]", "", regex=True)

# replace 'm2' from value of area
df.land_plot_surface = df.land_plot_surface.replace('', np.nan)

# replace None to np.nan
df.land_plot_surface.fillna(value=np.nan, inplace=True)

# replace 'None' to np.nan
df.land_plot_surface = df.land_plot_surface.apply(lambda x : np.nan if x=='None' else x)

# change data type of rooms_number from object to float64
df.land_plot_surface = df.land_plot_surface.astype('float64')

# replace zero in value of area
df.land_plot_surface = df.land_plot_surface.replace(0, np.nan)

# replace '1' in value of area
df.land_plot_surface = df.land_plot_surface.replace(1, np.nan)

## 20.facades_number

In [1017]:
# replace 'None' to np.nan
df.facades_number = df.facades_number.apply(lambda x : np.nan if x=='None' else x)
df.facades_number[df.facades_number=='None']

# change data type of facades_number from object to float64
df.facades_number = df.facades_number.astype('float64')

## 21.swimming_pool_has

In [1018]:
 #change numerical data to np.nan
df.swimming_pool_has = df.swimming_pool_has.replace(r'\d\.?\d?', np.nan, regex=True)

# replace string 'False' to False
df.swimming_pool_has = df.swimming_pool_has.replace('False', False)

# replace string 'FALSE' to False
df.swimming_pool_has = df.swimming_pool_has.replace('FALSE', False)

# replace string 'True' to True
df.swimming_pool_has = df.swimming_pool_has.replace('True', True)

# replace string 'TRUE' to True
df.swimming_pool_has = df.swimming_pool_has.replace('TRUE', True)

# change data type from object to bool
df.swimming_pool_has = df.swimming_pool_has.astype('float64')

df.swimming_pool_has.value_counts(dropna=False)

0.0    59365
NaN    14475
1.0     1962
Name: swimming_pool_has, dtype: int64

## 22.building_state

In [1019]:
# change numerical data to np.nan
df.building_state = df.building_state.replace(r'\d\.?\d?', np.nan, regex=True)
df.building_state = df.building_state.apply(lambda x : np.nan if x=='None' else x)
df.building_state = df['building_state'].replace({
    'Good':"GOOD",
    'As new':"AS_NEW",
    'To renovate':"TO_RENOVATE",
    'To restore':"TO_RESTORE",
    '':np.nan,
    'To be done up':"TO_BE_DONE_UP",
    "Just renovated":"JUST_RENOVATED",
    'Not specified':np.nan
})

# Check if there are columns with mixed data types ==> NO

In [1020]:
from pandas.api.types import infer_dtype
# print data type of each column to check if there are
# any mixed ones, turns out that there are none
def is_mixed(col):
    return infer_dtype(col)

df.apply(is_mixed)

# ==> there are no columns with 'mixed' part of the inferred datatype

source                integer
hyperlink              string
locality               string
postcode              integer
house_is             floating
property_subtype       string
price                floating
sale                   string
rooms_number         floating
area                 floating
kitchen_has          floating
furnished            floating
open_fire            floating
terrace              floating
terrace_area         floating
garden               floating
garden_area          floating
land_surface         floating
land_plot_surface    floating
facades_number       floating
swimming_pool_has    floating
building_state         string
dtype: object

## Find which kind of empties there are ==> there are only NaNs

In [1021]:
# are there any empty strings? ==> no
#print(np.where(df.applymap(lambda x: x == '')))

# are there any NaNs? ==> yes
np.where(pd.isnull(df))

(array([    1,     2,     2, ..., 75801, 75801, 75801]),
 array([14, 14, 16, ..., 18, 20, 21]))

# Display the percent of NaNs per column

In [1022]:
# display the percent of NaNs per column
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'%_missing': percent_missing})
missing_value_df = missing_value_df.sort_values('%_missing', ascending = False)
missing_value_df

Unnamed: 0,%_missing
garden_area,86.045223
terrace_area,70.008707
land_surface,67.032532
building_state,42.791747
land_plot_surface,37.830136
hyperlink,22.172238
terrace,20.809477
swimming_pool_has,19.095802
furnished,15.605129
house_is,14.142107


### Put 'unknown' in place of NaN for everything else than int64 and float64 columns
### Please run this after converting numerical columns like price and facades from string to integer.
#### Even a value of NaN might help predict the price, so to avoid the correlation algorithm skipping it?, and because NaN is not allowed, we replace it.

In [1023]:
# replace all NaNs in strings with 'unknown'
df_nanfilled = df.select_dtypes(exclude=['int64','float64']).replace(np.nan, 'unknown')
df.update(df_nanfilled)

# replace all 'None'/'none' strings with uknown
df_nonefilled = df.select_dtypes(exclude=['int64','float64']).replace('none', 'unknown')
df.update(df_nonefilled)
df_nonefilled = df.select_dtypes(exclude=['int64','float64']).replace('None', 'unknown')
df.update(df_nonefilled)

In [1024]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75802 entries, 0 to 93067
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             75802 non-null  int64  
 1   hyperlink          75802 non-null  object 
 2   locality           75802 non-null  object 
 3   postcode           72755 non-null  Int64  
 4   house_is           65082 non-null  float64
 5   property_subtype   75802 non-null  object 
 6   price              75802 non-null  float64
 7   sale               75802 non-null  object 
 8   rooms_number       75544 non-null  float64
 9   area               67176 non-null  float64
 10  kitchen_has        69755 non-null  float64
 11  furnished          63973 non-null  float64
 12  open_fire          70280 non-null  float64
 13  terrace            60028 non-null  float64
 14  terrace_area       22734 non-null  float64
 15  garden             68389 non-null  float64
 16  garden_area        105

# Remove duplicates
### should execute after fixing columns
### should execute after removing non-property detail or incomplete columns: source and hyperlink

In [1025]:
# drop columns 
df.drop(['source', 'hyperlink'], axis = 1, inplace = True)

# drop 100% duplicate rows
lenght_before = len(df)
df.drop_duplicates(ignore_index = True, inplace = True)
dropped = len(df) - lenght_before
print(f'Dropped: {dropped}')

Dropped: -21765


## 6.locality and postcode
### Drop postcode column, because postcode is more completely available in 'locality'
### first we fix 'locality' column to carry just postcode or 'unknown' (stripping sporadic address parts)

In [1026]:
df.drop('postcode', axis = 1, inplace = True)

# write a function that returns the cleaned postcode from elements
# containing the address
def clean_locality(locality): 
    # Search for the presence of a 4 digit number (starts with 1-9)
    if re.search('[1-9]\d{3}', locality):
        # get the number
        return re.findall("[1-9]\d{3}", locality)[0]
    else: 
        # if no postcode is inside insert 'unknown' 
        return 'unknown'
          
# Updated locality column
df['locality'] = df['locality'].apply(clean_locality)

## 7.Create a region column

In [1027]:
def get_region(locality):
    if locality == 'unknown':
        return 'unknown'
    else:
        if not re.search('[1-9]\d{3}', locality):
            print('Please run this on already cleaned locality column')
            return 'unknown'
        elif int(locality) >= 1000 and int(locality) <=1299:
            return 'Brussels'
        elif int(locality) >= 1300 and int(locality) <=1499:
            return 'Wallonia'
        elif int(locality) >= 4000 and int(locality) <=7999:
            return 'Wallonia'
        else:
            return 'Flanders'
        
df['region'] = df['locality'].apply(get_region)

# Print unique values per column

In [1041]:
uniques = pd.DataFrame()
for col in df:
    col_uniques = pd.DataFrame({f'{col}_value': df[f'{col}'].value_counts().index,
                                f'{col}_count': df[f'{col}'].value_counts().values})
    uniques = pd.concat([uniques, col_uniques], axis = 1)

uniques.iloc[0:30, 10:20].head(30)

Unnamed: 0,rooms_number_value,rooms_number_count,area_value,area_count,kitchen_has_value,kitchen_has_count,furnished_value,furnished_count,open_fire_value,open_fire_count
0,3.0,17567.0,150.0,954.0,1.0,39185.0,0.0,36600.0,0.0,45458.0
1,2.0,14765.0,120.0,932.0,0.0,8901.0,1.0,5789.0,1.0,3144.0
2,4.0,8808.0,100.0,891.0,,,,,,
3,1.0,4387.0,90.0,827.0,,,,,,
4,5.0,3843.0,160.0,808.0,,,,,,
5,6.0,1759.0,200.0,757.0,,,,,,
6,0.0,935.0,140.0,749.0,,,,,,
7,7.0,669.0,180.0,718.0,,,,,,
8,8.0,342.0,80.0,713.0,,,,,,
9,9.0,194.0,110.0,703.0,,,,,,


In [1036]:
df.to_csv('../csv_files/cleaned_properties.csv')