In [1]:
import pandas as pd

In [2]:
# Load concepcion.csv, using the file's first column as the row index, then
# report (rows, columns) as a quick size check.
df = pd.read_csv(filepath_or_buffer='concepcion.csv', index_col=0)
df.shape

(2500, 42)

In [3]:
# Keep only the columns listed below, in this order; every other column from
# the loaded file is dropped here.
selected_columns = [
    'price', 'propertyType', 'size', 'exterior', 'rooms', 'bathrooms',
    'address', 'province', 'municipality', 'latitude', 'longitude',
    'status', 'newDevelopment', 'hasLift', 'parkingSpace',
    'priceByArea', 'detailedType', 'highlight',
]
df = df[selected_columns]

In [4]:
# Display the trimmed frame to inspect the raw values before cleaning
# (missing flags and the dict-like string columns are visible here).
df

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,longitude,status,newDevelopment,hasLift,parkingSpace,priceByArea,detailedType,highlight
0,295000.0,flat,67.0,True,1,1,Calle de los Misterios,Madrid,Madrid,40.441226,-3.643123,good,False,False,,4403.0,{'typology': 'flat'},
1,710000.0,chalet,172.0,,2,2,barrio Concepción,Madrid,Madrid,40.441675,-3.644040,renew,False,,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",4128.0,"{'typology': 'chalet', 'subTypology': 'semidet...",
2,800000.0,flat,101.0,True,2,2,barrio Concepción,Madrid,Madrid,40.441166,-3.643758,newdevelopment,True,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",7921.0,{'typology': 'flat'},
3,995000.0,flat,190.0,True,3,2,barrio Concepción,Madrid,Madrid,40.441759,-3.644219,newdevelopment,True,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5237.0,{'typology': 'flat'},
4,1020000.0,flat,177.0,True,3,2,barrio Concepción,Madrid,Madrid,40.440526,-3.643084,newdevelopment,True,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5763.0,{'typology': 'flat'},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,699000.0,flat,65.0,False,2,2,Calle Conde De Peñalver,Madrid,Madrid,40.428102,-3.673420,renew,False,True,,10754.0,{'typology': 'flat'},
2496,980000.0,flat,150.0,True,4,2,barrio Lista,Madrid,Madrid,40.438901,-3.678145,good,False,True,,6533.0,{'typology': 'flat'},
2497,1870000.0,penthouse,201.0,True,3,2,Calle del Príncipe de Vergara,Madrid,Madrid,40.437791,-3.677965,renew,False,True,,9303.0,"{'typology': 'flat', 'subTypology': 'penthouse'}",
2498,401500.0,flat,98.0,True,4,1,Pasaje San Martin de Valdeiglesias,Madrid,Madrid,40.442448,-3.678345,renew,False,False,,4097.0,{'typology': 'flat'},{'groupDescription': 'Destacado'}


In [5]:
# Exterior cleaned: a missing flag is treated as False.
# The column is cast through the nullable 'boolean' dtype before filling so the
# result is a plain bool column on every pandas version; calling fillna(False)
# directly on an object column relies on implicit object->bool downcasting,
# which pandas 2.2 deprecated.
df['exterior'] = df['exterior'].astype('boolean').fillna(False).astype(bool)

In [6]:
# hasLift cleaned: a missing flag is treated as False.
# The column is cast through the nullable 'boolean' dtype before filling so the
# result is a plain bool column on every pandas version; calling fillna(False)
# directly on an object column relies on implicit object->bool downcasting,
# which pandas 2.2 deprecated.
df['hasLift'] = df['hasLift'].astype('boolean').fillna(False).astype(bool)

In [7]:
# highlight cleaned: rows without a highlight receive a placeholder dict
# literal (still a string at this point) whose single key maps to None.
missing_highlight = "{'groupDescription': None}"
df['highlight'] = df['highlight'].fillna(value=missing_highlight)

In [8]:
# parkingSpace cleaned: rows without parking information receive a dict
# literal (still a string at this point) stating "no parking space".
no_parking = (
    "{'hasParkingSpace': False, "
    "'isParkingSpaceIncludedInPrice': False, "
    "'parkingSpacePrice': None}"
)
df['parkingSpace'] = df['parkingSpace'].fillna(value=no_parking)

# Feature engineering

In [9]:
# parkingSpace included fixed: the "parking included in price" literal has no
# 'parkingSpacePrice' key; swap that exact string for one that carries the key
# with a None value. Any other value is left untouched.
df['parkingSpace'] = df['parkingSpace'].replace(
    to_replace="{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True}",
    value="{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True, 'parkingSpacePrice': None}",
)

In [10]:
# unique parkingSpaces are now consistent: list the distinct literals so the
# key layout of each one can be checked by eye.
pd.unique(df['parkingSpace'])

array(["{'hasParkingSpace': False, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': None}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True, 'parkingSpacePrice': None}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 25000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 18000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 24000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 15000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 20000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 21000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 19000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceInc

In [11]:
# fixing detailedType: a bare flat / chalet typology is rewritten to include an
# explicit "standard" subTypology; every other value passes through unchanged.
subtypology_defaults = {
    "{'typology': 'flat'}": "{'typology': 'flat', 'subTypology': 'standardFlat'}",
    "{'typology': 'chalet'}": "{'typology': 'chalet', 'subTypology': 'standardChalet'}",
}
df['detailedType'] = df['detailedType'].map(
    lambda raw: subtypology_defaults.get(raw, raw)
)

In [12]:
# Converting string of dicts --> dicts

from ast import literal_eval # Library for string evaluation


def _parse_dict_literal(raw):
    """Parse a dict literal held in a string into a Python object.

    A value that is already a dict (for example because this cell was run
    before) is returned unchanged, so re-running the cell does not raise.
    Anything else is handed to ``literal_eval``, which raises on values that
    are not valid literals.
    """
    if isinstance(raw, dict):
        return raw
    return literal_eval(raw)


df['parkingSpace'] = df['parkingSpace'].apply(_parse_dict_literal)
df['highlight'] = df['highlight'].apply(_parse_dict_literal)
df['detailedType'] = df['detailedType'].apply(_parse_dict_literal)

In [13]:
def _expand_dict_column(parsed):
    """Flatten a Series of dicts into a DataFrame labelled like the Series.

    ``pd.concat(axis=1)`` matches rows by index label, and the frame returned
    by ``json_normalize`` is not guaranteed to reuse the Series' labels, so the
    labels are copied over explicitly to keep every expanded row next to the
    row it came from.
    """
    flat = pd.json_normalize(parsed.tolist())
    flat.index = parsed.index
    return flat


# Append the flattened parkingSpace / highlight / detailedType keys as new
# columns to the right of the existing ones.
df = pd.concat([df,
           _expand_dict_column(df['parkingSpace']),
           _expand_dict_column(df['highlight']),
           _expand_dict_column(df['detailedType'])],
          axis = 1)

In [14]:
# Display the frame again to check the newly appended columns sit beside the
# dict columns they were expanded from.
df

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,parkingSpace,priceByArea,detailedType,highlight,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,295000.0,flat,67.0,True,1,1,Calle de los Misterios,Madrid,Madrid,40.441226,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",4403.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},False,False,,,flat,standardFlat
1,710000.0,chalet,172.0,False,2,2,barrio Concepción,Madrid,Madrid,40.441675,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",4128.0,"{'typology': 'chalet', 'subTypology': 'semidet...",{'groupDescription': None},True,True,,,chalet,semidetachedHouse
2,800000.0,flat,101.0,True,2,2,barrio Concepción,Madrid,Madrid,40.441166,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",7921.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},True,True,,,flat,standardFlat
3,995000.0,flat,190.0,True,3,2,barrio Concepción,Madrid,Madrid,40.441759,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5237.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},True,True,,,flat,standardFlat
4,1020000.0,flat,177.0,True,3,2,barrio Concepción,Madrid,Madrid,40.440526,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5763.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},True,True,,,flat,standardFlat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,699000.0,flat,65.0,False,2,2,Calle Conde De Peñalver,Madrid,Madrid,40.428102,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",10754.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},False,False,,,flat,standardFlat
2496,980000.0,flat,150.0,True,4,2,barrio Lista,Madrid,Madrid,40.438901,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",6533.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},False,False,,,flat,standardFlat
2497,1870000.0,penthouse,201.0,True,3,2,Calle del Príncipe de Vergara,Madrid,Madrid,40.437791,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",9303.0,"{'typology': 'flat', 'subTypology': 'penthouse'}",{'groupDescription': None},False,False,,,flat,penthouse
2498,401500.0,flat,98.0,True,4,1,Pasaje San Martin de Valdeiglesias,Madrid,Madrid,40.442448,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",4097.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Destacado'},False,False,,Destacado,flat,standardFlat


In [15]:
# The dict columns have been expanded into flat columns, so remove the
# originals.
df = df.drop(columns=['parkingSpace', 'highlight', 'detailedType'])

In [16]:
# List the remaining column labels after the dict columns were removed.
df.columns

Index(['price', 'propertyType', 'size', 'exterior', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'latitude', 'longitude',
       'status', 'newDevelopment', 'hasLift', 'priceByArea', 'hasParkingSpace',
       'isParkingSpaceIncludedInPrice', 'parkingSpacePrice',
       'groupDescription', 'typology', 'subTypology'],
      dtype='object')

In [24]:
# Load the rows saved to recleaned_df.csv (the same file this notebook writes
# further down). When the file does not exist yet, fall back to an empty frame
# with df's columns so the following cells still run and the stacked result is
# just df.
from pathlib import Path

if Path('recleaned_df.csv').exists():
    df2 = pd.read_csv('recleaned_df.csv')
else:
    df2 = df.iloc[:0].copy()

In [25]:
# Show df's column labels once more, for comparison with df2's columns.
df.columns

Index(['price', 'propertyType', 'size', 'exterior', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'latitude', 'longitude',
       'status', 'newDevelopment', 'hasLift', 'priceByArea', 'hasParkingSpace',
       'isParkingSpaceIncludedInPrice', 'parkingSpacePrice',
       'groupDescription', 'typology', 'subTypology'],
      dtype='object')

In [26]:
# Schema check before stacking the two frames: stop the run when the column
# labels differ in name or order. A bare elementwise == only displays a result
# (and raises a length error when the widths differ), so a mismatch would
# otherwise go unnoticed. The per-column comparison is still shown afterwards.
assert df.columns.equals(df2.columns), (
    "df and df2 columns differ in name or order; labels present in only one "
    f"frame: {list(df.columns.symmetric_difference(df2.columns))}"
)
df.columns == df2.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [27]:
# Stack the freshly cleaned rows on top of the previously saved ones (row
# labels from both frames are kept as they are) and report the resulting size.
combined = pd.concat((df, df2), axis='index')
combined.shape

(14810, 21)

In [28]:
# Keep the first occurrence of each fully identical row and discard the later
# repeats, then display the result.
combined = combined[~combined.duplicated(keep='first')]
combined

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,status,newDevelopment,hasLift,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,295000.0,flat,67.0,True,1,1,Calle de los Misterios,Madrid,Madrid,40.441226,...,good,False,False,4403.0,False,False,,,flat,standardFlat
1,710000.0,chalet,172.0,False,2,2,barrio Concepción,Madrid,Madrid,40.441675,...,renew,False,False,4128.0,True,True,,,chalet,semidetachedHouse
2,800000.0,flat,101.0,True,2,2,barrio Concepción,Madrid,Madrid,40.441166,...,newdevelopment,True,True,7921.0,True,True,,,flat,standardFlat
3,995000.0,flat,190.0,True,3,2,barrio Concepción,Madrid,Madrid,40.441759,...,newdevelopment,True,True,5237.0,True,True,,,flat,standardFlat
4,1020000.0,flat,177.0,True,3,2,barrio Concepción,Madrid,Madrid,40.440526,...,newdevelopment,True,True,5763.0,True,True,,,flat,standardFlat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12305,830000.0,flat,175.0,True,5,4,Calle de Hilarión Eslava,Madrid,Madrid,40.435605,...,good,False,True,4743.0,False,False,,,flat,standardFlat
12306,889000.0,flat,118.0,True,2,2,Mesonero Romanos,Madrid,Madrid,40.420548,...,good,False,True,7534.0,False,False,,Destacado,flat,standardFlat
12307,4200000.0,flat,471.0,True,6,5,Calle de Velázquez,Madrid,Madrid,40.425091,...,good,False,True,8917.0,True,True,,Destacado,flat,standardFlat
12308,1050000.0,duplex,233.0,True,3,4,barrio Nuevos Ministerios-Ríos Rosas,Madrid,Madrid,40.444252,...,good,False,True,4506.0,False,False,,,flat,duplex


In [29]:
# Persist the de-duplicated rows, overwriting recleaned_df.csv; row labels are
# not written to the file.
combined.to_csv(path_or_buf='recleaned_df.csv', index=False)

# Recleaned

In [30]:
import pandas as pd

In [34]:
# Read the saved file back and display it to confirm what was written.
pd.read_csv(filepath_or_buffer='recleaned_df.csv')

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,status,newDevelopment,hasLift,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,295000.0,flat,67.0,True,1,1,Calle de los Misterios,Madrid,Madrid,40.441226,...,good,False,False,4403.0,False,False,,,flat,standardFlat
1,710000.0,chalet,172.0,False,2,2,barrio Concepción,Madrid,Madrid,40.441675,...,renew,False,False,4128.0,True,True,,,chalet,semidetachedHouse
2,800000.0,flat,101.0,True,2,2,barrio Concepción,Madrid,Madrid,40.441166,...,newdevelopment,True,True,7921.0,True,True,,,flat,standardFlat
3,995000.0,flat,190.0,True,3,2,barrio Concepción,Madrid,Madrid,40.441759,...,newdevelopment,True,True,5237.0,True,True,,,flat,standardFlat
4,1020000.0,flat,177.0,True,3,2,barrio Concepción,Madrid,Madrid,40.440526,...,newdevelopment,True,True,5763.0,True,True,,,flat,standardFlat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14700,830000.0,flat,175.0,True,5,4,Calle de Hilarión Eslava,Madrid,Madrid,40.435605,...,good,False,True,4743.0,False,False,,,flat,standardFlat
14701,889000.0,flat,118.0,True,2,2,Mesonero Romanos,Madrid,Madrid,40.420548,...,good,False,True,7534.0,False,False,,Destacado,flat,standardFlat
14702,4200000.0,flat,471.0,True,6,5,Calle de Velázquez,Madrid,Madrid,40.425091,...,good,False,True,8917.0,True,True,,Destacado,flat,standardFlat
14703,1050000.0,duplex,233.0,True,3,4,barrio Nuevos Ministerios-Ríos Rosas,Madrid,Madrid,40.444252,...,good,False,True,4506.0,False,False,,,flat,duplex
