In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('malasana.csv', index_col = 0)
df.shape

(2500, 42)

In [3]:
df = df[['price', 'propertyType', 'size', 'exterior', 'rooms',
         'bathrooms', 'address', 'province', 'municipality',
         'latitude', 'longitude', 'status', 'newDevelopment', 'hasLift',
         'parkingSpace', 'priceByArea', 'detailedType', 'highlight']]

In [4]:
df

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,longitude,status,newDevelopment,hasLift,parkingSpace,priceByArea,detailedType,highlight
0,709000.0,flat,85.0,True,2,2,barrio Trafalgar,Madrid,Madrid,40.431917,-3.699516,good,False,True,,8341.0,{'typology': 'flat'},
1,2400000.0,flat,429.0,True,5,5,Calle de Covarrubias,Madrid,Madrid,40.431858,-3.699549,good,False,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5594.0,{'typology': 'flat'},{'groupDescription': 'Top'}
2,1500000.0,flat,241.0,True,4,2,Calle de Juan de Austria,Madrid,Madrid,40.432368,-3.699781,renew,False,True,,6224.0,{'typology': 'flat'},{'groupDescription': 'Destacado'}
3,178000.0,flat,25.0,True,1,1,"Calle de Alburquerque, 29",Madrid,Madrid,40.431554,-3.699969,good,False,False,,7120.0,{'typology': 'flat'},{'groupDescription': 'Destacado'}
4,225000.0,studio,35.0,False,0,1,Calle de Hartzenbusch,Madrid,Madrid,40.432004,-3.700349,good,False,True,,6429.0,"{'typology': 'flat', 'subTypology': 'studio'}",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,830000.0,flat,175.0,True,5,4,Calle de Hilarión Eslava,Madrid,Madrid,40.435605,-3.715288,good,False,True,,4743.0,{'typology': 'flat'},
2496,889000.0,flat,118.0,True,2,2,Mesonero Romanos,Madrid,Madrid,40.420548,-3.706232,good,False,True,,7534.0,{'typology': 'flat'},{'groupDescription': 'Destacado'}
2497,4200000.0,flat,471.0,True,6,5,Calle de Velázquez,Madrid,Madrid,40.425091,-3.685739,good,False,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",8917.0,{'typology': 'flat'},{'groupDescription': 'Destacado'}
2498,1050000.0,duplex,233.0,True,3,4,barrio Nuevos Ministerios-Ríos Rosas,Madrid,Madrid,40.444252,-3.703076,good,False,True,,4506.0,"{'typology': 'flat', 'subTypology': 'duplex'}",


In [5]:
# Exterior cleaned
df['exterior'] = df['exterior'].fillna(False)

In [6]:
# hasLift cleaned
df['hasLift'] = df['hasLift'].fillna(False)

In [7]:
# highlight cleaned
df['highlight'] = df['highlight'].fillna("{'groupDescription': None}")

In [8]:
# parkingSpace cleaned
df['parkingSpace'] = df['parkingSpace'].fillna("{'hasParkingSpace': False, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': None}")

# Feature engineering

In [9]:
# parkingSpace included fixed
df['parkingSpace'] = df['parkingSpace'].apply(lambda x: "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True, 'parkingSpacePrice': None}" if x == "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True}" else x)

In [10]:
# unique parkingSpaces are now consistent
df['parkingSpace'].unique()

array(["{'hasParkingSpace': False, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': None}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 100000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': True, 'parkingSpacePrice': None}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 50000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 65000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 90000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 30000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 70000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 60000.0}",
       "{'hasParkingSpace': True, 'isParkingSpaceIn

In [11]:
# fixing detailedType
df['detailedType'] = df['detailedType'].apply(lambda x: "{'typology': 'flat', 'subTypology': 'standardFlat'}" if x == "{'typology': 'flat'}" else x)
df['detailedType'] = df['detailedType'].apply(lambda x: "{'typology': 'chalet', 'subTypology': 'standardChalet'}" if x == "{'typology': 'chalet'}" else x)

In [12]:
# Converting string of dicts --> dicts

from ast import literal_eval # Library for string evaluation

df['parkingSpace'] = df['parkingSpace'].apply(literal_eval)
df['highlight'] = df['highlight'].apply(literal_eval)
df['detailedType'] = df['detailedType'].apply(literal_eval)

In [13]:
df = pd.concat([df,
           pd.json_normalize(df['parkingSpace']),
           pd.json_normalize(df['highlight']),
           pd.json_normalize(df['detailedType'])],
          axis = 1)

In [14]:
df

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,parkingSpace,priceByArea,detailedType,highlight,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,709000.0,flat,85.0,True,2,2,barrio Trafalgar,Madrid,Madrid,40.431917,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",8341.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},False,False,,,flat,standardFlat
1,2400000.0,flat,429.0,True,5,5,Calle de Covarrubias,Madrid,Madrid,40.431858,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",5594.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Top'},True,False,100000.0,Top,flat,standardFlat
2,1500000.0,flat,241.0,True,4,2,Calle de Juan de Austria,Madrid,Madrid,40.432368,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",6224.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Destacado'},False,False,,Destacado,flat,standardFlat
3,178000.0,flat,25.0,True,1,1,"Calle de Alburquerque, 29",Madrid,Madrid,40.431554,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",7120.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Destacado'},False,False,,Destacado,flat,standardFlat
4,225000.0,studio,35.0,False,0,1,Calle de Hartzenbusch,Madrid,Madrid,40.432004,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",6429.0,"{'typology': 'flat', 'subTypology': 'studio'}",{'groupDescription': None},False,False,,,flat,studio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,830000.0,flat,175.0,True,5,4,Calle de Hilarión Eslava,Madrid,Madrid,40.435605,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",4743.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': None},False,False,,,flat,standardFlat
2496,889000.0,flat,118.0,True,2,2,Mesonero Romanos,Madrid,Madrid,40.420548,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",7534.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Destacado'},False,False,,Destacado,flat,standardFlat
2497,4200000.0,flat,471.0,True,6,5,Calle de Velázquez,Madrid,Madrid,40.425091,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",8917.0,"{'typology': 'flat', 'subTypology': 'standardF...",{'groupDescription': 'Destacado'},True,True,,Destacado,flat,standardFlat
2498,1050000.0,duplex,233.0,True,3,4,barrio Nuevos Ministerios-Ríos Rosas,Madrid,Madrid,40.444252,...,"{'hasParkingSpace': False, 'isParkingSpaceIncl...",4506.0,"{'typology': 'flat', 'subTypology': 'duplex'}",{'groupDescription': None},False,False,,,flat,duplex


In [15]:
del df['parkingSpace']
del df['highlight']
del df['detailedType']

In [16]:
df2 = df

In [17]:
df = pd.read_csv('recleaned_df.csv', index_col = 0)

In [18]:
df.columns == df2.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [19]:
combined = pd.concat([df, df2], axis = 0)
combined.shape

(12369, 21)

In [20]:
combined = combined.drop_duplicates()
combined

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,status,newDevelopment,hasLift,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,335000.0,flat,136.0,True,3,2,SIGRID,Madrid,Rivas-Vaciamadrid,40.352160,...,good,False,True,2463.0,True,True,,Destacado,flat,standardFlat
1,1650000.0,flat,179.0,True,2,3,Calle del Príncipe de Vergara,Madrid,Madrid,40.433593,...,good,False,True,9218.0,True,True,,Destacado,flat,standardFlat
2,770000.0,flat,109.0,True,2,2,barrio Ibiza,Madrid,Madrid,40.416551,...,good,False,True,7064.0,False,False,,Destacado,flat,standardFlat
3,175000.0,flat,49.0,True,1,1,"Calle de los Morales, 10",Madrid,Madrid,40.361994,...,good,False,True,3571.0,True,False,25000.0,Destacado,flat,standardFlat
4,280000.0,flat,121.0,True,4,2,Calle del Río Ulla,Madrid,Madrid,40.432982,...,good,False,False,2314.0,False,False,,Destacado,flat,standardFlat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,830000.0,flat,175.0,True,5,4,Calle de Hilarión Eslava,Madrid,Madrid,40.435605,...,good,False,True,4743.0,False,False,,,flat,standardFlat
2496,889000.0,flat,118.0,True,2,2,Mesonero Romanos,Madrid,Madrid,40.420548,...,good,False,True,7534.0,False,False,,Destacado,flat,standardFlat
2497,4200000.0,flat,471.0,True,6,5,Calle de Velázquez,Madrid,Madrid,40.425091,...,good,False,True,8917.0,True,True,,Destacado,flat,standardFlat
2498,1050000.0,duplex,233.0,True,3,4,barrio Nuevos Ministerios-Ríos Rosas,Madrid,Madrid,40.444252,...,good,False,True,4506.0,False,False,,,flat,duplex


In [21]:
combined.to_csv('recleaned_df.csv', index = False)

# Recleaned

In [22]:
import pandas as pd

In [23]:
df

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,address,province,municipality,latitude,...,status,newDevelopment,hasLift,priceByArea,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice,groupDescription,typology,subTypology
0,335000.0,flat,136.0,True,3,2,SIGRID,Madrid,Rivas-Vaciamadrid,40.352160,...,good,False,True,2463.0,True,True,,Destacado,flat,standardFlat
1,1650000.0,flat,179.0,True,2,3,Calle del Príncipe de Vergara,Madrid,Madrid,40.433593,...,good,False,True,9218.0,True,True,,Destacado,flat,standardFlat
2,770000.0,flat,109.0,True,2,2,barrio Ibiza,Madrid,Madrid,40.416551,...,good,False,True,7064.0,False,False,,Destacado,flat,standardFlat
3,175000.0,flat,49.0,True,1,1,"Calle de los Morales, 10",Madrid,Madrid,40.361994,...,good,False,True,3571.0,True,False,25000.0,Destacado,flat,standardFlat
4,280000.0,flat,121.0,True,4,2,Calle del Río Ulla,Madrid,Madrid,40.432982,...,good,False,False,2314.0,False,False,,Destacado,flat,standardFlat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,160000.0,flat,90.0,True,3,1,Paseo de Vigo,Madrid,Fuenlabrada,40.294182,...,good,False,True,1778.0,True,True,,,flat,standardFlat
1345,220000.0,flat,101.0,True,3,2,Calle Valladolid,Madrid,Alcorcón,40.341681,...,good,False,True,2178.0,False,False,,,flat,standardFlat
1346,174900.0,flat,102.0,True,4,1,barrio Buena Vista,Madrid,Madrid,40.370577,...,good,False,False,1715.0,False,False,,,flat,standardFlat
1347,219000.0,flat,109.0,True,4,2,Calle del Grafito,Madrid,Madrid,40.347064,...,good,False,True,2009.0,True,True,,Destacado,flat,standardFlat
