# Cleaning the data

Transform the data from a MongoDB collection into a CSV file.


In [2]:
import pymongo

# Connection to Mongo DB (default host and port).
# MongoClient() connects lazily, so the constructor alone does not fail when the
# server is unreachable. The explicit ping forces a round trip so that a
# connection problem is reported here rather than at the first query.
try:
    conn = pymongo.MongoClient()
    conn.admin.command("ping")
    print("connected")
except pymongo.errors.ConnectionFailure as e:
    print ("Could not connect to MongoDB: %s" % e )
    # Stop here: the handles below cannot be created without a live connection.
    raise

# Handles used by the rest of the notebook.
db = conn["idealista"]
collection = db["house_data"]
urls = db["urls"]


connected


In [3]:
import pandas as pd
import math
import numpy as np
# Empty frame; the price cell below assigns its "price" column.
df = pd.DataFrame()

In [4]:
# Exploring the data: display one document to see which fields a listing has
# and how their values are formatted.

collection.find_one()

{'_id': ObjectId('5dece54c833639ed33008205'),
 'price': '173.000 €',
 'features': [' 35 m² ', ' 2 hab. '],
 'description': ' "En la zona peatonal del barrio de Sant Pere i Santa Caterina encontramos esta propiedad en la segunda planta del edificio antiguo, sin ascensor, típico de la zona. Se trata de un estudio reformado hace unos años con calidades apropiadas para las exigencias de la zona. El piso se vende amueblado tal y como está en las fotos. Puede ser interesante para su alquiler (alrededor de 850€/mes). Ubicado en pleno centro de Barcelona, con todos los servicios cerca, sin necesidad de tener el coche.Ubicado en la mejor y más buscada área del Born, esta obra se encuentra a pocos pasos de Arc de Triomf, Parc de la Ciutadella, muy bien comunicada con transporte público (tren, metro, autobuses)." ',
 'details_house': ['35 m² construidos',
  '2 habitaciones',
  '1 baño',
  'Construido en 1848'],
 'details_building': [' Aire acondicionado '],
 'energy_class': 'd',
 'address': [' ',

In [9]:
# Convert the scraped price strings (e.g. '173.000 €') to integers and store
# them in df["price"].

cursor = collection.find({},{"price":1})
l = []
for doc in cursor:
    # Remove the currency symbol and the '.' thousands separators, then trim.
    price_text = doc["price"].replace("€","").replace(".","").strip()
    try:
        price = int(price_text)
    except ValueError:
        # Not a plain number: show the offending text and store 0 as a placeholder.
        print(price_text)
        price = 0
    l.append(price)

df["price"]=l
print("done")
    

done


In [65]:
# features
def append_zero(*lists):
    """Append a 0 to every list passed in, modifying each list in place.

    Args:
        *lists: any number of list objects (anything with an ``append`` method).

    Returns:
        None.
    """
    # The loop variable is deliberately not called `list`, which would shadow
    # the builtin inside this function.
    for target in lists:
        target.append(0)

# Parse the short "features" list of every listing into one value per document.
cursor = collection.find({},{"features":1})
f =set()          # feature strings that no rule below recognises
m2 = []           # surface, rounded down to an integer (0 when absent)
ascensor = []     # 1 when "con ascensor" is listed
exterior = []     # 1 when "exterior" is listed
planta = []       # floor number taken from "<n>ª planta" (0 when absent)
garaje = []       # 1 when "Garaje incluido" is listed
habs = []         # number of rooms (0 when absent)

for doc in cursor:
    # Every list, m2 included, gets exactly one slot per document so that all
    # of them stay aligned even when a listing lacks one of the features.
    append_zero(m2,ascensor,exterior,garaje,planta,habs)
    last_idx = len(ascensor)-1
    for feature in doc["features"]:
        if "m²" in feature:
            # ' 35 m² ' -> '35'. split() ignores surrounding whitespace.
            # NOTE(review): float() reads '1.200' as 1.2 - confirm that surfaces
            # never use '.' as a thousands separator the way prices do.
            m2[last_idx] = int(math.floor(float(feature.split()[0])))
            continue
        if "con ascensor" in feature:
            ascensor[last_idx] = 1
        if "exterior" in feature:
            exterior[last_idx]=1
        if "ª planta" in feature:
            planta[last_idx] = int(feature.split("ª planta")[0])
            continue
        if "Garaje incluido" in feature:
            garaje[last_idx] = 1
            continue
        if "hab" in feature:
            # ' 2 hab. ' -> 2
            habs[last_idx] = int(feature.split()[0])
            continue
        # Features that are recognised but carry no further information here.
        if any(skip in feature
               for skip in [ "Bajo","Entreplanta","exterior","interior","ascensor","Garaje"]):
            continue
        f.add(feature) # untreated feature
f
    

{'opc.'}

In [66]:
# Display the lengths of the lists built by the features loop for comparison.
len(m2),len(planta),len(ascensor),len(exterior),len(garaje),len(habs)

(14367, 14367, 14367, 14367, 14367, 14367)

In [52]:
# details house

# Projection restricted to the "details_house" field.
cursor = collection.find({},{"details_house":1})
# Detail string -> number of times it was seen without any rule handling it.
dh = {}
# Accumulators filled by the loop further down, which adds one entry per
# document to each of them.
construido_en = []            # year parsed from "Construido en <year>", NaN by default
banos = []                    # number of bathrooms, 1 by default
calefacion_central = []       # 0/1 flags from here on
calefacion_individual = []
es_casa = []
orientacion_sur = []
orientacion_oeste = []
orientacion_este = []
orientacion_norte = []
buen_estado = []
obra_nueva = []
movilidad_reducida = []
balcon = []
armarios = []
terraza = []
trastero = []

def set_to_one(feature,*l_tests):
    """Flag the first rule whose text occurs in ``feature``.

    Args:
        feature: the string to inspect.
        *l_tests: ``(text, target)`` pairs, checked in the order given.
            ``target`` is a list whose last element is set to 1 on a match.
            A falsy target (``None`` or an empty list) means the text is
            recognised but nothing is recorded.

    Returns:
        True as soon as one text is found in ``feature``, False when none is.
    """
    for needle, flags in l_tests:
        if needle not in feature:
            continue
        if flags:
            flags[-1] = 1
        return True
    return False

# Parse the "details_house" list of every listing into the accumulators above.
for doc in cursor:
    # One slot per document in every accumulator; the defaults are 0 for the
    # flags, 1 bathroom and an unknown (NaN) construction year.
    append_zero(terraza, trastero, armarios,balcon,buen_estado,obra_nueva,
                orientacion_sur, orientacion_oeste, orientacion_este,
                movilidad_reducida,orientacion_norte, calefacion_central, calefacion_individual,es_casa)
    banos.append(1)
    construido_en.append(np.nan)
    last_idx = len(construido_en)-1
    for feature in doc["details_house"]:
        if "m²" in feature:
            continue
        if "Construido en " in feature:
            construido_en[last_idx] = int(feature.split(" en ")[1])
            continue
        if "baño" in feature:
            try:
                banos[last_idx] = int(feature.split("baño")[0])
            except ValueError:
                # No leading number in front of "baño": record 0 bathrooms.
                banos[last_idx] = 0
            continue
        if "Casa" in feature or "Chalet" in feature or "Finca" in feature:
            es_casa[last_idx] = 1
            continue
        if "Orientación" in feature:
            if "norte" in feature:
                orientacion_norte[last_idx] = 1
            if "sur" in feature:
                orientacion_sur[last_idx] = 1
            if "oeste" in feature:
                orientacion_oeste[last_idx] = 1
            # "oeste" contains "este", so look for "este" only in what is left
            # once every "oeste" has been removed; otherwise each west-facing
            # home would also be flagged as east-facing.
            if "este" in feature.replace("oeste", ""):
                orientacion_este[last_idx] = 1
            continue

        if set_to_one(feature, ("Calefacción central",calefacion_central),
                        ("Calefacción individual", calefacion_individual),
                     ("habitaci",None),("obra nueva",obra_nueva),
                      ("buen estado", buen_estado),
                      ("movilidad reducida" , movilidad_reducida ) ,
                      ("Balcón", balcon ), ("Armarios",armarios),
                      ("Terraza",terraza), ("Trastero",trastero) ):
            continue

        # Details that are recognised but deliberately ignored in this cell.
        if any(skip in feature
               for skip in [ "planta","Certificación energética","garaje","ascensor","Garaje",
                             "calefacción","Segunda mano"]):
            continue
        dh[feature] = dh.get(feature,0) + 1 # untreated feature


dh

{'Chimenea': 1}

In [74]:
#details building

cursor = collection.find({},{"details_building":1})
# Detail string -> number of times it was seen without any rule handling it.
# Deliberately not called `db`: that name holds the MongoDB database handle
# created by the connection cell and must not be overwritten.
untreated_building = {}
jardin = []     # 1 when "Jardín" or "Zonas verdes" is listed
piscina = []    # 1 when "Piscina" is listed
aire = []       # 1 when "Aire" is listed

# idx is the position of the current document. It is used to fill gaps in
# `planta`, the list built by the features cell.
# NOTE(review): this pairs the idx-th document of this cursor with planta[idx],
# i.e. it assumes both queries return documents in the same order - confirm.
idx = 0
for doc in cursor:
    append_zero(piscina,jardin,aire)
    for feature in doc["details_building"]:
        if "Planta" in feature and idx<len(planta):
            if planta[idx]==0:
                # ' Planta 1ª ' -> 1: the token right before the 'ª' sign.
                planta[idx] = int(feature.split("ª")[0].split()[-1])
            continue

        if set_to_one(feature, ("Piscina", piscina), ("Aire", aire),("Jardín",jardin),("Zonas verdes",jardin) ):
            continue

        # Details that are recognised but deliberately ignored in this cell.
        if any(skip in feature
               for skip in [ "Bajo","ótano","ascensor","Entreplanta","exterior","interior"]):
            continue
        untreated_building[feature] = untreated_building.get(feature,0) + 1 # untreated feature
    idx += 1

untreated_building

{' Planta 1ª ': 1}

In [68]:
# Number of documents visited by the details_building loop.
idx

14367

In [56]:
# NOTE(review): scratch cell. `a` is not defined in any visible cell, and the
# recorded output is an AttributeError because `a` was a dict. Candidate for
# removal before the notebook is shared.
a.a = 3
a

AttributeError: 'dict' object has no attribute 'a'

In [37]:
# NOTE(review): scratch cell trying dict.get with a default value. `a` is not
# defined in any visible cell. Candidate for removal.
a.get("aa",0) + 2

2

In [42]:
# NOTE(review): scratch cell listing the attributes of `a`, which is not
# defined in any visible cell. Candidate for removal.
dir(a)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']