In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [2]:
#activar intellisense
%config IPCompleter.greedy=True

In [3]:
## Importación Dataset

In [4]:
df = pd.read_csv('properatti.csv', index_col = 0) ### Dataset Completo
## DataFrame Partido en distintas partes relevantes
df_posicion_geografica = df.loc[:,['place_name','state_name','country_name','place_with_parent_names' \
                                   ,'geonames_id','lat','lon','lat-lon', \
                                   'describe','title']] ### Parte con Posición Geográfica
df_localidad=df.loc[:,['place_name','place_with_parent_names','describe','title']] ## Parte con Localidades
df_ambientes = df.loc[:,['rooms','property_type','description','title']]  ## Parte con Ambientes
df_pisos = df.loc[:,['floor','property_type','description','title']]  ## Parte con Pisos
df_precio = df.loc[:,['price','currency','price_aprox_local_currency','price_aprox_usd',\
                      'surface_total_in_m2','surface_covered_in_m2','price_usd_per_m2',\
                      'price_per_m2','description','title','place_name','property_type','state_name']]  ## Parte con Precios y Superficies

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [5]:
##Funciones

In [6]:
def update(df,column,s):
    '''
    Reemplaza los valores de la columna del DataFrame con
    los valores no nulos de la serie
    '''
    df[column]=s.fillna(df[column])
    return df

def busqueda_ambientes(x):
    '''
    Busca la palabra amb (de ambientes) y devuelve el número que tiene por delante
    '''
    try:
        pattern = '(\d+)\s*amb'
        regex = re.compile(pattern, flags = re.IGNORECASE | re.UNICODE)
        m = regex.search(x)
        if m:
            return float(m.group(1))
        else:
            return np.NaN
    except:
        return np.NaN

def busqueda_monoambientes(x):
    '''
    Busca en la palabra monoambiente y devuelve 1
    '''
    if type(x) is str:
        if x.lower().find('monoambiente') >-1:
            return 1

def busqueda_moneda(x):
    try:
        pattern = 'U\$D\s*([\d\.]+)'
        regex = re.compile(pattern, flags = re.IGNORECASE | re.UNICODE)
        m = regex.search(x)
        if m:
            return float(m.group(1).replace('.',''))
        else:
            np.NaN
    except:
        return np.NaN
    
def busqueda_dolar(x):
    try:
        pattern = '^\b[a-zA-Z](?!.*U\$d).\b[a-zA-Z].(?:- ?)?(?<!\d[.,]?)(?:\d{4,}|\d{1,3}(?:\.\d{3})*)(?:,\d+)?(?![.,]?\d)'
        regex = re.compile(pattern, flags = re.IGNORECASE | re.UNICODE)
        m = regex.search(x)
        if m:
            return float(m.group(1).replace('.',''))
        else:
            np.NaN
    except:
        return np.NaN
    
def parsear_numeros(x):
    '''
    Reemplaza numeros escritos en letras por sus equivalentes
    en arábigos.
    '''
    try:
        numeros={'uno':1,'un':1,'una':1,'dos':2,'tres':3,'cuatro':4,'cinco':5,'seis':6,'siete':7,'ocho':8,'nueve':9,'diez':10}
        for key in numeros:
            x = x.lower()
            x=x.replace(key,str(numeros[key]))
        return x
    except:
        return x

def busqueda_por_texto(x):
    '''
    Busca palabras claves asociadas a ambientes, como dormitorio, comedor, living, etc.
    y devuelve el número de veces que estas aparecen.
    '''
    result = 0.0
    valor = 0.0
    try:
        pattern = r"(\d*\s*living\s*cocina\s*comedor|living\s*comedor|living)|(\d*\s*dormitorio)|(\d*\s*comedor)"
        regex = re.compile(pattern, flags = re.IGNORECASE | re.UNICODE)
        grupos = regex.findall(x)
        for grupo in grupos:
            for elemento in grupo:
                regex_dos = re.compile(r'(\d+)', flags = re.IGNORECASE | re.UNICODE)
                cantidad = regex_dos.search(elemento)
                if cantidad is not None:
                    valor+=float(cantidad.group())
            if result == valor:
                result+=1
                valor+=1
            else:
                result=valor
        if result > 0.0:
            return result
        else:
            return np.NaN
    except:
        return np.NaN
def convert_to_int(x):
    return int(x)

In [7]:
df_precio = update(df_precio,'price',df_precio.title.apply(busqueda_moneda))

In [8]:
df_precio.pivot_table(('price_aprox_usd'),columns=['property_type'],
                      fill_value=0,aggfunc=[np.mean])

Unnamed: 0_level_0,mean,mean,mean,mean
property_type,PH,apartment,house,store
price_aprox_usd,147689.419611,187406.880406,329696.776384,440257.06294


In [9]:
df_pivot=df_precio.pivot_table(('price_aprox_usd'),columns=['state_name'],
                      fill_value=0,aggfunc=[np.mean])

In [10]:
df_pivot.transpose() 

Unnamed: 0_level_0,Unnamed: 1_level_0,price_aprox_usd
Unnamed: 0_level_1,state_name,Unnamed: 2_level_1
mean,Bs.As. G.B.A. Zona Norte,324317.12317
mean,Bs.As. G.B.A. Zona Oeste,167545.648499
mean,Bs.As. G.B.A. Zona Sur,198555.715572
mean,Buenos Aires Costa Atlántica,143430.988472
mean,Buenos Aires Interior,183249.996582
mean,Capital Federal,269603.182879
mean,Catamarca,725471.707308
mean,Chaco,178583.551389
mean,Chubut,342338.181524
mean,Corrientes,206896.786681


In [11]:
df_precio.groupby(['price_aprox_usd']).transform('mean')

TypeError: Transform function invalid for data types