<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/real_state_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Real Estate Analytics

Real estate data on apartments for sale in Quito, Ecuador, can be obtained [here](https://www.properati.com.ec/s/la-carolina/departamento/venta/m2:60-70_habitaciones:1/?sort=published_on_desc).

## Uploading packages and data

In [1]:
#Importing HTTP libraries
import requests
import pprint
from bs4 import BeautifulSoup as bs

In [1]:
#Importing data manipulation packages
import numpy as np
import pandas as pd
from time import sleep
from random import randint

## Retrieving data from URL

In [None]:
#class requests.Response
#Response.elapsed, Response.encoding, Response.headers, Response.json, Response.status_code, Response.text, Response.url

In [3]:
#Create a function to retrieve data from given URL
def get_url(url, timeout=30):
  """Download one listings page and return the scraped fields as a dataframe.

  Parameters
  ----------
  url : str
      Address of the page to download.
  timeout : float, optional
      Seconds to wait for the server before the request is abandoned
      (default 30). Without a timeout a stalled connection blocks forever.

  Returns
  -------
  pandas.DataFrame or None
      A dataframe with the columns date, name, location, price, rooms,
      bathrooms, sqr_meters and seller, one row per scraped entry, all values
      kept as the raw text found in the page. None is returned, after printing
      the error, when the request fails or the server answers with an HTTP
      error status.

  Raises
  ------
  ValueError
      When the scraped fields do not all contain the same number of entries,
      so they cannot be lined up as rows.
  """

  #Download the page; network problems and HTTP error codes are reported, not raised
  try:
    response = requests.get(url, timeout=timeout)
    print(response)
    #Treat 4xx/5xx answers as failures instead of parsing an error page
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    print(e)
    return None

  soup = bs(response.text, 'html.parser')

  #Save retrieved data to lists, keyed by the final column name
  #NOTE(review): the short class names look auto-generated and may change with the site - confirm
  columns = {
    'date': [i['datetime'] for i in soup.find_all('time')],
    'name': [i.text for i in soup.find_all(class_ ='bwJAej')],
    'location': [i.text for i in soup.find_all(class_ = 'fqaBNm')],
    'price': [i.text for i in soup.find_all(class_ ='bZCCaW')],
    'rooms': [i.span.text for i in soup.find_all(class_ = 'fgcFIO') if 'habitaci' in i.text],
    'bathrooms': [i.text for i in soup.find_all('span') if 'baño' in i.text],
    'sqr_meters': [i.text for i in soup.find_all('span') if 'm²' in i.text],
    'seller': [i.text for i in soup.find_all(class_ = 'seller-name')],
  }

  #Fail with a readable message when the fields cannot be aligned row by row
  lengths = {name: len(values) for name, values in columns.items()}
  if len(set(lengths.values())) > 1:
    raise ValueError(f'Scraped fields have different lengths: {lengths}')

  #Save data to dataframe
  return pd.DataFrame(columns)

In [None]:
#Retrieve data from URL and create a dataframe of apartments in Quito
#url = 'https://www.properati.com.ec/s/la-carolina/departamento/venta/m2:50-150_habitaciones:1_ba%C3%B1os:1,2,3_publicados:ultimos-7-dias/?sort=published_on_desc&page1'
#url = 'https://www.properati.com.ec/s/la-carolina/departamento/venta/publicados:ultimos-365-dias_m2:50-150_habitaciones:1,2,3_ba%C3%B1os:1,2,3/?sort=published_on_desc&page=12'
#url = 'https://www.properati.com.ec/s/bellavista-inaquito/departamento/venta/publicados:ultimos-365-dias_m2:50-150_habitaciones:1,2,3_ba%C3%B1os:1,2,3/?sort=published_on_desc'

#df1 = get_url(url)

In [4]:
#Scraping apartments from multiple result pages and collecting one dataframe per page
url = 'https://www.properati.com.ec/s/la-carolina/departamento/venta/publicados:ultimos-365-dias_m2:50-150_habitaciones:1,2,3_ba%C3%B1os:1,2,3/?sort=published_on_desc&page='
data = []

#Pages 1 to 12 are requested one at a time, pausing 3 to 10 seconds after each request
for page_number in range(1, 13):
  data.append(get_url(f'{url}{page_number}'))
  sleep(randint(3,10))


<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [8]:
#Previewing the last rows of the dataframe stored at index 11 of the list
data[11].tail()

Unnamed: 0,date,name,location,price,rooms,bathrooms,sqr_meters,seller
0,2020-10-29T00:00:00.000Z,Departamento en La Carolina,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 146.000,2 habitaciones,3 baños,86 m²,Goldman Propiedades
1,2020-09-08T00:00:00.000Z,Satori,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 221.426,3 habitaciones,3 baños,113 m²,Properati
2,2020-07-24T00:00:00.000Z,Milo´s House,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 179.530,3 habitaciones,3 baños,117 m²,Properati


In [10]:
#Stacking the per-page dataframes into a single one with a fresh 0..n-1 index
df1 = pd.concat(data, ignore_index=True)
df1.shape

(333, 8)

In [11]:
#Writing the scraped listings to an Excel file and downloading it through Colab
from google.colab import files
excel_name = '2021_07_17_la_carolina.xlsx'
df1.to_excel(excel_name, index=False) #==> The index is left out of the file
files.download(excel_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Exporting to excel into local disk
#NOTE(review): df2 is not assigned in any visible cell of this notebook, so this cell
#raises NameError on a fresh run - presumably it held listings scraped for Bellavista; confirm
from google.colab import files
df2.to_excel('2021_07_17_bellavista.xlsx', index=False) #==> Excluding index from file
files.download('2021_07_17_bellavista.xlsx')

## Data Cleaning

In [None]:
#Importing necessary packages
import numpy as np
import pandas as pd

In [2]:
#Remove previous versions of the uploaded file
!rm 2021_07_17_la_carolina.xlsx

In [3]:
#Uploading file from local drive
#The result of files.upload() is kept in uploaded1 and is indexed by file name when the file is read
from google.colab import files
uploaded1 = files.upload()

Saving 2021_07_17_la_carolina.xlsx to 2021_07_17_la_carolina.xlsx


In [4]:
#Reading the uploaded Excel content into a Pandas dataframe
import io
excel_bytes = io.BytesIO(uploaded1['2021_07_17_la_carolina.xlsx'])
df1 = pd.read_excel(excel_bytes, parse_dates=True)

In [5]:
#Checking column dtypes and non-null counts of the loaded dataframe
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        333 non-null    object
 1   name        333 non-null    object
 2   location    333 non-null    object
 3   price       333 non-null    object
 4   rooms       333 non-null    object
 5   bathrooms   333 non-null    object
 6   sqr_meters  333 non-null    object
 7   seller      333 non-null    object
dtypes: object(8)
memory usage: 20.9+ KB


In [6]:
#Checking the last rows of the dataframe before cleaning
df1.tail()

Unnamed: 0,date,name,location,price,rooms,bathrooms,sqr_meters,seller
328,2020-10-29T00:00:00.000Z,Departamento en La Carolina,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 155.000,3 habitaciones,3 baños,121 m²,Genma Cabezas
329,2020-10-29T00:00:00.000Z,Departamento en La Carolina,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 99.000,3 habitaciones,2 baños,76 m²,Goldman Propiedades
330,2020-10-29T00:00:00.000Z,Departamento en La Carolina,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 146.000,2 habitaciones,3 baños,86 m²,Goldman Propiedades
331,2020-09-08T00:00:00.000Z,Satori,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 221.426,3 habitaciones,3 baños,113 m²,Properati
332,2020-07-24T00:00:00.000Z,Milo´s House,"La Carolina, Iñaquito, Centro Norte, Quito, Pi...",$ 179.530,3 habitaciones,3 baños,117 m²,Properati


In [16]:
#Converting date column to timezone-aware datetime in Ecuador local time
#The raw values are ISO 8601 strings such as '2020-10-29T00:00:00.000Z', which do not match
#'%Y-%m-%d %H:%M:%S'; parsing them as UTC without a fixed format avoids a mismatch error
#NOTE(review): midnight-UTC stamps land on the previous calendar day after conversion - confirm this is intended
df1['date'] = pd.to_datetime(df1['date'], utc=True).dt.tz_convert('America/Guayaquil')

In [17]:
#Extracting city and zone from location
#location is a comma-separated string; the zone is its first part and the city its fourth
#Splitting once with the .str accessor yields NaN for a location with fewer than four
#parts instead of stopping the whole cell with an IndexError
location_parts = df1['location'].str.split(',')
df1['city'] = location_parts.str[3].str.strip()
df1['zone'] = location_parts.str[0].str.strip()

In [18]:
#Eliminating $ and . from price and convert to int
#A raw string with a character class removes the same characters as before without
#relying on the invalid escape sequences '\$' and '\.' of a normal string literal
#The blank left after the $ sign is tolerated by the int conversion
df1['price'] = df1['price'].str.replace(r'[$.]', '', regex=True).astype(int)

In [19]:
#Converting number of rooms to int
#The leading run of digits is taken from texts such as '2 habitaciones'; extracting it
#with a pattern is vectorized and does not depend on the exact whitespace after the number
df1['rooms'] = df1['rooms'].str.extract(r'([0-9]+)', expand=False).astype(int)

In [20]:
#Converting bathrooms to int
#The leading run of digits is taken from texts such as '3 baños'; extracting it
#with a pattern is vectorized and does not depend on the exact whitespace after the number
df1['bathrooms'] = df1['bathrooms'].str.extract(r'([0-9]+)', expand=False).astype(int)

In [21]:
#Converting square meters to int
#The leading run of digits is taken from texts such as '86 m²'; the pattern only accepts
#ASCII digits, so the superscript in the unit can never be picked up as part of the number
df1['sqr_meters'] = df1['sqr_meters'].str.extract(r'([0-9]+)', expand=False).astype(int)

In [22]:
#Checking the distinct square-meter values after the int conversion
df1['sqr_meters'].unique()

array([ 56,  85,  90, 113,  54,  77, 145,  84,  81, 111,  76,  67, 103,
        60, 128, 109,  70, 102, 117, 133,  63,  94,  80,  52, 100,  68,
        82,  97,  91,  65,  72,  55,  92,  95, 135,  75, 150,  57,  89,
       124, 129, 114,  59,  74,  61, 106, 127,  58, 120, 148,  66,  62,
       112, 140,  98,  93, 116,  69, 137,  64,  73, 142,  87,  88,  50,
       107, 118, 115, 104, 110,  86,  78,  79,  83, 130, 132,  71, 105,
       147, 126,  51, 123, 121, 108,  96, 122, 101, 125])

In [23]:
#Converting seller names to title case (first letter of each word capitalized)
df1['seller'] = df1['seller'].str.title()

In [24]:
#Keeping only the cleaned columns, in the order used for the export
column_order = ['date', 'city', 'zone', 'name', 'seller', 'sqr_meters', 'rooms', 'bathrooms', 'price']
df1 = df1.loc[:, column_order]

In [25]:
#Checking the last rows of the cleaned dataframe
df1.tail()

Unnamed: 0,date,city,zone,name,seller,sqr_meters,rooms,bathrooms,price
328,2020-10-28 19:00:00-05:00,Quito,La Carolina,Departamento en La Carolina,Genma Cabezas,121,3,3,155000
329,2020-10-28 19:00:00-05:00,Quito,La Carolina,Departamento en La Carolina,Goldman Propiedades,76,3,2,99000
330,2020-10-28 19:00:00-05:00,Quito,La Carolina,Departamento en La Carolina,Goldman Propiedades,86,2,3,146000
331,2020-09-07 19:00:00-05:00,Quito,La Carolina,Satori,Properati,113,3,3,221426
332,2020-07-23 19:00:00-05:00,Quito,La Carolina,Milo´s House,Properati,117,3,3,179530


In [26]:
#Checking column dtypes and non-null counts after cleaning
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype                            
---  ------      --------------  -----                            
 0   date        333 non-null    datetime64[ns, America/Guayaquil]
 1   city        333 non-null    object                           
 2   zone        333 non-null    object                           
 3   name        333 non-null    object                           
 4   seller      333 non-null    object                           
 5   sqr_meters  333 non-null    int64                            
 6   rooms       333 non-null    int64                            
 7   bathrooms   333 non-null    int64                            
 8   price       333 non-null    int64                            
dtypes: datetime64[ns, America/Guayaquil](1), int64(4), object(4)
memory usage: 23.5+ KB


In [27]:
#Writing the cleaned listings to a CSV file and downloading it through Colab
from google.colab import files
csv_name = '2021_07_17_la_carolina.csv'
df1.to_csv(csv_name, index=False) #==> The index is left out of the file
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [None]:
#Plotly --upgrade
!pip install plotly --upgrade

In [None]:
#Importing necessary packages
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
#Uploading file from local drive
#This reassigns uploaded1; the uploaded content is indexed by file name when the file is read
from google.colab import files
uploaded1 = files.upload()

Saving 2021_06_30_la_carolina.csv to 2021_06_30_la_carolina.csv


In [None]:
#Reading the uploaded CSV content into a Pandas dataframe
import io
csv_bytes = io.BytesIO(uploaded1['2021_06_30_la_carolina.csv'])
df1_dep = pd.read_csv(csv_bytes, parse_dates=True)

In [None]:
#Checking column dtypes and non-null counts of the loaded CSV data
df1_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        59 non-null     object
 1   name        59 non-null     object
 2   price       59 non-null     int64 
 3   rooms       59 non-null     int64 
 4   bathrooms   59 non-null     int64 
 5   sqr_meters  59 non-null     int64 
 6   seller      59 non-null     object
dtypes: int64(4), object(3)
memory usage: 3.4+ KB


In [None]:
#Convert date column to timezone-aware datetime in Ecuador local time
#The stored values carry a UTC offset (e.g. '2021-06-23 19:00:00-05:00'), which the format
#'%Y-%m-%d %H:%M:%S' does not cover; parsing as UTC without a fixed format handles the offset
df1_dep['date'] = pd.to_datetime(df1_dep['date'], utc=True).dt.tz_convert('America/Guayaquil')

In [None]:
#Checking the dataframe head
df1_dep.head()

Unnamed: 0,date,name,price,rooms,bathrooms,sqr_meters,seller
0,2021-06-23 19:00:00-05:00,Departamento en La Carolina,180000,1,2,60,Community Group
1,2021-06-17 19:00:00-05:00,Departamento en La Carolina,134882,1,1,67,Urbec Constructora Inmobiliaria
2,2021-06-17 19:00:00-05:00,Edificio Emmanuelle,79130,1,1,55,Properati
3,2021-06-15 19:00:00-05:00,Departamento en La Carolina,90500,1,2,70,Cos Inmobiliaria
4,2021-06-14 19:00:00-05:00,Departamento en La Carolina,89000,1,1,68,Maria Del Carmen


In [None]:
#Averaging price and price per square meter for each distinct value of the date column
#(rows are grouped by the exact timestamp, not by calendar month)
df1_grouped = (
    df1_dep
    .assign(price_sqr=(df1_dep['price'] / df1_dep['sqr_meters']).round(2))
    .groupby('date', as_index=False)
    .agg(price=('price', 'mean'), price_sqr=('price_sqr', 'mean'))
)
df1_grouped.head()

Unnamed: 0,date,price,price_sqr
0,2020-10-28 19:00:00-05:00,120130.071429,1859.341429
1,2020-11-15 19:00:00-05:00,112400.0,1873.33
2,2020-12-17 19:00:00-05:00,175000.0,2651.52
3,2021-01-05 19:00:00-05:00,124000.0,2066.665
4,2021-01-18 19:00:00-05:00,107250.0,1676.05875


In [None]:
#Line chart of the mean price for each date in the grouped dataframe
fig1_a = px.line(df1_grouped, x='date', y='price', title='Suites en La Carolina')
fig1_a.show()

In [None]:
#Line chart of the mean price per square meter for each date in the grouped dataframe
fig1_b = px.line(df1_grouped, x='date', y='price_sqr', title='$/m2 en La Carolina')
fig1_b.show()