<a href="https://colab.research.google.com/github/gabrielborja/python_data_analysis/blob/main/real_state_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Real Estate Analytics

Real estate data for 1-bedroom apartments for sale in 'La Carolina', Quito, Ecuador, can be obtained [here](https://www.properati.com.ec/s/la-carolina/departamento/venta/m2:60-70_habitaciones:1/?sort=published_on_desc).

## Uploading packages and data

In [1]:
#Importing HTTP libraries
import requests
import pprint
from bs4 import BeautifulSoup as bs

In [2]:
#Importing data manipulation packages
import numpy as np
import pandas as pd

## Retrieving data from URL

In [None]:
#class requests.Response
#Response.elapsed, Response.encoding, Response.headers, Response.json, Response.status_code, Response.text, Response.url

In [149]:
#Create a function to retrieve data from given URL
def get_url(url):
  """Download a listings page and return its listings as a dataframe.

  The page is fetched with ``requests`` and parsed with BeautifulSoup. One
  list of strings is collected per field and the lists are combined
  column-wise, so row ``n`` is built from the n-th match of every field.

  Args:
    url: Address of the listings page to scrape.

  Returns:
    A ``pandas.DataFrame`` with the columns ``date``, ``name``, ``price``,
    ``rooms``, ``bathrooms``, ``sqr_meters`` and ``seller`` holding the raw
    text found in the HTML, or ``None`` when the request fails (the error is
    printed in that case).

  Raises:
    ValueError: If the fields were not matched the same number of times, in
      which case the columns cannot be aligned row by row.
  """
  #Get data; the timeout keeps the call from hanging on an unresponsive server
  try:
    response = requests.get(url, timeout=30)
    print(response)
    #An HTTP error page (4xx/5xx) must not be parsed as if it held listings
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    print(e)
    return None

  pprint.pprint(response.text[:200]) #Print first 200 char from HTML.
  soup = bs(response.text, 'html.parser')

  #Collect one list per column.
  #NOTE(review): 'bwJAej', 'bZCCaW' and 'fgcFIO' look like auto-generated CSS
  #class names, so they may stop matching when the site changes - verify.
  columns = {
    'date': [i['datetime'] for i in soup.find_all('time')],
    'name': [i.text for i in soup.find_all(class_ ='bwJAej')],
    'price': [i.text for i in soup.find_all(class_ ='bZCCaW')],
    'rooms': [i.span.text for i in soup.find_all(class_ = 'fgcFIO') if 'habitación' in i.text],
    'bathrooms': [i.text for i in soup.find_all('span') if 'baño' in i.text],
    'sqr_meters': [i.text for i in soup.find_all('span') if 'm²' in i.text],
    'seller': [i.text for i in soup.find_all(class_ = 'seller-name')],
  }

  #A listing that lacks one field shifts every later row of that column, so
  #report the per-field counts instead of building a misaligned dataframe
  counts = {field: len(values) for field, values in columns.items()}
  if len(set(counts.values())) > 1:
    raise ValueError(f'Scraped fields have different lengths: {counts}')

  #Save data to dataframe
  return pd.DataFrame(columns)

In [150]:
#Retrieve data from URL and create a dataframe
#Note: get_url prints the error and returns None when the request fails
url = 'https://www.properati.com.ec/s/la-carolina/departamento/venta/m2:60-70_habitaciones:1/?sort=published_on_desc'

df1 = get_url(url)

<Response [200]>
('<!DOCTYPE html><html lang="es-EC"><head><style data-styled="cVrwCh hPeyFD '
 'jYjReO gnhday gpVkLR dxpaVS bthhka czzzpz bkRiDO kuucCe eMNdyn dvSwZS bKXQnq '
 'SUNlS bNHCyt dsYucZ jUvxaS fRtNaN gVFMpO epxrlA c')


In [152]:
#Print dataframe tail
df1.tail()

Unnamed: 0,date,name,price,rooms,bathrooms,sqr_meters,seller
54,2020-10-29T00:00:00.000Z,Departamento en La Carolina,$ 110.000,1 habitación,2 baños,65 m²,Gabriela-Jarrin
55,2020-10-29T00:00:00.000Z,Departamento en La Carolina,$ 118.000,1 habitación,1 baño,69 m²,Gabriela-Jarrin
56,2020-10-29T00:00:00.000Z,Departamento en La Carolina,$ 121.968,1 habitación,2 baños,61 m²,INMOIMPAKTO-Paulina Peñafiel
57,2020-10-29T00:00:00.000Z,Departamento en La Carolina,$ 98.000,1 habitación,2 baños,65 m²,Goldman Propiedades
58,2020-10-29T00:00:00.000Z,Departamento en La Carolina,$ 114.000,1 habitación,1 baño,65 m²,Goldman Propiedades


In [153]:
#Write the scraped listings to an Excel file (index excluded) and download it to the local disk
from google.colab import files
excel_name = '2021_06_30_la_carolina.xlsx'
df1.to_excel(excel_name, index=False)
files.download(excel_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Cleaning

In [None]:
#Importing necessary packages
import numpy as np
import pandas as pd

In [None]:
#Remove previous versions of the uploaded file
!rm 2021_06_30_la_carolina.xlsx

In [154]:
#Uploading file from local drive
#The returned object is indexed by file name in a later cell to get the file's bytes
from google.colab import files
uploaded1 = files.upload()

Saving 2021_06_30_la_carolina.xlsx to 2021_06_30_la_carolina (1).xlsx


In [243]:
#Load the uploaded Excel bytes into a Pandas Dataframe
#(the date column is still read as text here and is converted in a later cell)
import io
excel_bytes = uploaded1['2021_06_30_la_carolina.xlsx']
df1_dep = pd.read_excel(io.BytesIO(excel_bytes), parse_dates=True)

In [244]:
#Checking the dataframe information
df1_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        59 non-null     object
 1   name        59 non-null     object
 2   price       59 non-null     object
 3   rooms       59 non-null     object
 4   bathrooms   59 non-null     object
 5   sqr_meters  59 non-null     object
 6   seller      59 non-null     object
dtypes: object(7)
memory usage: 3.4+ KB


In [245]:
#Convert date column to datetime
#The stored strings are ISO 8601 with a 'T' separator, milliseconds and a 'Z'
#suffix, so they are parsed as UTC without an explicit format and then
#converted to Ecuador local time
df1_dep['date'] = pd.to_datetime(df1_dep['date'], utc=True).dt.tz_convert('America/Guayaquil')

In [246]:
#Keep only the digits of the price (drops '$', spaces and '.' thousands separators) and convert to int
df1_dep['price'] = df1_dep['price'].str.replace(r'[^0-9]', '', regex=True)
df1_dep['price'] = df1_dep['price'].astype(int)

In [247]:
#Convert number of rooms to int
#The first run of digits is extracted, so any room count is handled, not only '1 habitación'
df1_dep['rooms'] = df1_dep['rooms'].str.extract(r'(\d+)', expand=False)
df1_dep['rooms'] = df1_dep['rooms'].astype(int)

In [248]:
#Convert bathrooms to int
#The first run of digits is extracted instead of slicing a fixed number of characters
df1_dep['bathrooms'] = df1_dep['bathrooms'].str.extract(r'(\d+)', expand=False)
df1_dep['bathrooms'] = df1_dep['bathrooms'].astype(int)

In [249]:
#Convert square meters to int
#The first run of digits is extracted so areas with one or three digits
#(e.g. '100 m²') are not cut to their first two characters
df1_dep['sqr_meters'] = df1_dep['sqr_meters'].str.extract(r'(\d+)', expand=False)
df1_dep['sqr_meters'] = df1_dep['sqr_meters'].astype(int)

In [250]:
#Normalise seller names to title case
df1_dep = df1_dep.assign(seller=df1_dep['seller'].str.title())

In [251]:
#Checking the dataframe head
df1_dep.head()

Unnamed: 0,date,name,price,rooms,bathrooms,sqr_meters,seller
0,2021-06-23 19:00:00-05:00,Departamento en La Carolina,180000,1,2,60,Community Group
1,2021-06-17 19:00:00-05:00,Departamento en La Carolina,134882,1,1,67,Urbec Constructora Inmobiliaria
2,2021-06-17 19:00:00-05:00,Edificio Emmanuelle,79130,1,1,55,Properati
3,2021-06-15 19:00:00-05:00,Departamento en La Carolina,90500,1,2,70,Cos Inmobiliaria
4,2021-06-14 19:00:00-05:00,Departamento en La Carolina,89000,1,1,68,Maria Del Carmen


In [252]:
#Checking the dataframe info
df1_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype                            
---  ------      --------------  -----                            
 0   date        59 non-null     datetime64[ns, America/Guayaquil]
 1   name        59 non-null     object                           
 2   price       59 non-null     int64                            
 3   rooms       59 non-null     int64                            
 4   bathrooms   59 non-null     int64                            
 5   sqr_meters  59 non-null     int64                            
 6   seller      59 non-null     object                           
dtypes: datetime64[ns, America/Guayaquil](1), int64(4), object(2)
memory usage: 3.4+ KB


In [253]:
#Write the cleaned listings to a CSV file (index excluded) and download it to the local disk
from google.colab import files
csv_name = '2021_06_30_la_carolina.csv'
df1_dep.to_csv(csv_name, index=False)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [None]:
#Plotly --upgrade
!pip install plotly --upgrade

In [254]:
#Importing necessary packages
import matplotlib.pyplot as plt
import plotly.express as px

In [255]:
#Uploading file from local drive
#The returned object is indexed by file name in a later cell to get the file's bytes
from google.colab import files
uploaded1 = files.upload()

Saving 2021_06_30_la_carolina.csv to 2021_06_30_la_carolina (1).csv


In [256]:
#Load the uploaded CSV bytes into a Pandas Dataframe
#(the date column is still read as text here and is converted in a later cell)
import io
csv_bytes = uploaded1['2021_06_30_la_carolina.csv']
df1_dep = pd.read_csv(io.BytesIO(csv_bytes), parse_dates=True)

In [257]:
#Checking the dataframe info
df1_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        59 non-null     object
 1   name        59 non-null     object
 2   price       59 non-null     int64 
 3   rooms       59 non-null     int64 
 4   bathrooms   59 non-null     int64 
 5   sqr_meters  59 non-null     int64 
 6   seller      59 non-null     object
dtypes: int64(4), object(3)
memory usage: 3.4+ KB


In [258]:
#Convert date column to datetime
#The stored strings carry a UTC offset (e.g. '-05:00'), so they are parsed as
#UTC without an explicit format and then converted to Ecuador local time
df1_dep['date'] = pd.to_datetime(df1_dep['date'], utc=True).dt.tz_convert('America/Guayaquil')

In [277]:
#Checking the dataframe head
df1_dep.head()

Unnamed: 0,date,name,price,rooms,bathrooms,sqr_meters,seller,month
0,2021-06-23 19:00:00-05:00,Departamento en La Carolina,180000,1,2,60,Community Group,June
1,2021-06-17 19:00:00-05:00,Departamento en La Carolina,134882,1,1,67,Urbec Constructora Inmobiliaria,June
2,2021-06-17 19:00:00-05:00,Edificio Emmanuelle,79130,1,1,55,Properati,June
3,2021-06-15 19:00:00-05:00,Departamento en La Carolina,90500,1,2,70,Cos Inmobiliaria,June
4,2021-06-14 19:00:00-05:00,Departamento en La Carolina,89000,1,1,68,Maria Del Carmen,June


In [279]:
#Mean price and mean price per square meter for each listing date
df1_grouped = (
    df1_dep
    .assign(price_sqr=lambda d: (d['price'] / d['sqr_meters']).round(2))
    .groupby(['date'])
    .agg(price=('price', 'mean'), price_sqr=('price_sqr', 'mean'))
    .reset_index()
)
df1_grouped.head()

Unnamed: 0,date,price,price_sqr
0,2020-10-28 19:00:00-05:00,120130.071429,1859.341429
1,2020-11-15 19:00:00-05:00,112400.0,1873.33
2,2020-12-17 19:00:00-05:00,175000.0,2651.52
3,2021-01-05 19:00:00-05:00,124000.0,2066.665
4,2021-01-18 19:00:00-05:00,107250.0,1676.05875


In [282]:
#Line chart of the mean price for each listing date
fig1_a = px.line(df1_grouped, x='date', y='price', title='Suites en La Carolina')
fig1_a.show()

In [281]:
#Line chart of the mean price per square meter for each listing date
fig1_b = px.line(df1_grouped, x='date', y='price_sqr', title='$/m2 en La Carolina')
fig1_b.show()