In [77]:
import geopandas as gpd
import pandas as pd
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from shapely.geometry import Point, Polygon
import googlemaps
import re
import string
import requests
from tqdm.auto import tqdm
from unidecode import unidecode
import numpy as np

# Register tqdm's pandas integration and show every column when displaying frames.
tqdm.pandas()
pd.set_option('display.max_columns', None)

# City to process; swap the two lines below to run the notebook for Porto.
#city = "Porto"
city = "Lisboa"

# Per-city inputs: Airbnb CSV, the `neighbourhood_group` value to keep,
# boundary GeoJSON, and the `joao` RNAL CSV with its read_csv options
# (the Porto file is ';'-separated, the Lisboa file uses the default ',').
CITY_SOURCES = {
    "Porto": {
        "airbnb_csv": "data/airbnb/porto.csv",
        "neighbourhood_group": "PORTO",
        "boundary_geojson": 'data/geojson/porto.geojson',
        "joao_csv": "data/rnal/joao_rnal_porto.csv",
        "joao_read_kwargs": {"sep": ";"},
    },
    "Lisboa": {
        "airbnb_csv": "data/airbnb/lisboa.csv",
        "neighbourhood_group": "Lisboa",
        "boundary_geojson": 'data/geojson/lisboa.geojson',
        "joao_csv": "data/rnal/joao_rnal_lisboa.csv",
        "joao_read_kwargs": {},
    },
}

# Fail fast on an unsupported city: otherwise `airbnb`, `gdf` and `joao` would
# stay undefined (or, worse, keep stale values from a previous run of this cell).
if city not in CITY_SOURCES:
    raise ValueError(
        f"Unsupported city {city!r}; expected one of {sorted(CITY_SOURCES)}"
    )

_sources = CITY_SOURCES[city]
airbnb = pd.read_csv(_sources["airbnb_csv"])
airbnb = airbnb[airbnb["neighbourhood_group"] == _sources["neighbourhood_group"]]
gdf = gpd.read_file(_sources["boundary_geojson"])
joao = pd.read_csv(_sources["joao_csv"], **_sources["joao_read_kwargs"])

# The RNAL extract is shared by both cities; it is restricted to the city
# boundary further down in this cell.
rnal = pd.read_csv("data/rnal/rnal_travelBI.csv")

# Boundary geometry used for the spatial filter: the entry at index label 0 of
# the GeoJSON loaded above.
polygon = gdf.geometry[0]


def is_inside(row):
    """Return whether the point built from the row's 'X' and 'Y' is contained in `polygon`."""
    return polygon.contains(Point(row['X'], row['Y']))


def _to_date(value):
    """Parse a single value with pandas and return only its calendar date."""
    return pd.to_datetime(value).date()


# Keep only the registrations whose coordinates fall inside the boundary.
inside_mask = rnal.apply(is_inside, axis=1)
rnal = rnal.loc[inside_mask == True].copy()

# Both date columns are stored as plain `date` objects (no time component).
for date_col in ('DataAberturaPublico', 'DataRegisto'):
    rnal[date_col] = rnal[date_col].apply(_to_date)

In [78]:
# Normalise joao's registration numbers so they can be compared with rnal's
# NrRNAL: remove every '/AL' occurrence, then convert to numeric (values that
# cannot be parsed become NaN).
# The column is cast to str first so the cell can be re-run safely: after the
# first run NrRegisto is already numeric and a bare `.str` accessor would raise.
joao['NrRegisto'] = pd.to_numeric(
    joao['NrRegisto'].astype(str).str.replace('/AL', '', regex=False),
    errors='coerce',
)

# Membership masks, computed once and reused for the three subsets below.
rnal_in_joao = rnal['NrRNAL'].isin(joao['NrRegisto'])
joao_in_rnal = joao['NrRegisto'].isin(rnal['NrRNAL'])

# Rows of rnal that have no matching registration number in joao
rnal_not_in_joao = rnal[~rnal_in_joao]

# Rows of joao that have no matching registration number in rnal
joao_not_in_rnal = joao[~joao_in_rnal]

# Rows of rnal whose registration number also appears in joao
both_in_rnal_and_joao = rnal[rnal_in_joao]

# Report the size of each set
print("Number of data points in rnal:", rnal.shape[0])
print("Number of data points in joao:", joao.shape[0])
print("Number of data points in both rnal and joao:", both_in_rnal_and_joao.shape[0])
print("Number of data points in joao but not in rnal:", joao_not_in_rnal.shape[0])
print("Number of data points in rnal but not in joao:", rnal_not_in_joao.shape[0])

Number of data points in rnal: 19318
Number of data points in joao: 19151
Number of data points in both rnal and joao: 19129
Number of data points in joao but not in rnal: 22
Number of data points in rnal but not in joao: 189


In [79]:
# Number of rnal rows that share exactly the same (X, Y) coordinates.
rnal['entradas_repetidas'] = rnal.groupby(['X', 'Y']).transform('size')
min_value = rnal['entradas_repetidas'].min()
max_value = rnal['entradas_repetidas'].max()

# Min-max normalise the repeat count into [0, 1].
# When every row has the same count (for example if this cell is re-run after
# the de-duplication cell) the range is zero and the division would produce
# NaN for every row, which json.dump later writes as the invalid token `NaN`.
# In that case all weights are set to 0.0 instead.
value_range = max_value - min_value
if value_range == 0:
    rnal['weight'] = 0.0
else:
    rnal['weight'] = (rnal['entradas_repetidas'] - min_value) / value_range


In [80]:
# For every row, the count of non-null NrRNAL values registered under the same Email.
rnal['host_listings_number'] = rnal.groupby('Email')["NrRNAL"].transform('count')

# Share of rows (in percent) falling in each host-size bucket. The ">= 2" and
# ">= 5" buckets overlap: every row in the latter is also in the former.
listings_per_host = rnal['host_listings_number']
percentage_single_hosts = listings_per_host.eq(1).mean() * 100
percentage_super_hosts = listings_per_host.ge(2).mean() * 100
percentage_mega_hosts = listings_per_host.ge(5).mean() * 100

# Report the three shares with two decimals
print(f"Listings from single hosts: {percentage_single_hosts:.2f}%")
print(f"Listings from super hosts (>= 2): {percentage_super_hosts:.2f}%")
print(f"Listings from mega hosts (>= 5): {percentage_mega_hosts:.2f}%")


Listings from single hosts: 32.41%
Listings from super hosts (>= 2): 67.59%
Listings from mega hosts (>= 5): 41.88%


In [81]:
# Collapse rows that share the same (X, Y) coordinates into a single row.

# Rows whose coordinates are shared with at least one other row
shares_location = rnal['entradas_repetidas'] > 1
rnal_filtered = rnal[shares_location]

# Within each shared location, pick the index label of the row with the
# largest host_listings_number
idx = rnal_filtered.groupby(['X', 'Y'])['host_listings_number'].idxmax()

# One representative row per shared location
rnal_unique = rnal.loc[idx]

# Rows that were already alone at their location come first, followed by the
# representatives; the result gets a fresh 0..n-1 index.
rnal_single = rnal[rnal['entradas_repetidas'] == 1]
rnal = pd.concat([rnal_single, rnal_unique], ignore_index=True)

In [82]:
# Integer codes written to the GeoJSON for each Modalidade and Freguesia value.
type_map = {'Apartamento':1,'Moradia':2,'EstabelecimentoHospedagem':3,'EstabelecimentoHospedagemHostel':4,'Quartos':5}
freg_map_porto = {'União das freguesias de Cedofeita, Santo Ildefonso, Sé, Miragaia, São Nicolau e Vitória':1,'Bonfim':2,'União das freguesias de Lordelo do Ouro e Massarelos':3,'Paranhos':4,'União das freguesias de Aldoar, Foz do Douro e Nevogilde':5,'Campanhã':6,'Ramalde':7}
# Codes for Lisboa are assigned in order of first appearance in rnal, so they
# are only stable as long as the input data (and its row order) is unchanged.
freg_map_lisboa = {freg: code for code, freg in enumerate(rnal["Freguesia"].unique(), start=1)}

start_date = datetime(2011, 1, 1) # AL licenses start in 2011
end_date = datetime(2024, 9, 25)
january_2014 = datetime(2014, 1, 1)

# Loop-invariant denominators for the two normalised time properties.
total_days = (end_date - start_date).days
# Whole months from January 2014 to the month of end_date (128 for September 2024).
total_months = (end_date.year - january_2014.year) * 12 + end_date.month - january_2014.month

# Parish code table for the selected city; None when the city has no table,
# in which case no "freg" property is written and no file is produced.
freg_map = {"Porto": freg_map_porto, "Lisboa": freg_map_lisboa}.get(city)

geojson = {"type": "FeatureCollection", "features": []}

# Features are emitted in ascending host_listings_number order; "id" is the
# 1-based position in that order.
rnal_sorted = rnal.sort_values(by='host_listings_number', ascending=True).reset_index()
for idx, single_al in tqdm(rnal_sorted.iterrows(), total=len(rnal_sorted)):
    # DataAberturaPublico is a date; promote it to a datetime at midnight so it
    # can be subtracted from start_date.
    al_date = datetime.combine(single_al.DataAberturaPublico, datetime.min.time())

    # Months elapsed since January 2014 as a fraction of total_months, clamped to [0, 1].
    months_diff = ((al_date.year - january_2014.year) * 12 + al_date.month - january_2014.month) / total_months
    normalized_variable = max(0, min(1, months_diff))

    properties = {
        "id": idx + 1,
        "year": al_date.strftime('%y'),   # two-digit year
        "month": al_date.strftime('%m'),  # zero-padded month
        "normalized_date": normalized_variable,
        "type": type_map[single_al.Modalidade],
        # Days since start_date as a fraction of the start_date..end_date span,
        # clamped to [0, 1] and rounded to two decimals.
        "ts": round(min(max((al_date - start_date).days / total_days, 0), 1), 2),
    }
    if freg_map is not None:
        properties["freg"] = freg_map[single_al.Freguesia]
    properties["weight"] = single_al.weight
    properties["endereco"] = single_al.Endereco
    properties["entradas_repetidas"] = single_al.entradas_repetidas
    properties["host_listings_number"] = single_al.host_listings_number

    al_entry = {
        "type": "Feature",
        "properties": properties,
        "geometry": {
            "type": "Point",
            "coordinates": [round(single_al.X,6),round(single_al.Y,6)],
        },
    }
    geojson["features"].append(al_entry)

# Both supported cities write to the same file, using compact separators to
# keep it small.
if freg_map is not None:
    with open('al.json', 'w') as fp:
        json.dump(geojson, fp, separators=(',', ':'))

9804it [00:00, 22929.16it/s]
