In [1]:
import json
import os

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Connect to the PostgreSQL database.
# The connection URL is read from the DATABASE_URL environment variable so that
# real credentials do not have to be stored in the notebook. The fallback is the
# original local development URL, kept so existing local runs still work.
conn_string = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:manager@localhost:5432/postgres",
)
db = create_engine(conn_string)

In [2]:
# Build one JSON object per location: the `features` and `cuisines` array
# columns of ta_location_details are unnested into one row per value, stacked
# with UNION ALL, and then aggregated per location_id into an object whose keys
# are those values, each mapped to the constant 1.
# NOTE(review): a location whose two arrays yield no rows will not appear in
# the result at all — confirm that is intended.
query = """
WITH exploded_features AS (
    SELECT
        location_id,
        unnest(features) AS feature
    FROM
        ta_location_details
    UNION ALL
    SELECT
        location_id,
        unnest(cuisines) AS feature
    FROM
        ta_location_details
)
SELECT
    location_id,
    jsonb_object_agg(feature, 1) AS features
FROM
    exploded_features
GROUP BY
    location_id;
"""


# Run the query through the SQLAlchemy engine and preview the first rows
# (columns: location_id, features).
df_features = pd.read_sql(query, db)
df_features.head()

Unnamed: 0,location_id,features
0,4528830,"{'Pizza': 1, 'Entrega': 1, 'Italiana': 1, 'Par..."
1,6004734,"{'Brasileira': 1, 'Para levar': 1, 'Sul-americ..."
2,7863162,"{'Bufê': 1, 'Familiar': 1, 'Reservas': 1, 'Bra..."
3,4086016,"{'Churrasco': 1, 'Brasileira': 1, 'Para levar'..."
4,7077377,"{'Bar': 1, 'Reservas': 1, 'Brasileira': 1, 'Pa..."


In [3]:
# Normalize the per-location feature objects into separate columns, one column
# per feature name. Features a location does not have come out as NaN, as the
# displayed frame shows.
df_features_expanded = pd.json_normalize(df_features['features'])
df_features_expanded

Unnamed: 0,Pizza,Entrega,Italiana,Para levar,Serviço de mesa,Acesso para cadeirantes,Brasileira,Sul-americana,Bufê,Familiar,...,Havaiana,Comidas terapêuticas,Britânica,Austríaca,Catalunha,Afegã,Africana,Belga,Nova Zelândia,Argelina
0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,,
1,,,,1.0,,1.0,1.0,1.0,,,...,,,,,,,,,,
2,,,,1.0,,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
3,,,,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,
4,,,,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,,,,,,,1.0,1.0,,,...,,,,,,,,,,
1648,,,,,,,,,,,...,,,,,,,,,,
1649,,,,,,,,,,,...,,,,,,,,,,
1650,1.0,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Add location_id back onto the expanded DataFrame.
# NOTE(review): this column assignment matches rows by index label, so it
# presumably relies on both frames still carrying the same default index —
# confirm if either frame is ever filtered or re-sorted before this cell.
df_features_expanded['location_id'] = df_features['location_id']
df_features_expanded

Unnamed: 0,Pizza,Entrega,Italiana,Para levar,Serviço de mesa,Acesso para cadeirantes,Brasileira,Sul-americana,Bufê,Familiar,...,Comidas terapêuticas,Britânica,Austríaca,Catalunha,Afegã,Africana,Belga,Nova Zelândia,Argelina,location_id
0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,,4528830
1,,,,1.0,,1.0,1.0,1.0,,,...,,,,,,,,,,6004734
2,,,,1.0,,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,7863162
3,,,,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,4086016
4,,,,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,7077377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,,,,,,,1.0,1.0,,,...,,,,,,,,,,21349098
1648,,,,,,,,,,,...,,,,,,,,,,5351235
1649,,,,,,,,,,,...,,,,,,,,,,23260273
1650,1.0,,,,,,,,,,...,,,,,,,,,,23313276


In [5]:
# Fill missing values with 0 (a missing value means the feature is absent
# for that location).
df_features_expanded = df_features_expanded.fillna(0)

In [6]:
# Display the expanded DataFrame
df_features_expanded

Unnamed: 0,Pizza,Entrega,Italiana,Para levar,Serviço de mesa,Acesso para cadeirantes,Brasileira,Sul-americana,Bufê,Familiar,...,Comidas terapêuticas,Britânica,Austríaca,Catalunha,Afegã,Africana,Belga,Nova Zelândia,Argelina,location_id
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4528830
1,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6004734
2,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7863162
3,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4086016
4,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7077377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21349098
1648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5351235
1649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23260273
1650,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23313276


In [7]:
# Load the descriptive and rating columns of the original table
# (ta_location_details), keeping location_id as the join key.
df_original = pd.read_sql("""SELECT name, 
                          ranking_position, 
                          rating, 
                          num_reviews, 
                          review_rating_1, 
                          review_rating_2, 
                          review_rating_3, 
                          review_rating_4, 
                          review_rating_5, 
                          food_rating, 
                          service_rating, 
                          value_rating, 
                          price_level,   
                          location_id 
                          FROM 
                          ta_location_details""", db)

# Join the original columns with the expanded feature columns on location_id.
# how='inner' keeps only locations that are present in both frames.
df_full = pd.merge(df_original, df_features_expanded, on='location_id', how='inner')

# Show the first rows of the joined DataFrame
print(df_full.head())

                          name  ranking_position  rating  num_reviews  \
0                Ile de France             255.0     4.0          178   
1          Yu Cozinha Oriental             160.0     4.5          238   
2              Mangiare Felice              65.0     4.5          531   
3        Armazém Santo Antônio              85.0     4.5          299   
4  Lellis Trattoria - Curitiba              74.0     4.5          826   

   review_rating_1  review_rating_2  review_rating_3  review_rating_4  \
0                7               16               24               36   
1                4                4               21               95   
2                9               17               40              180   
3                7                6               14               80   
4               17               23               67              266   

   review_rating_5  food_rating  ...  Havaiana  Comidas terapêuticas  \
0               95          4.5  ...       0.0    

In [8]:
# Ordinal encoding for the price-level symbols: a larger number means a
# more expensive tier.
price_level_mapping = {'$': 1, '$$ - $$$': 2, '$$$$': 3}

# The nested dict restricts the substitution to the `price_level` column;
# values that are not keys of the mapping are left as they are.
df_full.replace(to_replace={'price_level': price_level_mapping}, inplace=True)

In [9]:
# Load the weekly opening-hours text for every location, skipping rows whose
# value is the empty literal '{}'.
df_weekdays = pd.read_sql("""SELECT weekdays_opening_hours,
                          location_id 
                          FROM 
                          ta_location_details
                          where weekdays_opening_hours <> '{}' """, db)

In [10]:
# Function that classifies an hour into a period of the day
def classify_period(hour):
    """Return the period-of-day label for ``hour``.

    The periods are half-open hour ranges: [6, 12) is "morning",
    [12, 17) is "afternoon" and [17, 22) is "evening". Any other value
    (which includes 22-23 and 0-5) is "night".
    """
    # (label, first hour of the period, first hour after the period)
    daytime_bands = (
        ("morning", 6, 12),
        ("afternoon", 12, 17),
        ("evening", 17, 22),
    )
    for label, first, past in daytime_bands:
        if first <= hour < past:
            return label
    return "night"

In [11]:
# Function that converts one day's opening-hours text into per-period open flags
def process_day_hours(day_hours):
    """Flag the periods of the day during which a restaurant is open.

    Parameters
    ----------
    day_hours : str or missing value
        Text of the form ``"<day>: HH:MM - HH:MM[, HH:MM - HH:MM ...]"``.
        A missing value, or text containing ``"Closed"``, means closed all day.

    Returns
    -------
    dict
        The keys "morning", "afternoon", "evening" and "night", each set to 1
        when at least one listed range touches that period (as defined by
        ``classify_period``) and to 0 otherwise. The period containing the
        closing hour is always counted as open.

    Raises
    ------
    IndexError, ValueError
        If the text does not follow the layout above (no ``": "`` separator,
        a range without exactly one ``" - "``, or a non-numeric hour).
    """
    periods = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0}

    if pd.isna(day_hours) or "Closed" in day_hours:
        return periods

    # Everything after the first ": " is the comma-separated list of ranges
    time_ranges = day_hours.split(': ')[1].split(', ')
    for time_range in time_ranges:
        start_time, end_time = time_range.split(' - ')
        # Only the hour part is used; % 24 folds a "24:00" close onto midnight
        start_hour = int(start_time.split(':')[0]) % 24
        end_hour = int(end_time.split(':')[0]) % 24

        # Walk hour by hour from opening to closing, wrapping past midnight.
        # Comparing only the period labels of the two endpoints would miss
        # ranges such as "05:00 - 23:00" or "00:00 - 23:59", which start and
        # end in "night" yet cover every other period in between.
        hours_open = (end_hour - start_hour) % 24
        for offset in range(hours_open + 1):
            periods[classify_period((start_hour + offset) % 24)] = 1

    return periods

In [12]:
def transform_opening_hours(row):
    """Expand a row's ``weekdays_opening_hours`` text into binary open flags.

    Parameters
    ----------
    row : mapping (for example a DataFrame row)
        Must provide ``"location_id"`` and ``"weekdays_opening_hours"``. The
        latter is expected to look like
        ``{"<day>: <hours>","<day>: <hours>",...}``.

    Returns
    -------
    pandas.Series
        ``location_id`` plus, for every day entry found, four 0/1 values named
        ``<day>_Open_Morning``, ``<day>_Open_Afternoon``, ``<day>_Open_Evening``
        and ``<day>_Open_Night``. Only ``location_id`` is returned when the
        text cannot be split (for example when it is not a string) or when it
        holds no entries.
    """
    result = {"location_id": row["location_id"]}  # keep the join key

    # Split the brace-wrapped, quoted, comma-separated text into day entries
    try:
        # Drop the surrounding braces, then split on the "," between entries
        entries = row['weekdays_opening_hours'].strip('{}').split('","')
        # Remove the double quotes left on the first and last entries
        entries = [entry.strip('"') for entry in entries]
    except Exception as e:
        # Best effort: report the problem and return a row with only the key
        print(f"Erro ao parsear JSON: {e}")
        return pd.Series(result)

    for day_hours in entries:
        # An empty value such as "{}" splits into a single blank entry, which
        # has no day or hours to parse and would otherwise raise downstream
        if not day_hours.strip():
            continue

        # The day name is everything before the first colon
        day = day_hours.split(':')[0]
        periods = process_day_hours(day_hours)

        # One binary column per (day, period) pair
        for period, value in periods.items():
            result[f"{day}_Open_{period.capitalize()}"] = value

    return pd.Series(result)

In [13]:
# Apply the transformation to every row of df_weekdays (axis=1 passes one row
# at a time), producing one output row per input row.
df_transformed = df_weekdays.apply(transform_opening_hours, axis=1)

# Combine the transformed opening-hours columns with df_full on location_id.
# how='inner' keeps only locations that are present in both frames.
df_final = pd.merge(df_full, df_transformed, on='location_id', how='inner')

# Show the final DataFrame
print(df_final)

                               name  ranking_position  rating  num_reviews  \
0                     Ile de France             255.0     4.0          178   
1               Yu Cozinha Oriental             160.0     4.5          238   
2                   Mangiare Felice              65.0     4.5          531   
3             Armazém Santo Antônio              85.0     4.5          299   
4       Lellis Trattoria - Curitiba              74.0     4.5          826   
..                              ...               ...     ...          ...   
938         Maxi Pizza - Cristo Rei               NaN     0.0            0   
939               Yakitori Curitiba            2871.0     3.0            5   
940                      Tapa Madre            1717.0     5.0            1   
941  Pizzaria Baggio - Praça Osório            2801.0     3.0            1   
942      Ba-Chan Japanese Fast Food               NaN     0.0            0   

     review_rating_1  review_rating_2  review_rating_3  review_

In [14]:
# Save the result back to PostgreSQL as table `ta_features_expanded`.
# if_exists='replace' overwrites the table when it already exists, and
# index=False leaves the DataFrame index out of the written columns.
df_final.to_sql('ta_features_expanded', db, if_exists='replace', index=False)

171