In [2]:
import pandas as pd
import json

In [3]:
# Inicializa un DataFrame vacío
df_hotels_details = pd.DataFrame()
# Número total de conjuntos de datos que deseas leer
num_datasets = 201
# Ruta base donde se encuentran los archivos JSON
path_base = "../2. Datasets/original/hotelbeds/"

In [23]:
# Itera sobre los nombres de los archivos y los concatena en el DataFrame total
for i in range(1, num_datasets + 1):
    file_json = f"hotels_us_details_api_{i}.json"
    path_file = path_base + file_json
    
    # Lee el archivo JSON y conviértelo en un DataFrame
    df = pd.read_json(path_file)
    
    # Concatena el DataFrame actual al DataFrame total
    df_hotels_details = pd.concat([df_hotels, df], ignore_index=True)

In [24]:
df_hotels_d = df_hotels_details.copy()

In [12]:
# Carga el JSON desde un archivo (reemplaza esto con tu propia ruta de archivo)
with open(path_base + "hotels_us_details_api_1.json", "r") as archivo:
    data_json = json.load(archivo)

# Accede a la parte de "hotels" si existe en el JSON
if "hotels" in data_json:
    hotels_data = data_json["hotels"]
    # Ahora hotels_data contiene la parte correspondiente a "hotels"
else:
    print("La clave 'hotels' no existe en el JSON.")

In [13]:
df_hotels_d = pd.DataFrame(hotels_data)

In [14]:
df_hotels_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   code               200 non-null    int64 
 1   name               200 non-null    object
 2   description        200 non-null    object
 3   country            200 non-null    object
 4   state              200 non-null    object
 5   destination        200 non-null    object
 6   zone               200 non-null    object
 7   coordinates        200 non-null    object
 8   category           200 non-null    object
 9   categoryGroup      200 non-null    object
 10  chain              162 non-null    object
 11  accommodationType  200 non-null    object
 12  boards             197 non-null    object
 13  segments           193 non-null    object
 14  address            200 non-null    object
 15  postalCode         200 non-null    object
 16  city               200 non-null    object
 1

In [15]:
# Define una función para verificar si un elemento es un objeto JSON o diccionario
def is_json_or_dict(element):
    try:
        if isinstance(element, dict):
            return True
        elif isinstance(element, str):
            json.loads(element)
            return True
        else:
            return False
    except (json.JSONDecodeError, TypeError):
        return False

In [16]:
# Aplica la función a cada elemento del DataFrame y crea una columna de booleanos
columnn_json = df_hotels_d.applymap(is_json_or_dict)

In [17]:
# Muestra solo las columnas que tienen al menos un True
columnn_json.any()

code                 False
name                  True
description           True
country               True
state                 True
destination           True
zone                  True
coordinates           True
category              True
categoryGroup         True
chain                 True
accommodationType     True
boards               False
segments             False
address               True
postalCode            True
city                  True
email                False
phones               False
rooms                False
facilities           False
interestPoints       False
images               False
wildcards            False
web                   True
lastUpdate           False
S2C                  False
ranking              False
terminals            False
issues               False
license              False
dtype: bool

In [29]:
# Define una función para extraer el contenido de 'content'
def extract_content(json_dict):
    if isinstance(json_dict, dict):
        return json_dict.get('content', '')
    else:
        return ''

In [30]:
# Aplica la función a la columna y crea una nueva columna con el contenido extraído
df_hotels_d['name'] = df_hotels_d['name'].apply(extract_content)
df_hotels_d['description'] = df_hotels_d['description'].apply(extract_content)
df_hotels_d['address'] = df_hotels_d['address'].apply(extract_content)
df_hotels_d['city'] = df_hotels_d['city'].apply(extract_content)

In [31]:
def extract_longitude(json_dict):
    if isinstance(json_dict, dict):
        return json_dict.get('longitude', '')
    else:
        return ''
    
def extract_latitude(json_dict):
    if isinstance(json_dict, dict):
        return json_dict.get('latitude', '')
    else:
        return ''

In [32]:
df_hotels_d['longitude'] = df_hotels_d['coordinates'].apply(extract_longitude)
df_hotels_d['latitude'] = df_hotels_d['coordinates'].apply(extract_latitude)

In [33]:
# Eliminar columnas con datos null: 'coordinates', 'license', 'issues'
df_hotels_d = df_hotels_d.drop(columns=['license', 'issues'])
# Eliminar Columna ya gestionada y dividida
df_hotels_d = df_hotels_d.drop(columns=['coordinates'])

In [18]:
df_hotels_d.head()

Unnamed: 0,code,name,description,country,state,destination,zone,coordinates,category,categoryGroup,...,interestPoints,images,wildcards,web,lastUpdate,S2C,ranking,terminals,issues,license
0,6474,{'content': 'Hilton Chicago'},{'content': 'Our Hotel's Policies Have Changed...,"{'code': 'US', 'isoCode': 'US', 'description':...","{'code': 'IL', 'name': 'ILLINOIS'}","{'code': 'ORD', 'name': {'content': 'Chicago -...","{'zoneCode': 2, 'name': 'Downtown', 'descripti...","{'longitude': -87.6244, 'latitude': 41.8725}","{'code': '4EST', 'description': {'content': '4...","{'code': 'GRUPO4', 'description': {'content': ...",...,"[{'facilityCode': 10, 'facilityGroupCode': 100...","[{'type': {'code': 'HAB', 'description': {'con...","[{'roomType': 'ROO.LK', 'roomCode': 'ROO', 'ch...",www3.hilton.com/en/hotels/illinois/hilton-chic...,2023-07-28,3*,47,,,
1,6478,{'content': 'Four Points by Sheraton Los Angel...,{'content': 'This charming hotel is just a mil...,"{'code': 'US', 'isoCode': 'US', 'description':...","{'code': 'CA', 'name': 'CALIFORNIA'}","{'code': 'LAX', 'name': {'content': 'Los Angel...","{'zoneCode': 9, 'name': 'Los Angeles Internati...","{'longitude': -118.3859708, 'latitude': 33.948...","{'code': '3EST', 'description': {'content': '3...","{'code': 'GRUPO3', 'description': {'content': ...",...,,"[{'type': {'code': 'RES', 'description': {'con...",,http://www.fourpointslax.com/la-airport-hotel,2023-07-28,3*,82,,,
2,6480,{'content': 'Sheraton Universal Hotel'},"{'content': 'This upscale, landmark hotel enjo...","{'code': 'US', 'isoCode': 'US', 'description':...","{'code': 'CA', 'name': 'CALIFORNIA'}","{'code': 'LAX', 'name': {'content': 'Los Angel...","{'zoneCode': 21, 'name': 'Universal Studios / ...","{'longitude': -118.35955291287974, 'latitude':...","{'code': '3EST', 'description': {'content': '3...","{'code': 'GRUPO3', 'description': {'content': ...",...,,"[{'type': {'code': 'GEN', 'description': {'con...",,,2023-07-28,4*,119,,,
3,6483,{'content': 'Westin Copley Place'},{'content': 'Experience one of Boston's most c...,"{'code': 'US', 'isoCode': 'US', 'description':...","{'code': 'MA', 'name': 'MASSACHUSETTS'}","{'code': 'BOS', 'name': {'content': 'Boston - ...","{'zoneCode': 2, 'name': 'Back Bay', 'descripti...","{'longitude': -71.077539, 'latitude': 42.348577}","{'code': 'H4_5', 'description': {'content': '4...","{'code': 'GRUPO4', 'description': {'content': ...",...,,"[{'type': {'code': 'GEN', 'description': {'con...","[{'roomType': 'DBL.ST', 'roomCode': 'DBL', 'ch...",http://www.westincopleyplaceboston.com/,2023-07-28,3*,62,,,
4,6487,{'content': 'Wyndham Garden New Orleans Airport'},{'content': 'Wyndham Garden New Orleans Airpor...,"{'code': 'US', 'isoCode': 'US', 'description':...","{'code': 'LA', 'name': 'LOUISIANA'}","{'code': 'MSY', 'name': {'content': 'New Orlea...","{'zoneCode': 8, 'name': 'Metairie', 'descripti...","{'longitude': -90.1227, 'latitude': 30}","{'code': '3EST', 'description': {'content': '3...","{'code': 'GRUPO3', 'description': {'content': ...",...,,"[{'type': {'code': 'RES', 'description': {'con...","[{'roomType': 'DBL.KG', 'roomCode': 'DBL', 'ch...",www.wyndhamhotels.com,2023-07-28,,55,,,


In [35]:
df_hotels_c.to_csv(path_base + "hotels_dataset.csv")

In [36]:
# Seleccionar solo la columna code para realizar la extracción a través de la otra api
df_hotels_c['code'].to_csv(path_base + "hotels_code.csv")