In [330]:
import pandas as pd
import glob
import os
import pandas as pd

# Glob pattern and directory used to discover the raw CSV inputs.
PATTERN = '*.csv'
INPUT_PATH = "/home/lucia/Documentos/themeparks/data_themeparks"
# CSV that is re-read alongside the raw inputs whenever it already exists.
OUTPUT_FILE = "themparks_database.csv"

# Lower-case park identifier -> timezone name used to localise timestamps.
TIMEZONES = dict(
    shanghaidisneyresort='Asia/Shanghai',
    disneyland='America/Los_Angeles',
    disneycaliforniaadventure='America/Los_Angeles',
    waltdisneystudiosparis='Europe/Paris',
    disneylandparkparis='Europe/Paris',
    epcot='America/New_York',
    animalkingdom='America/New_York',
    disneyhollywoodstudios='America/New_York',
    disneymagickingdom='America/New_York',
)

def _coerce_is_open(values):
    """Return a boolean Series that is True only for explicit "open" markers.

    The column may arrive as real booleans, as the strings 'True'/'False', or
    as an object column that also contains missing values. A plain
    ``astype(bool)`` would turn every missing value (NaN) and every non-empty
    string into True, so the values are compared as normalised text instead;
    anything that is not an explicit true marker, including a missing value,
    counts as closed.
    """
    text = values.astype(str).str.strip().str.lower()
    return text.isin(['true', '1', '1.0'])


def extract_files(file_list, output_file=None):
    """Load the raw CSV files and, when present, the already saved CSV.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the raw CSV files. Each must contain the columns
        ``is_open``, ``wait_time``, ``land_id`` and ``ride_id``.
    output_file : str, optional
        Path of a CSV that is appended to the result when it exists.
        Defaults to the module-level ``OUTPUT_FILE``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per raw file, in the order given, followed by the frame read
        from ``output_file`` if that file exists. The list is empty when there
        is nothing to read.

    Notes
    -----
    In the raw frames ``is_open`` is coerced to bool (missing -> False) and
    ``wait_time``, ``land_id`` and ``ride_id`` are coerced to int, with
    unparseable or missing values replaced by 0. The frame read from
    ``output_file`` is appended as read, without these coercions.
    """
    if output_file is None:
        output_file = OUTPUT_FILE

    dfs = []
    for file in file_list:
        df = pd.read_csv(file)
        df['is_open'] = _coerce_is_open(df['is_open'])
        for column in ('wait_time', 'land_id', 'ride_id'):
            df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0).astype(int)
        dfs.append(df)

    if os.path.exists(output_file):
        dfs.append(pd.read_csv(output_file))

    return dfs

def transform(dfs, timezones=None):
    """Combine the loaded frames, clean them and add park-local time columns.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with at least ``last_update``, ``park_name``, ``land_name``,
        ``is_open`` and ``wait_time``. A frame may already carry the derived
        columns listed below (presumably when it comes from a previously saved
        result; the save step is outside this view). Those columns are dropped
        and recomputed.
    timezones : dict, optional
        Mapping of lower-case park name to timezone name. Defaults to the
        module-level ``TIMEZONES``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, sorted by index, with ``last_update`` in UTC plus
        the derived columns ``timezone``, ``local_time``, ``local_date``,
        ``local_hour``, ``local_minute``, ``local_day_of_week`` and
        ``local_weekday``. When no row survives the filters an empty frame
        with the same columns is returned.

    Raises
    ------
    ValueError
        If ``dfs`` is empty (raised by ``pd.concat``) or if a surviving row
        belongs to a park that has no entry in ``timezones``.
    """
    if timezones is None:
        timezones = TIMEZONES

    derived_columns = [
        'timezone', 'local_time', 'local_date', 'local_hour',
        'local_minute', 'local_day_of_week', 'local_weekday',
    ]

    # Normalise every frame on its own before concatenating: each source may
    # spell its timestamps differently, and parsing a mixed-format column in
    # one go can coerce the minority format to NaT. Dropping stale derived
    # columns here also lets rows from different sources compare equal.
    normalized = []
    for frame in dfs:
        frame = frame.drop(columns=derived_columns, errors='ignore')
        frame = frame.assign(
            last_update=pd.to_datetime(frame['last_update'], errors='coerce', utc=True)
        )
        normalized.append(frame)

    df = pd.concat(normalized, ignore_index=True)
    df = df.dropna(subset=['last_update'])
    # De-duplicate only after timestamps are comparable UTC values.
    df = df.drop_duplicates()

    # Drop whole (timestamp, park) snapshots in which every ride is closed
    # with a zero wait time.
    df = df.groupby(['last_update', 'park_name']).filter(
        lambda snapshot: not (
            snapshot['is_open'].eq(False) & snapshot['wait_time'].eq(0)
        ).all()
    )

    df = df[~df['land_name'].str.endswith('(Closed)', na=False)]

    # assign() returns a new frame, so no column is written onto a slice.
    df = df.assign(timezone=df['park_name'].str.lower().map(timezones))

    unmapped = df.loc[df['timezone'].isna(), 'park_name'].unique()
    if len(unmapped):
        raise ValueError(
            "No timezone configured for park(s): "
            + ", ".join(map(str, unmapped))
        )

    # tz_convert takes a single zone per call, so localise zone by zone.
    zone_frames = []
    for zone in df['timezone'].unique():
        zone_rows = df[df['timezone'] == zone]
        local_time = zone_rows['last_update'].dt.tz_convert(zone)
        zone_frames.append(
            zone_rows.assign(
                local_time=local_time,
                local_date=local_time.dt.date,
                local_hour=local_time.dt.hour,
                local_minute=local_time.dt.minute,
                local_day_of_week=local_time.dt.dayofweek,
                local_weekday=local_time.dt.strftime('%A'),
            )
        )

    if not zone_frames:
        # Nothing survived the filters: keep the output schema stable.
        return df.reindex(columns=[*df.columns, *derived_columns[1:]])

    # Concatenate once, after the loop, then restore the original row order.
    return pd.concat(zone_frames).sort_index()

In [331]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# makes the concatenation order (and therefore which duplicate is kept and the
# resulting row labels) reproducible from run to run.
file_list = sorted(glob.glob(os.path.join(INPUT_PATH, PATTERN)))
dfs = extract_files(file_list)
df = transform(dfs)
df


Unnamed: 0,land_id,land_name,ride_id,ride_name,is_open,wait_time,last_update,park_name,timezone,local_time,local_date,local_hour,local_minute,local_day_of_week,local_weekday
0,779,Adventure Isle,3079,Camp Discovery,False,0,2025-01-21 13:55:23+00:00,shanghaidisneyresort,Asia/Shanghai,2025-01-21 21:55:23+08:00,2025-01-21,21,55,1,Tuesday
1,779,Adventure Isle,3893,Challenge Trails at Camp Discovery,False,0,2025-01-21 13:55:23+00:00,shanghaidisneyresort,Asia/Shanghai,2025-01-21 21:55:23+08:00,2025-01-21,21,55,1,Tuesday
2,779,Adventure Isle,2986,Roaring Rapids,False,0,2025-01-21 13:55:23+00:00,shanghaidisneyresort,Asia/Shanghai,2025-01-21 21:55:23+08:00,2025-01-21,21,55,1,Tuesday
3,779,Adventure Isle,3883,Selfie Spot with Disney Jungle Characters,False,0,2025-01-21 13:55:23+00:00,shanghaidisneyresort,Asia/Shanghai,2025-01-21 21:55:23+08:00,2025-01-21,21,55,1,Tuesday
4,779,Adventure Isle,3002,Soaring Over the Horizon,False,0,2025-01-21 13:55:23+00:00,shanghaidisneyresort,Asia/Shanghai,2025-01-21 21:55:23+08:00,2025-01-21,21,55,1,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71988,27,Toon Studio,34,RC Racer,True,20,2025-01-24 17:55:05+00:00,waltdisneystudiosparis,Europe/Paris,2025-01-24 18:55:05+01:00,2025-01-24,18,55,4,Friday
71989,27,Toon Studio,7280,RC Racer Single Rider,True,10,2025-01-24 17:55:05+00:00,waltdisneystudiosparis,Europe/Paris,2025-01-24 18:55:05+01:00,2025-01-24,18,55,4,Friday
71990,27,Toon Studio,36,Slinky® Dog Zigzag Spin,True,5,2025-01-24 17:55:05+00:00,waltdisneystudiosparis,Europe/Paris,2025-01-24 18:55:05+01:00,2025-01-24,18,55,4,Friday
71991,27,Toon Studio,35,Toy Soldiers Parachute Drop,False,0,2025-01-24 17:55:05+00:00,waltdisneystudiosparis,Europe/Paris,2025-01-24 18:55:05+01:00,2025-01-24,18,55,4,Friday


In [318]:
# Distinct park identifiers, in order of first appearance in the frame.
parques = df['park_name'].drop_duplicates().tolist()
parques

['shanghaidisneyresort',
 'waltdisneystudiosparis',
 'disneylandparkparis',
 'epcot',
 'animalkingdom',
 'disneycaliforniaadventure',
 'disneymagickingdom',
 'disneyhollywoodstudios',
 'disneyland']

In [332]:
# Print a summary of the transformed frame: index range, per-column non-null
# counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61848 entries, 0 to 71992
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   land_id            61848 non-null  int64              
 1   land_name          61848 non-null  object             
 2   ride_id            61848 non-null  int64              
 3   ride_name          61848 non-null  object             
 4   is_open            61848 non-null  bool               
 5   wait_time          61848 non-null  int64              
 6   last_update        61848 non-null  datetime64[ns, UTC]
 7   park_name          61848 non-null  object             
 8   timezone           61848 non-null  object             
 9   local_time         61848 non-null  object             
 10  local_date         61848 non-null  object             
 11  local_hour         61848 non-null  int32              
 12  local_minute       61848 non-null  int32           