In [None]:
import pandas as pd
import os
import glob
import logging

# --- Configuration ---
# The folder containing your raw data files
RAW_DATA_DIR = 'raw/json' 
# Supported file extensions to convert FROM
SUPPORTED_INPUT_EXTENSIONS = ['*.parquet', '*.json']

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def convert_files_to_csv():
    """
    Iterates through the RAW_DATA_DIR, converts all supported files (parquet, json)
    to CSV format, and saves the new CSV files in the same directory.
    """
    logging.info(f"Starting data conversion in directory: {RAW_DATA_DIR}")

    if not os.path.isdir(RAW_DATA_DIR):
        logging.error(f"Directory not found: {RAW_DATA_DIR}. Please create it or adjust the RAW_DATA_DIR variable.")
        return

    # List to track all processed files
    processed_files_count = 0
    
    # 1. Iterate over all supported extensions
    for ext_pattern in SUPPORTED_INPUT_EXTENSIONS:
        # Create the full path pattern for glob (e.g., 'data/raw/*.parquet')
        search_path = os.path.join(RAW_DATA_DIR, ext_pattern)
        
        # Use glob.glob to find all matching files
        for input_filepath in glob.glob(search_path):
            try:
                # Get the base name without the original extension
                base_name = os.path.splitext(os.path.basename(input_filepath))[0]
                # Define the new output CSV file path
                output_filepath = os.path.join(RAW_DATA_DIR, f"{base_name}.csv")

                logging.info(f"Processing file: {input_filepath}")

                # 2. Read the file into a Pandas DataFrame
                if ext_pattern == '*.parquet':
                    df = pd.read_parquet(input_filepath)
                elif ext_pattern == '*.json':
                    # Assuming standard line-delimited or simple JSON structure
                    # Adjust 'lines=True' if your JSON is a single object array
                    df = pd.read_json(input_filepath, lines=True) 
                else:
                    # This case should ideally not be reached
                    continue

                # 3. Write the DataFrame to a CSV file
                # index=False prevents writing the DataFrame's index as a column
                df.to_csv(output_filepath, index=False)
                
                logging.info(f"Successfully converted to: {output_filepath}")
                processed_files_count += 1

            except Exception as e:
                logging.error(f"Error processing file {input_filepath}: {e}")

    logging.info(f"Conversion complete. Total files processed: **{processed_files_count}**.")

if __name__ == "__main__":

    convert_files_to_csv()

2025-12-07 17:15:42,214 - INFO - Starting data conversion in directory: raw/json
2025-12-07 17:15:42,215 - INFO - Processing file: raw/json/ticketmaster_events_20251207_023443.json
2025-12-07 17:15:42,341 - ERROR - Error processing file raw/json/ticketmaster_events_20251207_023443.json: Expected object or value
2025-12-07 17:15:42,356 - INFO - Processing file: raw/json/ticketmaster_us_music_20251207_031136.json
2025-12-07 17:15:42,502 - ERROR - Error processing file raw/json/ticketmaster_us_music_20251207_031136.json: Expected object or value
2025-12-07 17:15:42,514 - INFO - Conversion complete. Total files processed: **0**.


In [4]:
import pandas as pd
import os
import glob
import logging
import json
from pandas import json_normalize # json_normalize is now a top-level function

# --- Configuration ---
# The folder containing your raw data files
RAW_DATA_DIR = 'raw/json' 
# Supported file extensions to convert FROM
SUPPORTED_INPUT_EXTENSIONS = ['*.parquet', '*.json']

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def convert_files_to_csv():
    """
    Iterates through the RAW_DATA_DIR, converts all supported files (parquet, json)
    to CSV format, and saves the new CSV files in the same directory.
    
    Includes robust logic for nested JSON files common with API outputs.
    """
    logging.info(f"Starting data conversion in directory: {RAW_DATA_DIR}")

    if not os.path.isdir(RAW_DATA_DIR):
        logging.error(f"Directory not found: {RAW_DATA_DIR}. Please create it or adjust the RAW_DATA_DIR variable.")
        return

    # List to track all processed files
    processed_files_count = 0
    
    # 1. Iterate over all supported extensions
    for ext_pattern in SUPPORTED_INPUT_EXTENSIONS:
        # Create the full path pattern for glob (e.g., 'data/raw/*.parquet')
        search_path = os.path.join(RAW_DATA_DIR, ext_pattern)
        
        # Use glob.glob to find all matching files
        for input_filepath in glob.glob(search_path):
            try:
                # Get the base name without the original extension
                base_name = os.path.splitext(os.path.basename(input_filepath))[0]
                # Define the new output CSV file path
                output_filepath = os.path.join(RAW_DATA_DIR, f"{base_name}.csv")

                logging.info(f"Processing file: {input_filepath}")

                # 2. Read the file into a Pandas DataFrame
                if ext_pattern == '*.parquet':
                    # Read Parquet file
                    df = pd.read_parquet(input_filepath)

                elif ext_pattern == '*.json':
                    # Read JSON file with robust nested handling
                    with open(input_filepath, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # --- ADVANCED JSON PARSING LOGIC ---
                    data_to_normalize = None
                    
                    if isinstance(data, list):
                        # Case 1: The root object is already a list of records
                        data_to_normalize = data
                    elif '_embedded' in data and 'events' in data['_embedded']:
                        # Case 2: Common Ticketmaster structure
                        data_to_normalize = data['_embedded']['events']
                    elif 'events' in data:
                        # Case 3: Data is directly under an 'events' key
                        data_to_normalize = data['events']
                    else:
                        # Case 4: Fallback to try and normalize the root dictionary
                        data_to_normalize = data

                    if data_to_normalize is None or (isinstance(data_to_normalize, list) and not data_to_normalize):
                         # Handle case where the list is empty or the key was not found
                         logging.warning(f"  -> Skipped: Could not find array of records to flatten in {base_name}. JSON may be empty or nested differently.")
                         continue
                        
                    # Flatten the JSON structure, expanding nested dictionaries into columns (e.g., '_embedded.events.venue.name')
                    # Use errors='ignore' to skip non-list/dict values if the key is not strictly a list
                    df = json_normalize(data_to_normalize, errors='ignore')

                else:
                    # This case should ideally not be reached
                    continue

                # 3. Write the DataFrame to a CSV file
                # index=False prevents writing the DataFrame's index as a column
                df.to_csv(output_filepath, index=False)
                
                logging.info(f"Successfully converted to: {output_filepath} (Rows: {len(df)})")
                processed_files_count += 1

            except Exception as e:
                logging.error(f"Error processing file {input_filepath}: {e}")

    logging.info(f"Conversion complete. Total files processed: **{processed_files_count}**.")

if __name__ == "__main__":
    # Note: Requires 'pip install pandas pyarrow' 
    convert_files_to_csv()

2025-12-07 17:20:43,868 - INFO - Starting data conversion in directory: raw/json
2025-12-07 17:20:43,875 - INFO - Processing file: raw/json/ticketmaster_events_20251207_023443.json


2025-12-07 17:20:44,448 - INFO - Successfully converted to: raw/json/ticketmaster_events_20251207_023443.csv (Rows: 1000)
2025-12-07 17:20:44,448 - INFO - Processing file: raw/json/ticketmaster_us_music_20251207_031136.json
2025-12-07 17:20:44,999 - INFO - Successfully converted to: raw/json/ticketmaster_us_music_20251207_031136.csv (Rows: 16050)
2025-12-07 17:20:45,000 - INFO - Conversion complete. Total files processed: **2**.


In [3]:
import pandas as pd
import os
import glob

# --- Configuration ---
DATA_DIR = 'raw/parquet' 
PARQUET_PATTERN = os.path.join(DATA_DIR, '*.parquet')

print("--- Parquet File Size and Integrity Check ---")

parquet_files = glob.glob(PARQUET_PATTERN)

if not parquet_files:
    print(f"No .parquet files found in {DATA_DIR}.")
else:
    for filepath in parquet_files:
        filename = os.path.basename(filepath)
        
        # 1. Get file size from the OS
        file_size_bytes = os.path.getsize(filepath)
        file_size_mb = file_size_bytes / (1024 * 1024)
        
        # 2. Check data integrity and record count without loading the full DataFrame
        try:
            # Use 'engine='pyarrow'' for potentially faster metadata reading
            # This loads only the metadata/schema, not the full data.
            metadata = pd.read_parquet(filepath, engine='pyarrow', columns=[]).index
            
            # The length of the index will give the row count
            row_count = len(metadata)
            
            print(f"\nFile: **{filename}**")
            print(f"  -> Disk Size: {file_size_mb:.2f} MB")
            print(f"  -> Row Count: {row_count:,}")
            
            if row_count == 0:
                print("  -> WARNING: File is empty (0 rows).")

        except Exception as e:
            print(f"\nFile: **{filename}**")
            print(f"  -> ERROR: Failed to read metadata. File may be corrupted. Details: {e}")

--- Parquet File Size and Integrity Check ---

File: **setlistfm_us_concerts_20251207_080243.parquet**
  -> Disk Size: 0.23 MB
  -> Row Count: 0

File: **predicthq_events_20251207_054022.parquet**
  -> Disk Size: 0.00 MB
  -> Row Count: 0

File: **phq_events_full_20251207_060912.parquet**
  -> Disk Size: 2.23 MB
  -> Row Count: 0

File: **setlistfm_us_concerts_20251207_080539.parquet**
  -> Disk Size: 0.42 MB
  -> Row Count: 0

File: **phq_events_20251207_054509.parquet**
  -> Disk Size: 0.03 MB
  -> Row Count: 0

File: **phq_events_full_20251207_065623.parquet**
  -> Disk Size: 2.23 MB
  -> Row Count: 0

File: **setlistfm_us_concerts_20251207_074105.parquet**
  -> Disk Size: 0.01 MB
  -> Row Count: 0

File: **musicbrainz_structured_features.parquet**
  -> Disk Size: 5.30 MB
  -> Row Count: 0

File: **phq_events_20251207_054229.parquet**
  -> Disk Size: 0.01 MB
  -> Row Count: 0

File: **phq_events_full_20251207_063822.parquet**
  -> Disk Size: 2.23 MB
  -> Row Count: 0
