# ETL JSON


## About data conversion

| Source | Typical Format | Notes |
|--------|---------------|-------|
| APIs | JSON, XML | Almost always hierarchical; needs flattening into DataFrame or table for analysis |
| Web scraping | HTML, JSON, CSV | Usually converted into DataFrames for processing |
| Logs | JSON, plain text, CSV | Parsed and structured into DataFrames or database tables |
| Files from partners | Excel, CSV, JSON, Parquet | Some may already be tabular (CSV/Excel/Parquet), some need transformation |
| Databases | SQL tables | Already structured; can be queried directly into DataFrame without conversion |

In [5]:
import json
import pandas as pd

# Paths of the raw POS log export and of the cleaned CSV produced from it.
# The prefix is "qsr" (matching the file names used by the cells below),
# not "gsr".
INPUT_FILE = "qsr_pos_logs.json"
OUTPUT_CSV = "qsr_pos_logs_cleaned.csv"

In [6]:
import sys
# Print the path of the Python interpreter executing this cell, to confirm
# which environment the notebook kernel is using.
print(sys.executable)


c:\Users\User\AppData\Local\Programs\Python\Python313\python.exe


In [9]:
# Read the raw export. The encoding is given explicitly because open()
# otherwise falls back to the locale's default (a legacy code page on
# Windows), which can mis-decode non-ASCII characters in UTF-8 JSON.
with open("qsr_pos_logs.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Inspect the result once the file handle has been released.
print(type(data))  # Check the type of data
print(data[0])

<class 'list'>
{'order_id': 120000, 'store_id': 102, 'transaction_datetime': '2025-07-05 06:14:08', 'business_day': '2025-07-05', 'daypart': 'Breakfast', 'service_mode': 'Drive-Thru', 'menu_item': 'Bottled Water', 'modifier': None, 'quantity': 1, 'unit_price': 1.48, 'discount': 0.15, 'tax': 0.1, 'total_amount': 1.43, 'payment_type': 'Cash'}


## From AI: a full mini-ETL pipeline for processing JSON

In [10]:
def extract(file_path):
    """Read a JSON file and return the parsed Python data structure.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content
        (a list, dict, or scalar, depending on the document).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON interchange files are UTF-8. Without an explicit encoding, open()
    # uses the locale default (a legacy code page on Windows) and may
    # mis-decode or reject non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def transform(data):
    """Flatten parsed JSON records into a DataFrame and parse timestamps.

    Nested objects are flattened by ``pd.json_normalize``. When a
    'transaction_datetime' column is present, it is converted with
    ``pd.to_datetime``; all other columns are left as produced by the
    flattening step.
    """
    frame = pd.json_normalize(data)

    timestamp_column = 'transaction_datetime'
    if timestamp_column not in frame.columns:
        # Nothing to convert; hand back the flattened frame unchanged.
        return frame

    frame[timestamp_column] = pd.to_datetime(frame[timestamp_column])
    # Further clean-up steps (renaming columns, filtering rows, ...) go here.
    return frame

def load(df, output_path_csv=None, output_path_excel=None):
    """Write the DataFrame to each output path that was supplied.

    A CSV file is written when ``output_path_csv`` is truthy and an Excel
    file when ``output_path_excel`` is truthy; both are written without the
    index column. With neither path given, the call does nothing.
    """
    destinations = (
        (output_path_csv, "to_csv"),
        (output_path_excel, "to_excel"),
    )
    for target, writer_name in destinations:
        if target:
            # Look the writer up only when needed, CSV first, then Excel.
            getattr(df, writer_name)(target, index=False)

def etl_pipeline(input_file, output_csv=None, output_excel=None):
    """Run extract, transform and load in sequence and return the DataFrame.

    ``input_file`` is read with ``extract``, the result is passed through
    ``transform``, and the resulting DataFrame is handed to ``load`` together
    with the optional CSV and Excel output paths. A completion message is
    printed before the DataFrame is returned.
    """
    df = transform(extract(input_file))
    load(df, output_path_csv=output_csv, output_path_excel=output_excel)
    print("ETL completed successfully!")
    return df

# Example usage
if __name__ == "__main__":
    # Run the pipeline on the POS log export, write the cleaned CSV,
    # then preview the first rows of the result.
    df_result = etl_pipeline("qsr_pos_logs.json", output_csv="qsr_pos_logs_cleaned.csv")
    print(df_result.head())


ETL completed successfully!
   order_id  store_id transaction_datetime business_day    daypart  \
0    120000       102  2025-07-05 06:14:08   2025-07-05  Breakfast   
1    120001       201  2025-07-05 11:48:51   2025-07-05      Lunch   
2    120002       403  2025-04-13 08:05:35   2025-04-13  Breakfast   
3    120003       301  2025-06-25 06:42:17   2025-06-25  Breakfast   
4    120004       301  2025-08-01 12:58:36   2025-08-01      Lunch   

  service_mode            menu_item   modifier  quantity  unit_price  \
0   Drive-Thru        Bottled Water       None         1        1.48   
1      Dine-In     Chicken Sandwich       None         1        5.34   
2      Takeout        Bottled Water  Light Ice         1        1.49   
3   Drive-Thru           Egg Muffin  No Cheese         1        3.62   
4   Drive-Thru  Spicy Chicken Combo       None         2        9.17   

   discount   tax  total_amount payment_type  
0      0.15  0.10          1.43         Cash  
1      0.00  0.45       