Day 2: Loops over Lists/Dicts (Pipeline Config Iteration)

In [8]:
import os
from pathlib import Path

import pandas as pd

# Location of the Titanic CSV. The default is the original local path, so
# existing runs behave exactly as before; set the TITANIC_CSV environment
# variable to run the notebook on another machine without editing this cell.
TITANIC_CSV = Path(
    os.environ.get(
        "TITANIC_CSV",
        "/Users/darvikkunalbanda/DATA_ENGINEERING/cloud_learnings/data/titanic.csv",
    )
)

df = pd.read_csv(TITANIC_CSV)
# Preview the first three rows, then report the total row count.
print(f"Titanic loaded : {df.head(3)}" , "\n" ,len(df), "rows")

Titanic loaded :    survived  pclass                                               name  \
0         0       3                            Braund, Mr. Owen Harris   
1         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2         1       3                             Heikkinen, Miss. Laina   

      sex   age     fare  sibsp  parch  
0    male  22.0   7.2500      1      0  
1  female  38.0  71.2833      1      0  
2  female  26.0   7.9250      0      0   
 714 rows


Exercise 1: Basic for loop over list

In [None]:
# Simulated pipeline inputs: each entry stands in for a file to process.
sources = ["passengers.csv", "cabins.csv", "tickets.csv"]

# No file is actually opened here; every step reports the row count of the
# DataFrame `df` that is already in memory.
row_count = len(df)

print("===== Pipeline sources ===== \n")
for file_name in sources:
    print(f"Loading {file_name}...")
    print(f" {file_name} loaded ({row_count} rows) \n")

print("\n Pipeline complete!")

===== Pipeline sources ===== 

Loading passengers.csv...
 passengers.csv loaded (714 rows) 

Loading cabins.csv...
 cabins.csv loaded (714 rows) 

Loading tickets.csv...
 tickets.csv loaded (714 rows) 


 Pipeline complete!


In [23]:
# Simulated pipeline run that numbers each step.
sources = ["passengers.csv", "cabins.csv", "tickets.csv"]

print("=== Pipeline Started === \n")

# start=1 yields 1-based step numbers directly, so no "+ 1" is needed in the
# f-string below.
for step, source in enumerate(sources, start=1):
    print(f"Step {step} : {source}")
    print(f"{source} loading")
    print(f"{source} loaded {len(df)} rows \n")

print("\n Pipeline Completed")

=== Pipeline Started === 

Step 1 : passengers.csv
passengers.csv loading
passengers.csv loaded 714 rows 

Step 2 : cabins.csv
cabins.csv loading
cabins.csv loaded 714 rows 

Step 3 : tickets.csv
tickets.csv loading
tickets.csv loaded 714 rows 


 Pipeline Completed


Exercise 2: Loop over dict config

In [24]:
# Pipeline configuration with two sections:
#   "databases": mapping of a logical database name to its connection details
#   "files":     list of file entries, each with a path and an expected row count
pipeline_config = dict(
    databases={
        "SNOWFLAKE_WH": {"host": "snowflake.com", "warehouse": "COMPUTE_WH"},
        "POSTGRES_SALES": {"host": "pg.company.com", "db": "sales"},
    },
    files=[
        {"path": "titanic.csv", "rows_expected": 891},
        {"path": "orders.csv", "rows_expected": 5000},
    ],
)

In [37]:
# Inspect the (name, details) pairs of the "databases" section.
pipeline_config["databases"].items()

dict_items([('SNOWFLAKE_WH', {'host': 'snowflake.com', 'warehouse': 'COMPUTE_WH'}), ('POSTGRES_SALES', {'host': 'pg.company.com', 'db': 'sales'})])

In [63]:
# Walk the "databases" section: one simulated connection message per entry.
print("=== Extract step : looping over config ===")
for db_nm, details in pipeline_config["databases"].items():
    print(f"Connecting to {db_nm} : {details['host']}")

print("\n === File Loads ===")
# Every configured file is compared against the row count of the DataFrame
# `df` already in memory; the files themselves are not read here. The count
# does not change between iterations, so it is computed once.
actual_rows = len(df)
for file_info in pipeline_config["files"]:
    # Single quotes inside the f-strings keep this cell valid on Python
    # versions before 3.12, where reusing the enclosing quote character
    # inside an f-string is a SyntaxError.
    if actual_rows >= file_info["rows_expected"]:
        print(f"{file_info['path']} : {actual_rows} rows ==> COUNT OK")
    else:
        print(f" WARNING: LOW ROWS ==> {file_info['path']}")
        print(f" Expected: {file_info['rows_expected']} rows , Actual: {actual_rows} rows")

=== Extract step : looping over config ===
Connecting to SNOWFLAKE_WH : snowflake.com
Connecting to POSTGRES_SALES : pg.company.com

 === File Loads ===
 Expected: 891 rows , Acutal: 714 rows
 Expected: 5000 rows , Acutal: 714 rows


Exercise 3: Nested loop – tables → columns

Conditional Expression

value_if_true if condition else value_if_false

In [73]:
# Table name -> column names to scan. The "passengers" columns are taken from
# `df`; the "cabins" columns are listed by hand.
tables = {
    "passengers": df[["name", "age", "fare"]].columns.tolist(),
    "cabins": ["CabinId", "Class", "Deck"]
}

print("=== Data quality scan (nested loops) ===")
for table_name, columns in tables.items():
    print(f"\nScanning {table_name} ({len(columns)} cols):")
    for col in columns:
        # Columns that are not in `df` cannot be counted and are reported as "N/A".
        in_frame = col in df.columns
        if in_frame:
            null_count = df[col].isnull().sum()
        else:
            null_count = "N/A"
        print(f"{col}: {null_count} nulls")
        # The threshold check only applies to real counts; `in_frame` guards
        # against comparing the "N/A" placeholder string with a number.
        if in_frame and null_count > 100:
            print("HIGH NULLS")


=== Data quality scan (nested loops) ===

Scanning passengers (3 cols):
name: 0 nulls
age: 0 nulls
fare: 0 nulls

Scanning cabins (3 cols):
CabinId: N/A nulls
Class: N/A nulls
Deck: N/A nulls
