In [None]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io
import warnings
warnings.filterwarnings('ignore')

# Connection string - store this in environment variable or Azure Key Vault
connection_string = "YOUR_AZURE_STORAGE_CONNECTION_STRING_HERE"

# Create BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Connect to bronze container
container_name = "bronze"
container_client = blob_service_client.get_container_client(container_name)

# Verify connection
print(f"Connected to container: {container_name}")

# List blobs in the container
print("\nBlobs in bronze container:")
for blob in container_client.list_blobs():
    print(f"  - {blob.name}")

Connected to container: bronze

Blobs in bronze container:
  - SalesLT
  - SalesLT/Address
  - SalesLT/Address/Address.parquet
  - SalesLT/Customer
  - SalesLT/Customer/Customer.parquet
  - SalesLT/CustomerAddress
  - SalesLT/CustomerAddress/CustomerAddress.parquet
  - SalesLT/Product
  - SalesLT/Product/Product.parquet
  - SalesLT/ProductCategory
  - SalesLT/ProductCategory/ProductCategory.parquet
  - SalesLT/ProductDescription
  - SalesLT/ProductDescription/ProductDescription.parquet
  - SalesLT/ProductModel
  - SalesLT/ProductModel/ProductModel.parquet
  - SalesLT/ProductModelProductDescription
  - SalesLT/ProductModelProductDescription/ProductModelProductDescription.parquet
  - SalesLT/SalesOrderDetail
  - SalesLT/SalesOrderDetail/SalesOrderDetail.parquet
  - SalesLT/SalesOrderHeader
  - SalesLT/SalesOrderHeader/SalesOrderHeader.parquet


In [23]:
# Try with fastparquet engine
try:
    df = pd.read_parquet(io.BytesIO(data), engine='fastparquet')
    print(f"✓ Loaded with fastparquet: {df.shape[0]} rows, {df.shape[1]} columns")
    print(df.head())
except Exception as e:
    print(f"Error with fastparquet: {e}")
    print("\nTry running: pip install fastparquet")

✓ Loaded with fastparquet: 450 rows, 9 columns
   AddressID         AddressLine1 AddressLine2      City StateProvince  \
0          9    8713 Yosemite Ct.         None   Bothell    Washington   
1         11  1318 Lasalle Street         None   Bothell    Washington   
2         25     9178 Jumping St.         None    Dallas         Texas   
3         28     9228 Via Del Sol         None   Phoenix       Arizona   
4         32    26910 Indela Road         None  Montreal        Quebec   

   CountryRegion PostalCode                               rowguid ModifiedDate  
0  United States      98011  268af621-76d7-4c78-9441-144fd139821a   2006-07-01  
1  United States      98011  981b3303-aca2-49c7-9a96-fb670785b269   2007-04-01  
2  United States      75201  c8df3bd9-48f0-4654-a8dd-14a67a84d3c6   2006-09-01  
3  United States      85004  12ae5ee1-fc3e-468b-9b92-3b970b169774   2005-09-01  
4         Canada    H1Y 2H5  84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d   2006-08-01  


In [24]:

# Display the full table
df

Unnamed: 0,AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
0,9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01
1,11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01
2,25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01
3,28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01
4,32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01
...,...,...,...,...,...,...,...,...,...
445,1104,9927 N. Main St.,,Tooele,Utah,United States,84074,ce0fa600-8761-402c-a6c0-84fb6244984b,2005-08-01
446,1105,566 S. Main,,Cedar City,Utah,United States,84720,c16f92ab-817f-42d1-a560-bc0f86fe2783,2006-07-01
447,11380,165 North Main,,Austin,Texas,United States,78701,412e3088-1fb8-4a1f-b3a8-abb23a4f01c1,2006-07-01
448,11381,2000 300th Street,,Denver,Colorado,United States,80203,a9dbb460-a624-4e6a-aa22-311ee18f6746,2005-09-01


In [26]:
def clean_data(df):
    # Update ModifiedDate to DD-MM-YYYY format
    df['ModifiedDate'] = df['ModifiedDate'].dt.strftime('%d-%m-%Y')
    # Update column names to follow conventions
    df.columns = (
        df.columns
        .str.replace('id', '_id', case=False)  # Replace 'id' with '_id'
        .str.replace(r'(?<!^)(?=[A-Z])', '_', regex=True)  # Add underscore before uppercase in the middle
        .str.replace(r'(?=\d)', '_', regex=True)  # Add underscore before numbers
        .str.lower()  # Convert all to lowercase
    )
    # Remove the 'rowgu_id' column from df
    df = df.drop(columns=['rowgu_id'])
    # Convert modified_date to a date column
    df['modified_date'] = pd.to_datetime(df['modified_date'])
    return df

df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,address_id,address_line_1,address_line_2,city,state_province,country_region,postal_code,modified_date
0,9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,2006-01-07
1,11,1318 Lasalle Street,,Bothell,Washington,United States,98011,2007-01-04
2,25,9178 Jumping St.,,Dallas,Texas,United States,75201,2006-01-09
3,28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,2005-01-09
4,32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,2006-01-08
