# Loading Data From CSV to retail.CustomerAccount Table. Can be retested. It truncates the Table first. 

In [1]:
# Load CustomerAccount CSV data into retail.CustomerAccount table
# This script reads a CSV file, cleans the data, applies transformations, and writes it to a Delta table.
# ALWAYS TRUNCATES existing data before loading - designed for development/testing scenarios

# the Sample CSV file has account opening date from January 1, 2020 to March 31, 2025 

from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, DateType
import os

# Configuration
print("üöÄ CUSTOMERACCOUNT DATA LOADING")
print("=" * 50)
print("üìÇ Source: Files/SampleData/retail/CustomerAccount.csv")
print("üéØ Target: retail.CustomerAccount table")
print("‚ö†Ô∏è  WARNING: This script ALWAYS truncates existing data first")

try:
    # Step 0: ALWAYS truncate existing data first
    print("\nüóëÔ∏è TRUNCATING EXISTING DATA...")
    try:
        # Get current count efficiently
        existing_count = spark.sql("SELECT COUNT(*) as count FROM retail.CustomerAccount").collect()[0]['count']
        print(f"üìä Current records in table: {existing_count}")
        
        # Always truncate regardless of count
        spark.sql("TRUNCATE TABLE retail.CustomerAccount")
        print("‚úÖ Table truncated successfully")
        
        # Verify truncation with efficient count
        new_count = spark.sql("SELECT COUNT(*) as count FROM retail.CustomerAccount").collect()[0]['count']
        print(f"üìä Records after truncation: {new_count}")
        
    except Exception as e:
        print(f"‚ÑπÔ∏è Table may not exist yet: {str(e)}")
        print("   Will proceed with creating new table...")

    # Step 1: Read CSV file with header and no schema inference
    print("\nüìñ Reading CSV file...")
    folder_path = 'Files/SampleData/retail/'
    full_file_path = os.path.join(folder_path, 'CustomerAccount.csv')
    
    df_raw = spark.read \
        .option("header", "true") \
        .option("inferSchema", "false") \
        .csv(full_file_path)
    
    print(f"‚úÖ CSV loaded: {df_raw.count()} rows")
    
    # Step 2: Display and clean column names (handle whitespace)
    print("\nüßπ Cleaning column names...")
    print("Original columns:")
    for i, col_name in enumerate(df_raw.columns):
        print(f"  {i+1}. '{col_name}'")
    
    # Clean column names by trimming whitespace
    cleaned_columns = [col_name.strip() for col_name in df_raw.columns]
    print("\nCleaned columns:")
    for i, col_name in enumerate(cleaned_columns):
        print(f"  {i+1}. '{col_name}'")
    
    # Rename columns to remove whitespace
    for old_name, new_name in zip(df_raw.columns, cleaned_columns):
        if old_name != new_name:
            df_raw = df_raw.withColumnRenamed(old_name, new_name)
            print(f"  Renamed: '{old_name}' ‚Üí '{new_name}'")
    
    # Step 3: Trim whitespace from string values
    print("\nüßπ Trimming whitespace from string values...")
    string_columns = [field.name for field in df_raw.schema.fields if field.dataType == StringType()]
    for col_name in string_columns:
        df_raw = df_raw.withColumn(col_name, F.trim(F.col(col_name)))
    print(f"‚úÖ Trimmed {len(string_columns)} string columns")
    
    # Step 4: Apply data type conversions with proper date handling
    print("\nüîÑ Converting data types...")
    df_typed = df_raw.select(
        F.col("CustomerId").cast(StringType()),
        F.col("CustomerAccountId").cast(StringType()),
        # Handle custom date format "July 12, 2022"
        F.to_date(F.col("CustomerAccountOpenedDate"), "MMMM d, yyyy").alias("CustomerAccountOpenedDate"),
        F.to_date(F.col("CustomerAccountClosedDate"), "MMMM d, yyyy").alias("CustomerAccountClosedDate"),
        F.col("IsoCurrencyCode").cast(StringType()),
        F.col("GlobalServiceRelationshipNumber").cast(DecimalType(18,1))
    )
    
    print("‚úÖ Data types converted with custom date format")
    
    # Step 5: Transform to match table structure (following reference pattern)
    print("\nüîß Transforming to match retail.CustomerAccount schema...")
    df_final = df_typed.select(
        F.col("CustomerAccountId"),
        F.lit(None).cast(StringType()).alias("CustomerAccountName"),
        F.col("CustomerAccountOpenedDate"),
        F.col("CustomerAccountClosedDate"),
        F.col("GlobalServiceRelationshipNumber"),
        F.col("CustomerId"),
        F.lit(None).cast(StringType()).alias("ResponsibilityCenterId"),
        F.lit(None).cast(StringType()).alias("SubaccountOfCustomerAccountId"),
        F.lit(None).cast(StringType()).alias("LedgerId"),
        F.lit(None).cast(StringType()).alias("LedgerAccountNumber"),
        F.col("IsoCurrencyCode"),
        F.lit(None).cast(StringType()).alias("CustomerAccountTypeId"),
        F.lit(None).cast(IntegerType()).alias("CustomerAccountApplicationNumber")
    )
    
    print("‚úÖ Schema transformation complete")
    
    # Step 6: Show sample of cleaned data (limited to 3 rows to save time)
    print("\nüìä Sample of transformed data (first 3 rows):")
    df_final.show(3, truncate=False)
    
    # Step 7: Write to table (following reference pattern with format and mode)
    print("\nüíæ Writing to retail.CustomerAccount table...")
    df_final.write \
        .format('delta') \
        .mode('overwrite') \
        .option("overwriteSchema", "true") \
        .saveAsTable('retail.CustomerAccount')
    
    final_count = df_final.count()
    print(f"‚úÖ Successfully loaded {final_count} records into retail.CustomerAccount")
    
    # Step 8: Efficient validation using COUNT only
    print("\nüîç Validating loaded data...")
    table_count = spark.sql("SELECT COUNT(*) as count FROM retail.CustomerAccount").collect()[0]['count']
    
    if table_count == final_count:
        print(f"‚úÖ Validation passed: {table_count} records in table")
    else:
        print(f"‚ö†Ô∏è Count mismatch: Expected {final_count}, Found {table_count}")
    
    # Show schema only (no data retrieval)
    print("\nüìä Table schema:")
    spark.table("retail.CustomerAccount").printSchema()
    
    print("\nüéâ CUSTOMERACCOUNT LOADING COMPLETE!")
    print("=" * 50)

except Exception as e:
    print(f"\n‚ùå Error loading CustomerAccount data:")
    print(f"   {str(e)}")
    print("\nüîß TROUBLESHOOTING:")
    print("1. Ensure CustomerAccount.csv is uploaded to Files/SampleData/retail/")
    print("2. Check CSV file format and encoding")
    print("3. Verify retail schema exists (run CreateShemaAndTablesPySpark.ipynb)")
    print("4. Check lakehouse permissions")
    
    # Show detailed error for debugging
    import traceback
    print(f"\nDetailed error:")
    print(traceback.format_exc())

StatementMeta(, e0a36677-6d2f-4ede-9bfc-2d9c55ff0991, 3, Finished, Available, Finished)

üöÄ CUSTOMERACCOUNT DATA LOADING
üìÇ Source: Files/SampleData/retail/CustomerAccount.csv
üéØ Target: retail.CustomerAccount table

üóëÔ∏è TRUNCATING EXISTING DATA...
üìä Current records in table: 440
‚úÖ Table truncated successfully
üìä Records after truncation: 0

üìñ Reading CSV file...
‚úÖ CSV loaded: 440 rows

üßπ Cleaning column names...
Original columns:
  1. 'Index'
  2. 'CustomerId'
  3. 'CustomerAccountId'
  4. 'FirstName'
  5. 'LastName'
  6. 'Gender'
  7. 'CustomerAccountOpenedDate'
  8. 'CustomerAccountClosedDate'
  9. 'IsoCurrencyCode'
  10. 'GlobalServiceRelationshipNumber'

Cleaned columns:
  1. 'Index'
  2. 'CustomerId'
  3. 'CustomerAccountId'
  4. 'FirstName'
  5. 'LastName'
  6. 'Gender'
  7. 'CustomerAccountOpenedDate'
  8. 'CustomerAccountClosedDate'
  9. 'IsoCurrencyCode'
  10. 'GlobalServiceRelationshipNumber'

üßπ Trimming whitespace from string values...
‚úÖ Trimmed 10 string columns

üîÑ Converting data types...
‚úÖ Data types converted with custom d

# Verify Results 

In [2]:
# Basic query to see all records
df = spark.sql("SELECT * FROM retail.CustomerAccount")
# df.show()

# Or display with better formatting
df.show(20, truncate=False)

# Now you should also be able to use the Fabric Lakehouse GUI 'Table View' to see the data loaded


StatementMeta(, e0a36677-6d2f-4ede-9bfc-2d9c55ff0991, 4, Finished, Available, Finished)

+-----------------+-------------------+-------------------------+-------------------------+-------------------------------+--------------+----------------------+-----------------------------+--------+-------------------+---------------+---------------------+--------------------------------+
|CustomerAccountId|CustomerAccountName|CustomerAccountOpenedDate|CustomerAccountClosedDate|GlobalServiceRelationshipNumber|CustomerId    |ResponsibilityCenterId|SubaccountOfCustomerAccountId|LedgerId|LedgerAccountNumber|IsoCurrencyCode|CustomerAccountTypeId|CustomerAccountApplicationNumber|
+-----------------+-------------------+-------------------------+-------------------------+-------------------------------+--------------+----------------------+-----------------------------+--------+-------------------+---------------+---------------------+--------------------------------+
|Account_No_1     |NULL               |2022-07-12               |NULL                     |1000000000001000.0             |C