In [293]:
# =============================================================================
# 🔧 ENTITY RESOLUTION FRAMEWORK
# =============================================================================

# 1. IMPORTS & SETUP
# Python Data 
# import streamlit as st
import pandas as pd
from pydantic import BaseModel, Field

# Python Formatting & Display
import humanize 
from datetime import datetime
from textwrap import dedent

#  Snowpark
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

from snowflake.snowpark.context import get_active_session

# Cortex
import snowflake.cortex as C

In [294]:

# Define list of valid US state abbreviations
US_STATE_CODES = [
    'AL',  # Alabama
    'AK',  # Alaska 
    'AZ',  # Arizona
    'AR',  # Arkansas
    'CA',  # California
    'CO',  # Colorado
    'CT',  # Connecticut
    'DE',  # Delaware
    'FL',  # Florida
    'GA',  # Georgia
    'HI',  # Hawaii
    'ID',  # Idaho
    'IL',  # Illinois
    'IN',  # Indiana
    'IA',  # Iowa
    'KS',  # Kansas
    'KY',  # Kentucky
    'LA',  # Louisiana
    'ME',  # Maine
    'MD',  # Maryland
    'MA',  # Massachusetts
    'MI',  # Michigan
    'MN',  # Minnesota
    'MS',  # Mississippi
    'MO',  # Missouri
    'MT',  # Montana
    'NE',  # Nebraska
    'NV',  # Nevada
    'NH',  # New Hampshire
    'NJ',  # New Jersey
    'NM',  # New Mexico
    'NY',  # New York
    'NC',  # North Carolina
    'ND',  # North Dakota
    'OH',  # Ohio
    'OK',  # Oklahoma
    'OR',  # Oregon
    'PA',  # Pennsylvania
    'RI',  # Rhode Island
    'SC',  # South Carolina
    'SD',  # South Dakota
    'TN',  # Tennessee
    'TX',  # Texas
    'UT',  # Utah
    'VT',  # Vermont
    'VA',  # Virginia
    'WA',  # Washington
    'WV',  # West Virginia
    'WI',  # Wisconsin
    'WY',  # Wyoming
    'DC',  # District of Columbia
    'PR',  # Puerto Rico
    'VI',  # Virgin Islands
    'AS',  # American Samoa
    'GU',  # Guam
    'MP',  # Northern Mariana Islands
    'UM',  # United States Minor Outlying Islands
    'VI',  # Virgin Islands (duplicate in original list)
    'XX',  # Unknown state
]


In [None]:
# 2. SESSION INITIALIZATION
def initialize_session():
    try:
        session = Session.builder.configs(json.load(open("/Users/jsoliz/.creds/gpn_connection.json"))).create()
        print("🔑 Local session initialized successfully")
        return session
    except Exception as e:
        try:
            session = get_active_session()
            print("🔑 Using active Snowflake session")
            return session
        except:
            print("❌ Session initialization failed")
            return None

session = initialize_session()

 pip install snowflake-connector-python[secure-local-storage]


Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/3a7077d2-14ae-4001-a736-75e0437e2b89/saml2?SAMLRequest=lVJRb9owGPwrkfec2AkJAQuoMhhrprajhW7T3tzkAywcO7Ud0u7XzwkgdQ%2BttDfLvvPdd%2FdNrl4q4R1BG67kFIUBQR7IQpVc7qbocbP0R8gzlsmSCSVhil7BoKvZxLBK1DRr7F4%2BwHMDxnruI2lo9zBFjZZUMcMNlawCQ21B19ntDY0CQpkxoK2TQ2dKabjT2ltbU4zbtg3aQaD0DkeEEEzG2KE6yCf0RqL%2BWKPWyqpCiQvlxc30jkSISdxJOIRTWJ2Jn7k8RfCRytMJZOj1ZrPyV9%2FXG%2BRll%2BnmSpqmAr0GfeQFPD7cnAwY52APTFvhQvXbP8MkIlFgpGq3gh2gUFXdWPdr4E54CyUWasddVvliiuoDL1fP%2FNf18V5keXyolj%2FHybco0%2B38y%2B0ou9dxvt19FVangx0s8wJ5Py7NRl2zuTEN5LLr07orEiU%2BSf1wuCEJDVMakWCcjH8jb%2BH65JLZnnkx3fsIKl5oZdTWKim4hN7lgKUkTcvID2MGfuwy9Vk6GPppAiQepBA9jca4ay1Cp82hvRE9%2B988Jvgt%2B7yEd66XfLFSghev3lLpitn3awuDsL%2Fhpb%2FtoRQqxkVWlhqMcfUJodq5BmbdrlvdAMKzk%2Bq%2F

In [None]:
def display_df_info(spdf, name="DataFrame"):
    """
    Display first 10 rows and metrics for a Snowpark DataFrame
    
    Args:
        spdf: Snowpark DataFrame to analyze
        name: Name to display for the DataFrame
    """
    # Get row and column counts
    row_count = spdf.count()
    col_count = len(spdf.columns)
    
    print(f"\n📊 {name} Overview:")
    print(f"  • Rows: {humanize.intword(row_count)} ({humanize.intcomma(row_count)})")
    print(f"  • Columns: {col_count}")
    
    print("\n🔍 First 10 rows:")
    spdf.limit(10).show()

def show_full_df(df, num_rows=10):
    return df.limit(num_rows).to_pandas().style.set_properties(**{
        'text-align': 'left',
        'white-space': 'pre-wrap'
    }).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])


### **Phase 2: Data Cleansing**

In [None]:
import humanize
mdm_spdf = session.table('bi.mdm.customer')
zi_spdf = session.table('zoominfo.brick.zi_full_brick')
lookup_spdf = session.table('sandbox.conklin.mcc_sic_naics_lookup')

mdm_count = mdm_spdf.count()
zi_count = zi_spdf.count()

print("📊 Data Volume Metrics:")
print(f"  • MDM Records: {humanize.intword(mdm_count)} ({humanize.intcomma(mdm_count)})")
print(f"  • ZoomInfo Records: {humanize.intword(zi_count)} ({humanize.intcomma(zi_count)})")
print(f"  • Total Records to Process: {humanize.intword(mdm_count + zi_count)} ({humanize.intcomma(mdm_count + zi_count)})")

📊 Data Volume Metrics:
  • MDM Records: 2.2 million (2,156,229)
  • ZoomInfo Records: 146.9 million (146,908,361)
  • Total Records to Process: 149.1 million (149,064,590)


In [None]:
# Clean out NA values
def clean_na_values(df, columns_to_select=None):
    """
    Replace 'NA' string values with None in all string columns of a dataframe
    
    Args:
        df: Snowpark DataFrame to clean
        columns_to_select: Optional list of columns to select in output DataFrame
        
    Returns:
        Snowpark DataFrame with 'NA' values replaced with None
    """
    # Get all string columns
    string_columns = [field.name for field in df.schema.fields 
                     if isinstance(field.datatype, T.StringType)]

    # Create list of column transformations
    column_transformations_list = [
        F.when(F.col(column) == 'NA', None)
         .when((F.col(column) == 'x') & (column == 'FEDTAXID'), None)
         .otherwise(F.col(column))
        for column in string_columns
    ]

    # Apply all transformations at once
    cleaned_df = df.with_columns(string_columns, column_transformations_list)
    
    # Select specified columns if provided
    if columns_to_select:
        cleaned_df = cleaned_df.select(columns_to_select)
        
    return cleaned_df

mdm_nona_spdf = clean_na_values(mdm_spdf)
zi_nona_spdf = clean_na_values(zi_spdf)

show_full_df(mdm_nona_spdf)
# show_full_df(zi_nona_spdf)

Unnamed: 0,MERCHANT_SEQUENCE_KEY,HPS_CLIENT_ID,PERSONOID,SELLER_LASTWORKED,PRICING_CHAIN,UNDERWRITING_CHAIN,MERCHANT_CENTER_CHAIN,TERMINATED_DATE,EARLY_TERM_FEE,INSTALL_DATE,RECENT_DEPOSIT_DATE,ACTIVE_DATE,FIRSTNONCONVPYRLDATE,LASTPAYROLLPROCESSED,NUM_EMPLOYEES,ATTRITION_DATE,DAYS_SINCE_LASTDEPOSIT,DAYS_SINCE_LASTPAYROLLPROCESSED,TENURE_MONTHS,FIRST_DEPOSIT_DATE,ACTUAL_VOLUME_LTM,ACTUAL_VOLUME_TTM,ANNUALIZED_VOLUME_CALC,CURR_STATUS_DATE,INVOICE_PARENT_MSK,ORIGINAL_SELLER_PERSONOID,MOST_RECENT_SELLER_PERSONOID,MOST_RECENT_ACTIVE_SELLER_PERSONOID,FEDTAXID,MASKED_FEDTAXID,LEGAL_NAME,DBA_NAME,ID_TYPE,ID_TYPE_SRC,PORTFOLIO_NAME,IDENTIFIER,EMPLOYER_NUMBER,PREV_MERCH_NBRS,SALESFORCE_CUSTOMER_ID,ORACLE_FINANCIAL_CUSTOMER_ID,ORDWAY_CUSTOMER_ID,SALESFORCE_SPRINGBOARD_ID,GPI_ACCOUNT_ID,GPI_PROCESSING_ACCOUNT_ID,SELLER_NAME,SELLER_FIRST_NAME,SELLER_LAST_NAME,SELLER_WORKEMAIL,SELLER_STATUS,SELLER_ACTIVE,DEALER_NAME,PRICING_CHAIN_NAME,UNDERWRITING_CHAIN_NAME,MERCHANT_CENTER_CHAIN_NAME,CURRENT_PRICING_PROGRAM,TSYS,MERCH_AND_CLIENT,BRICK_AND_MORTAR,ECOMM,WEBADDRESS,STATUS,TERMINATED_FLAG,EARLY_TERM_FLAG,REACTIVATED,PRIMARY_CONTACT_NAME,PRIMARY_CONTACT_FIRST_NAME,PRIMARY_CONTACT_LAST_NAME,PRIMARY_CONTACT_EMAIL,PRIMARY_CONTACT_CELLPHONE,PRIMARY_CONTACT_WRKPHONE,INFOCENTRAL_EMAIL,DBA_PHONE_NUMBER,DBA_ADDRESS,DBA_CITY,DBA_STATE,DBA_ZIP,DBA_FULL_ADDRESS,DBA_COUNTRY_NAME,LEGAL_ADDRESS,LEGAL_CITY,LEGAL_STATE,LEGAL_ZIP,LEGAL_FULL_ADDRESS,LEGAL_COUNTRY_NAME,NATIONAL_ACCOUNT,AFFILIATE_ID,AFFILIATE_TYPE,ALL_AFFILIATE_IDS,VENDOR,CURRENT_SALES_CHANNEL,CUSTOMER_KEY,MCC_CODE,MCC_DESCRIPTION,INDUSTRY,STATUS_BUCKET,SIZE_TYPE,SIZE,SIZE_TIER,TSG_VOL_TIER_ID,TSG_VOL_TIER,MIDMARKET_IND,OWNER_NAME,OWNER_PHONE,OWNER_EMAIL,REV_SHARE_FLAG,LINE_OF_BUSINESS,LOB_ORG_GROUP,SURCHARGE_METHOD,HCM_STATUS,INVOICE_PARENT_NAME_DBA,INVOICE_PARENT_NAME_LEGAL,ORIGINAL_SELLER_NAME,ORIGINAL_SELLER_STATUS,MOST_RECENT_SELLER_NAME,MOST_RECENT_SELLER_STATUS,MOST_RECENT_ACTIVE_SELLER_NAME,MOST_RECENT_ACTIVE_SELLER_STATUS,ZI_C_LOCATION_ID,ZI_C_LATITUDE,ZI_C_LONGITUDE,ZI_C_SIC4,ZI_C_SIC_TOP3,ZI_C_NAICS4,ZI_C_NAICS6,ZI_C_NAICS_TOP3,ZI_C_TIER_GRADE,ZI_C_REVENUE_RANGE,ZI_C_REVENUE,ZI_C_INDUSTRIES,ZI_C_SUB_INDUSTRIES,ZI_ES_INDUSTRY,ZI_C_ESTIMATED_AGE,ZI_C_YEAR_FOUNDED,ZI_C_IS_B2B,ZI_C_EMPLOYEE_RANGE,ZI_C_EMPLOYEES,ZI_C_IS_SMALL_BUSINESS,ZI_C_IS_PUBLIC,ZI_C_NUM_LOCATIONS,ZI_C_COMPANY_ID,ZI_C_IMMEDIATE_PARENT_COMPANY_ID,ZI_C_ULTIMATE_PARENT_COMPANY_ID,BI_ZI_SIC4_DESCRIPTION,BI_ZI_SIC4_INDUSTRY,BI_ZI_SIC4_INDUSTRY_GROUP,BI_ZI_SIC4_MAJOR_GROUP
0,1516526,,,,0,0,0,2022-01-29 00:00:00,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,65,,,,,2022-01-29 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #8,MERCHANT_ID,HCSDB,HPY,650000009245237,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Terminated,Yes,No,No,STORE MANAGER,STORE,MANAGER,,,,,,412 W PANOLA ST,CARTHAGE,TX,75633,"412 W Panola St, Carthage, TX, 75633",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Terminated,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,407470334.0,32.15665,-94.34429,5411.0,5411|5912,4451.0,445110.0,445110.0,A,$0M-$1M,729.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,'1-5,5.0,1.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES
1,1516527,,,,0,0,0,2022-01-29 00:00:00,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,65,,,,,2022-01-29 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS 14,MERCHANT_ID,HCSDB,HPY,650000009245245,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Terminated,Yes,No,No,STORE MANAGER,STORE,MANAGER,,,,,,HWY 190 W,NEWTON,TX,75966,"Hwy 190 W, Newton, TX, 75966",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Terminated,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1516528,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #12,MERCHANT_ID,HCSDB,HPY,650000009245252,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,401 N 4TH ST,CROCKETT,TX,75835,"401 N 4th St, Crockett, TX, 75835",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,407470333.0,31.3209,-95.45798,5411.0,5411,4451.0,445110.0,445110.0,A,$1M-$5M,4430.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,'6-10,9.0,1.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES
3,1516529,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS 13,MERCHANT_ID,HCSDB,HPY,650000009245260,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,400 SECOND ST,HEARNE,TX,77859,"400 Second St, Hearne, TX, 77859",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1516530,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #17,MERCHANT_ID,HCSDB,HPY,650000009245278,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,210 E CHEROKEE ST,JACKSONVILLE,TX,75766,"210 E Cherokee St, Jacksonville, TX, 75766",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,367548122.0,31.96583,-95.27349,5411.0,5411,4451.0,445110.0,445110.0,B,$10M-$25M,11835.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,'51-100,79.0,1.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES
5,1516531,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #20,MERCHANT_ID,HCSDB,HPY,650000009245286,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,HWY 87 S,HEMPHILL,TX,75948,"Hwy 87 S, Hemphill, TX, 75948",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,1516532,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #21,MERCHANT_ID,HCSDB,HPY,650000009245294,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,1252 W CHURCH ST,LIVINGSTON,TX,77351,"1252 W Church St, Livingston, TX, 77351",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,436038274.0,30.7108,-94.94674,5411.0,5411|541,4451.0,445110.0,445110.0,B,$10M-$25M,11324.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,'51-100,68.0,1.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES
7,1516533,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS 24,MERCHANT_ID,HCSDB,HPY,650000009245302,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,100 S HOME ST,CORRIGAN,TX,75939,"100 S Home St, Corrigan, TX, 75939",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,437469687.0,30.99544,-94.82612,5411.0,5411,4451.0,445110.0,445110.0,B,$5M-$10M,8877.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,'21-50,35.0,1.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES
8,1516534,,,,0,0,0,NaT,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,107,,,,,2016-08-20 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS 36,MERCHANT_ID,HCSDB,HPY,650000009245310,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Installed,No,,No,STORE MANAGER,STORE,MANAGER,,,,,,HWY 326 & HWY 69,KOUNTZE,TX,77625,"Hwy 326 & Hwy 69, Kountze, TX, 77625",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Installed,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,1516535,,,,0,0,0,2022-01-25 00:00:00,,2016-08-20 00:00:00,,NaT,,,0.0,NaT,,,65,,,,,2022-01-25 00:00:00,856904,,,,09W@LApTs;s,*****Ts;s,PHILLIPS 66 COMPANY,BROOKSHIRE BROS #25,MERCHANT_ID,HCSDB,HPY,650000009245328,,,,,,,,,,,,,Inactive,Inactive-Unassigned,,,,,SRM,No,No,No,No,,Terminated,Yes,No,No,STORE MANAGER,STORE,MANAGER,,,,,,1807 W FRANK AVE,LUFKIN,TX,75904,"1807 W Frank Ave, Lufkin, TX, 75904",United States,1000 S PINE,PONCA CITY,OK,74601,"1000 S Pine, Ponca City, OK, 74601",United States,No,,,,VAPS,,2155050,5541,SERVICE STATIONS (WITH OR WITHOUT ANCILLARY SERVICES),Petroleum,Terminated,Undefined,Undefined,Undefined,,,No,Store Manager,,,No,NWS,NA Merchant Business,Cost Plus Net,,CONOCO PHILLIPS,Phillips 66 Company,,,,,,,2199147689.0,31.33342,-94.75489,5411.0,5411,4451.0,445110.0,445110.0,A,$1B-$5B,1500000.0,Retail,Grocery Retail,Retail,94.0,1928.0,1.0,"'5,001-10,000",7046.0,0.0,0.0,281.0,7685346.0,7685346.0,7685346.0,GROCERY STORES,"RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS",GROCERY STORES,FOOD STORES


In [301]:
mdm_nona_spdf.write.mode("overwrite").save_as_table("mdm_nona")
zi_nona_spdf.write.mode("overwrite").save_as_table("zi_nona")

In [357]:
# Reload the lookup table
lookup_spdf = session.table("sandbox.conklin.mcc_sic_naics_lookup").withColumn('NAICS', F.col('NAICS').cast(T.StringType()))

lookup_spdf.printSchema()
lookup_spdf.show()

root
 |-- "NAICS_DESCRIPTION": StringType(250) (nullable = True)
 |-- "MCC": StringType(16777216) (nullable = True)
 |-- "MCC_CATEGORY": StringType(16777216) (nullable = True)
 |-- "MCC_DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "SIC": StringType(16777216) (nullable = True)
 |-- "SIC_DIVISION": StringType(250) (nullable = True)
 |-- "SIC_MAJOR_GROUP_DESCRIPTION": StringType(250) (nullable = True)
 |-- "SIC_INDUSTRY_DESCRIPTION": StringType(250) (nullable = True)
 |-- "NAICS": StringType() (nullable = True)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_DESCRIPTION"                          |"MCC"  |"MCC_CATEGORY"  |"MCC_DESCRIPTION"               |"SIC"  |"SIC_DIVISION"                                   |"SIC_MAJOR_GROUP_DESCRIPTION"  |"SIC_INDUSTRY_DESCRIPTION"       |"NAI

In [358]:
lookup_spdf.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_DESCRIPTION"                          |"MCC"  |"MCC_CATEGORY"  |"MCC_DESCRIPTION"               |"SIC"  |"SIC_DIVISION"                                   |"SIC_MAJOR_GROUP_DESCRIPTION"  |"SIC_INDUSTRY_DESCRIPTION"       |"NAICS"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|LUGGAGE AND LEATHER GOODS RETAILERS          |5948   |PERSONALRTL     |Leather Goods & Luggage Stores  |5948   |RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS  |MISCELLANEOUS RETAIL           |LUGGAGE & LEATHER GOODS STORES   |458320   |
|VENDING MACHINE OPERATORS              

In [302]:
# Reload the cleaned tables
mdm_nona_spdf = session.table("mdm_nona")
print("MDM table row count:", mdm_nona_spdf.count())
mdm_nona_spdf.show()

zi_nona_spdf = session.table("zi_nona")
print("ZI table row count:", zi_nona_spdf.count())
zi_nona_spdf.show()

MDM table row count: 2156229
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Does the below number make sense

In [305]:
zi_nona_spdf.count()

146908361

### Does the above number make sense

In [None]:
# Create thin tables with just the key columns needed for matching
mdm_thin_spdf = mdm_nona_spdf.select([
    'IDENTIFIER',
    'FEDTAXID',
    'DBA_NAME',
    'DBA_PHONE_NUMBER', 
    'DBA_ADDRESS',
    'DBA_CITY',
    'DBA_STATE',
    'DBA_ZIP',
    'DBA_FULL_ADDRESS',
    'WEBADDRESS',
    'MCC_CODE',
    'MCC_DESCRIPTION',
    'INDUSTRY',
    'SIZE',
    'SIZE_TYPE'
])

zi_thin_spdf = zi_nona_spdf.select([
    'ZI_C_LOCATION_ID',
    'ZI_C_EIN',
    'ZI_C_NAME',
    'ZI_C_NAME_DISPLAY',
    'ZI_C_PHONE',
    'ZI_C_URL',
    'ZI_C_STREET',
    'ZI_C_STREET_2',
    'ZI_C_CITY',
    'ZI_C_STATE',
    'ZI_C_ZIP',
    'ZI_C_SIC4',
    'ZI_C_SIC_TOP3',
    'ZI_C_NAICS4',
    'ZI_C_NAICS6',
    'ZI_C_NAICS_TOP3',
    'ZI_C_INDUSTRY_PRIMARY',
    'ZI_ES_INDUSTRY',
    'ZI_C_EMPLOYEE_RANGE',
    'ZI_C_EMPLOYEES',
    'ZI_C_REVENUE_RANGE',
    'ZI_C_REVENUE',
    'ZI_C_INDUSTRIES',
    'ZI_C_SUB_INDUSTRIES'
]).withColumn('ZI_BEST_INDUSTRY_CODE', F.coalesce(F.col('ZI_C_NAICS6'), F.col('ZI_C_NAICS4'), F.col('ZI_C_SIC4')).cast(T.StringType()))


lookup_spdf.show()

lookup_thin_spdf = lookup_spdf.select([
    'NAICS',
    'NAICS_DESCRIPTION',
    'MCC',
    'MCC_CATEGORY',
    'MCC_DESCRIPTION',
    'SIC',
    'SIC_DIVISION',
    'SIC_MAJOR_GROUP_DESCRIPTION',
    'SIC_INDUSTRY_DESCRIPTION'
]).withColumn('BEST_INDUSTRY_CODE', F.coalesce(F.col('NAICS'), F.col('SIC')).cast(T.StringType()))

lookup_thin_spdf.show()
lookup_thin_spdf.printSchema()

# print("✅ Created thin tables with key columns for matching")


-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_DESCRIPTION"                          |"MCC"  |"MCC_CATEGORY"  |"MCC_DESCRIPTION"               |"SIC"  |"SIC_DIVISION"                                   |"SIC_MAJOR_GROUP_DESCRIPTION"  |"SIC_INDUSTRY_DESCRIPTION"       |"NAICS"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|LUGGAGE AND LEATHER GOODS RETAILERS          |5948   |PERSONALRTL     |Leather Goods & Luggage Stores  |5948   |RETAIL TRADE, BUILDING MATERIALS, & RESTAURANTS  |MISCELLANEOUS RETAIL           |LUGGAGE & LEATHER GOODS STORES   |458320   |
|VENDING MACHINE OPERATORS              

In [365]:
print(mdm_thin_spdf.count())
print(zi_thin_spdf.count())
print(lookup_thin_spdf.count())
print(lookup_thin_spdf.distinct().count())

2156229
146908361
3862
3862


In [366]:
lookup_thin_spdf.printSchema()

root
 |-- "NAICS": StringType() (nullable = True)
 |-- "NAICS_DESCRIPTION": StringType(250) (nullable = True)
 |-- "MCC": StringType(16777216) (nullable = True)
 |-- "MCC_CATEGORY": StringType(16777216) (nullable = True)
 |-- "MCC_DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "SIC": StringType(16777216) (nullable = True)
 |-- "SIC_DIVISION": StringType(250) (nullable = True)
 |-- "SIC_MAJOR_GROUP_DESCRIPTION": StringType(250) (nullable = True)
 |-- "SIC_INDUSTRY_DESCRIPTION": StringType(250) (nullable = True)
 |-- "BEST_INDUSTRY_CODE": StringType() (nullable = True)


In [371]:
# Join lookup table to MDM and ZI tables to get business descriptions
mdm_thin_with_lookup = mdm_thin_spdf.join(
    lookup_thin_spdf.select([
        F.col('NAICS').alias('LU_NAICS'),
        F.col('NAICS_DESCRIPTION').alias('LU_NAICS_DESCRIPTION'), 
        F.col('MCC').alias('LU_MCC'),
        F.col('MCC_CATEGORY').alias('LU_MCC_CATEGORY'),
        F.col('MCC_DESCRIPTION').alias('LU_MCC_DESCRIPTION'),
        F.col('SIC').alias('LU_SIC'),
        F.col('SIC_DIVISION').alias('LU_SIC_DIVISION'),
        F.col('SIC_MAJOR_GROUP_DESCRIPTION').alias('LU_SIC_MAJOR_GROUP_DESCRIPTION'),
        F.col('SIC_INDUSTRY_DESCRIPTION').alias('LU_SIC_INDUSTRY_DESCRIPTION'),
    ]),
    mdm_thin_spdf["MCC_CODE"] == F.col("LU_MCC"),
    'left'
)

mdm_thin_with_lookup.show()

zi_thin_spdf.show()

zi_thin_with_lookup = zi_thin_spdf.alias('zi').join(
    lookup_thin_spdf.select([
        F.col('BEST_INDUSTRY_CODE').alias('LU_BEST_INDUSTRY_CODE'),
        F.col('NAICS').alias('LU_NAICS'),
        F.col('NAICS_DESCRIPTION').alias('LU_NAICS_DESCRIPTION'),
        F.col('MCC').alias('LU_MCC'), 
        F.col('MCC_CATEGORY').alias('LU_MCC_CATEGORY'),
        F.col('MCC_DESCRIPTION').alias('LU_MCC_DESCRIPTION'),
        F.col('SIC').alias('LU_SIC'),
        F.col('SIC_DIVISION').alias('LU_SIC_DIVISION'),
        F.col('SIC_MAJOR_GROUP_DESCRIPTION').alias('LU_SIC_MAJOR_GROUP_DESCRIPTION'),
        F.col('SIC_INDUSTRY_DESCRIPTION').alias('LU_SIC_INDUSTRY_DESCRIPTION')
    ]).alias('lu'),
    (F.col("ZI_BEST_INDUSTRY_CODE").isNotNull() & F.col("LU_BEST_INDUSTRY_CODE").isNotNull() & (F.col("ZI_BEST_INDUSTRY_CODE") == F.col("LU_BEST_INDUSTRY_CODE"))),
    'left'
)

zi_thin_with_lookup.show()
lookup_thin_spdf.where(F.col('BEST_INDUSTRY_CODE') == '812990').show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"IDENTIFIER"     |"FEDTAXID"   |"DBA_NAME"                      |"DBA_PHONE_NUMBER"  |"DBA_ADDRESS"             |"DBA_CITY"    |"DBA_STATE"  |"DBA_ZIP"  |"DBA_FULL_ADDRESS"                           |"WEBADDRESS"                     |"MCC_CODE"  |"MCC_DESCRIPTION"                                   |"INDUSTRY"   |"SIZE"     |"SIZE_TYPE"        |"LU_NAICS"  |"LU_N

In [289]:
# Define business categories based on all lookup table columns and their combinations
AI_BUSINESS_CATEGORIES = [
    # Small Business / Restaurant Focus
    'Restaurants, Cafes, Catering, Food Trucks',
    'Independent Retail Shops, Boutiques, Specialty Stores',
    'Legal, Accounting, Consulting, Architecture Services',
    'Laundry, Cleaning, Personal Care, Pet Services',
    'Auto Repair, Car Wash, Towing, Parts Shops',
    'Doctors, Dentists, Chiropractors, Therapists',
    'Building, Plumbing, Electrical, HVAC Contractors',
    'Real Estate Agents, Property Management, Appraisers',
    'Hair Salons, Nail Salons, Spas, Massage Services',
    'Hotels, Motels, B&Bs, Inns',
    'Theaters, Arcades, Bowling Alleys, Sports Venues',
    'Gyms, Sports Clubs, Recreation Centers',
    'Tutoring, Training, Private Schools',
    'Small Scale Banking, Insurance, Investments',
    'Artists, Designers, Photographers',
    'Landscaping, Home Cleaning, Home Repairs',
    'IT Support, Computer Repair, Web Design',
    
    # Larger Categories
    'Department Stores, Big Box Retailers',
    'Large Scale Production Facilities',
    'Software Companies, Hardware Manufacturers, Telecom',
    'Heavy Equipment, Industrial Supplies',
    'Product Distributors, Warehousing Operations',
    'Hospitals, Medical Centers, Health Networks',
    'Government Agencies, Public Services',
    'Power Companies, Water Utilities, Telecommunications Infrastructure',
    'Farming Operations, Resource Extraction',
    'Shipping Companies, Freight Services, Delivery Networks',
    'Broadcasting Networks, Publishing Houses, Production Studios',
    'Research Laboratories, R&D Facilities',
    'Oil/Gas Companies, Renewable Energy Providers',
    'Military Contractors, Aviation Companies',
    'Charitable Organizations, Foundations, NGOs',
    'Churches, Synagogues, Religious Centers',
    'Universities, Colleges, School Districts',
    'Banks, Credit Unions, Investment Firms',
    'Property Development, Large Scale Construction',
    'Category-Specific National Retail Chains',
    'Food Manufacturing, Beverage Production',
    'Vehicle Manufacturing, Auto Parts Production',
    'Chemical Manufacturing, Pharmaceutical Production',
    'Major Construction Companies, Engineering Firms'
]


# Add standardized business type using AI_CLASSIFY with predefined categories
mdm_thin_enhanced = mdm_thin_with_lookup.withColumn('prompt', 
    F._concat_ws_ignore_nulls(' ',
        # F.lit('Which business category best describes this company based on:'),
        F.coalesce(F.col('MCC_DESCRIPTION'), F.lit('')),
        F.coalesce(F.col('INDUSTRY'), F.lit('')),
        F.coalesce(F.col('LU_MCC_DESCRIPTION'), F.lit('')), 
        F.coalesce(F.col('LU_SIC_INDUSTRY_DESCRIPTION'), F.lit('')),
        F.coalesce(F.col('LU_SIC_DIVISION'), F.lit('')),
        F.coalesce(F.col('LU_SIC_MAJOR_GROUP_DESCRIPTION'), F.lit(''))
    )
).withColumn(
    'AI_standardized_business_type',
    F.call_udf(
        'AI_CLASSIFY',
        F.col('prompt'),
        F.array_construct(*[F.lit(x) for x in AI_BUSINESS_CATEGORIES])
    )
).withColumn('AI_standardized_business_type', F.when(F.col('AI_standardized_business_type').isNull(), F.lit(None)).otherwise(F.col('AI_standardized_business_type')['labels'][0]))

mdm_thin_enhanced.where(F.col('MCC_DESCRIPTION').isNotNull()).show()

zi_thin_enhanced = zi_thin_with_lookup.withColumn('prompt',
    F._concat_ws_ignore_nulls(' ',
        F.lit('Which business category best describes this company based on:'),
        F.coalesce(F.col('ZI_C_INDUSTRY_PRIMARY'), F.lit('')),
        F.coalesce(F.col('ZI_ES_INDUSTRY'), F.lit('')),
        F.coalesce(F.col('LU_MCC_DESCRIPTION'), F.lit('')),
        F.coalesce(F.col('LU_SIC_INDUSTRY_DESCRIPTION'), F.lit('')),
        F.coalesce(F.col('LU_SIC_DIVISION'), F.lit('')),
        F.coalesce(F.col('LU_SIC_MAJOR_GROUP_DESCRIPTION'), F.lit(''))
    )
).withColumn(
    'AI_standardized_business_type',
    F.call_udf(
        'AI_CLASSIFY',
        F.col('prompt'),
        F.array_construct(*[F.lit(x) for x in AI_BUSINESS_CATEGORIES])
    )
).withColumn('AI_standardized_business_type', F.when(F.col('AI_standardized_business_type').isNull(), F.lit(None)).otherwise(F.col('AI_standardized_business_type')['labels'][0]))
zi_thin_enhanced.where(F.col('ZI_C_INDUSTRY_PRIMARY').isNotNull()).show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"IDENTIFIER"     |"FEDTAXID"   |"DBA_NAME"  |"DBA_PHONE_NUMBER"  |"DBA_ADDRESS"         |"DBA_CITY"  |"DBA_STATE"  |"DBA_ZIP"  |"DBA_FULL_ADDRESS"                           |"WEBADDRESS"  |"MCC_CODE"  |"MCC_DESCRIPTION"           |"INDUSTRY"  |"SIZE"  |"SIZE_TYPE"        |"LU_NAICS"  |"LU_NAICS_DESCRIPTION"          |"LU_MCC"  |"LU_MCC_CATEGORY"  |"LU_MCC_DESCRIPTION"     |"LU_SIC"  |"LU_SIC_DIVISION"   

In [308]:
zi_thin_enhanced.count()

-4421102177829706816

In [288]:
# Save the enhanced tables
mdm_thin_enhanced.write.mode("overwrite").save_as_table("mdm_thin_enhanced")
zi_thin_enhanced.write.mode("overwrite").save_as_table("zi_thin_enhanced")

KeyboardInterrupt: 

In [None]:
print('Loading enhanced tables from Snowflake...')

zi_thin_enhanced = session.table('zi_thin_enhanced')
print(f"`Reloaded: zi_thin_enhanced`: {humanize.metric(zi_thin_enhanced.count())} rows")
zi_thin_enhanced.show()

mdm_thin_enhanced = session.table('mdm_thin_enhanced')
print(f"`Reloaded: mdm_thin_enhanced`: {humanize.metric(mdm_thin_enhanced.count())} rows")
mdm_thin_enhanced.show()

print('✓ Enhanced tables loaded successfully')




-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"IDENTIFIER"        |"FEDTAXID"  |"DBA_NAME"       |"DBA_PHONE_NUMBER"  |"DBA_ADDRESS"       |"DBA_CITY"   |"DBA_STATE"  |"DBA_ZIP"  |"DBA_FULL_ADDRESS"                          |"WEBADDRESS"  |"MCC_CODE"  |"MCC_DESCRIPTION"  |"INDUSTRY"  |"SIZE"  |"SIZE_TYPE"  |"LU_NAICS"  |"LU_NAICS_DESCRIPTION"  |"LU_MCC"  |"LU_MCC_CATEGORY"  |"LU_MCC_DESCRIPTION"  |"LU_SIC"  |"LU_SIC_DIVISION"  |"LU_SIC_MAJOR_GROUP_DESCRIPTION"  |"LU_SIC_INDUSTRY_DESCRIPTION"  |"PRO

In [None]:
zi_spdf.select(F.col('ZI_C_COUNTRY')).distinct().show()

------------------
|"ZI_C_COUNTRY"  |
------------------
|Canada          |
|United Kingdom  |
|United States   |
------------------



In [None]:
zi_spdf.where(F.col('ZI_C_STATE') == 'Kent').show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
import snowflake.snowpark.window as W
zi_abberant_states_spdf = (zi_spdf
    
    .where(F.col('ZI_C_COUNTRY') == 'United States')
    .select('ZI_C_STATE')
    .groupBy('ZI_C_STATE')
    .count()
    .withColumn('percentage', F.col('count') / F.sum('count').over(W.Window.partitionBy()) * 100)
    .withColumn('row_number', F.row_number().over(W.Window.orderBy(F.col('count').desc())))
    .orderBy(F.col('count').desc())
)

# Show the full distribution of states
zi_abberant_states_spdf.show(100)
zi_abberant_states_spdf.save_as_table('zi_abberant_states_spdf').mode('overwrite')

---------------------------------------------------------------------
|"ZI_C_STATE"              |"COUNT"   |"PERCENTAGE"  |"ROW_NUMBER"  |
---------------------------------------------------------------------
|California                |14697979  |11.751900     |1             |
|Florida                   |11620745  |9.291500      |2             |
|Texas                     |10575177  |8.455500      |3             |
|New York                  |9699314   |7.755200      |4             |
|                          |6847677   |5.475100      |5             |
|Georgia                   |5197562   |4.155800      |6             |
|Pennsylvania              |4308309   |3.444700      |7             |
|North Carolina            |3656136   |2.923300      |8             |
|Illinois                  |3519952   |2.814400      |9             |
|Michigan                  |3461061   |2.767300      |10            |
|Arizona                   |3115480   |2.491000      |11            |
|Virginia           

In [None]:
# # Create standardization lookup from aberrant states dataframe

# STATE_CLEANUP_INSTRUCTIONS = """
# Convert this state to standard 2-letter abbreviation. Follow postal service standards.
#  It may be in a different format than the standard e.g. "California" -> "CA" or "Kent" -> "KY".')))
#  answer only with the 2-letter abbreviation.
# If no state is provided, returnn XX.
# Below is the state to be standardized:

# """

# zi_abberant_states_standardization_lookup = (zi_abberant_states_spdf
#     .withColumn('prompt_clean_state', F._concat_ws_ignore_nulls(' ', F.lit(STATE_CLEANUP_INSTRUCTIONS), F.col('ZI_C_STATE')))
#     .withColumn('zi_standardized_state',F.call_builtin("SNOWFLAKE.CORTEX.TRY_COMPLETE"
#                                                , F.lit('claude-3-7-sonnet')
#                                                , F.col('prompt_clean_state')
#                                                # , options
#                                                )            
#                 )
#     .withColumn('zi_standardized_state', F.when(F.col('zi_standardized_state') == 'XX', None).otherwise(F.col('zi_standardized_state')))
# )

# zi_abberant_states_standardization_lookup.write.mode("overwrite").save_as_table("zi_abberant_states_standardization_lookup")
# print("Successfully wrote zi_abberant_states_standardization_lookup table to Snowflake")


Successfully wrote zi_abberant_states_standardization_lookup table to Snowflake


In [None]:
# Reload the standardization lookup from Snowflake table
zi_abberant_states_standardization_lookup = session.table('zi_abberant_states_standardization_lookup')
print(f"`Reloaded: zi_abberant_states_standardization_lookup`: {humanize.metric(zi_abberant_states_standardization_lookup.count())}")

zi_abberant_states_standardization_lookup.select(F.col('ZI_C_STATE'), F.col('zi_standardized_state')).distinct().show()



`Reloaded: zi_abberant_states_standardization_lookup`: 56.0
------------------------------------------------------------------------------------------------------------------------------------------
|"ZI_C_STATE"    |"COUNT"   |"PERCENTAGE"  |"ROW_NUMBER"  |"PROMPT_CLEAN_STATE"                                |"ZI_STANDARDIZED_STATE"  |
------------------------------------------------------------------------------------------------------------------------------------------
|North Carolina  |3656136   |2.923300      |8             |                                                    |NC                       |
|                |          |              |              |Convert this state to standard 2-letter abbrevi...  |                         |
|                |          |              |              | It may be in a different format than the stand...  |                         |
|                |          |              |              | answer only with the 2-letter abbreviation.   

In [None]:
zi_thin_std_spdf = (zi_thin_enhanced
.join(zi_abberant_states_standardization_lookup.selectExpr("ZI_C_STATE as abberant_state, zi_standardized_state")
, F.col('ZI_C_STATE') == F.col('abberant_state')
, 'left')
)

zi_thin_std_spdf.write.mode("overwrite").save_as_table("zi_thin_std_spdf")

------------------------------------------------------
|"ZI_C_STATE"              |"ZI_STANDARDIZED_STATE"  |
------------------------------------------------------
|Massachusetts             |MA                       |
|Oklahoma                  |OK                       |
|Pennsylvania              |PA                       |
|Texas                     |TX                       |
|Florida                   |FL                       |
|Tennessee                 |TN                       |
|Colorado                  |CO                       |
|Arkansas                  |AR                       |
|Washington                |WA                       |
|Kansas                    |KS                       |
|Indiana                   |IN                       |
|New Mexico                |NM                       |
|Georgia                   |GA                       |
|Wisconsin                 |WI                       |
|Hawaii                    |HI                       |
|Delaware 

In [None]:
zi_thin_std_spdf = session.table('zi_thin_std_spdf')
print(f"Reloaded: zi_thin_std_spdf: {humanize.metric(zi_thin_std_spdf.count())}")