# Industry Classification Entity Resolution



## Purpose
This notebook focuses on entity resolution for industry classification lookup tables. We'll use AI functions to standardize and match industry codes across three different classification systems:

### Target Tables
- **NAICS** (North American Industry Classification System)
- **SIC** (Standard Industrial Classification)  
- **MCC** (Merchant Category Codes)

### Approach
1. Load and explore each classification table
2. Apply AI_CLASSIFY to standardize industry descriptions
3. Use AI_SIMILARITY to identify matching classifications across systems
4. Create unified industry mapping for entity resolution


In [281]:
# =============================================================================
# 🔧 ENTITY RESOLUTION FRAMEWORK
# =============================================================================

# 1. IMPORTS & SETUP
# Python Data 
import pandas as pd
from pydantic import BaseModel, Field
import json

# Python Formatting & Display
import humanize 
from datetime import datetime
from textwrap import dedent

#  Snowpark
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
import snowflake.snowpark.window as W
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session

# Cortex
import snowflake.cortex as C

# Helper Functions
def display_df_info(spdf, name="DataFrame"):
    """
    Display first 10 rows and metrics for a Snowpark DataFrame
    
    Args:
        spdf: Snowpark DataFrame to analyze
        name: Name to display for the DataFrame
    """
    # Get row and column counts
    row_count = spdf.count()
    col_count = len(spdf.columns)
    
    print(f"\n📊 {name} Overview:")
    print(f"  • Rows: {humanize.intword(row_count)} ({humanize.intcomma(row_count)})")
    print(f"  • Columns: {col_count}")
    
    print("\n🔍 First 10 rows:")
    spdf.limit(10).show()

def show_full_df(df, num_rows=10):
    """Display DataFrame with full formatting"""
    return df.limit(num_rows).to_pandas().style.set_properties(**{
        'text-align': 'left',
        'white-space': 'pre-wrap'
    }).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

def clean_na_values(df, columns_to_select=None):
    """
    Replace 'NA' string values with None in all string columns of a dataframe
    
    Args:
        df: Snowpark DataFrame to clean
        columns_to_select: Optional list of columns to select in output DataFrame
        
    Returns:
        Snowpark DataFrame with 'NA' values replaced with None
    """
    # Get all string columns
    string_columns = [field.name for field in df.schema.fields 
                     if isinstance(field.datatype, T.StringType)]

    # Create list of column transformations
    column_transformations_list = [
        F.when(F.col(column) == 'NA', None)
         .when((F.col(column) == 'x') & (column == 'FEDTAXID'), None)
         .otherwise(F.col(column))
        for column in string_columns
    ]

    # Apply all transformations at once
    cleaned_df = df.with_columns(string_columns, column_transformations_list)
    
    # Select specified columns if provided
    if columns_to_select:
        cleaned_df = cleaned_df.select(columns_to_select)
        
    return cleaned_df

print("✅ All imports and helper functions loaded successfully")


✅ All imports and helper functions loaded successfully


In [261]:
# 2. SESSION INITIALIZATION
def initialize_session():
    """Initialize Snowflake session with fallback options"""
    try:
        # First check for active session
        session = get_active_session()
        print("🔑 Using existing active Snowflake session")
        return session
    except Exception as e:
        print(f"⚠️  No active session found: {e}")
        try:
            # Try to load local credentials
            with open("/Users/jsoliz/.creds/gpn_connection.json", 'r') as f:
                connection_params = json.load(f)
            session = Session.builder.configs(connection_params).create()
            print("🔑 Local session initialized successfully")
            return session
        except Exception as e2:
            print(f"❌ Session initialization failed: {e2}")
            return None

# Initialize session
session = initialize_session()

if session:
    print(f"📊 Session active: {session.get_current_warehouse()}")
    print(f"🏢 Database: {session.get_current_database()}")
    print(f"📁 Schema: {session.get_current_schema()}")
else:
    print("❌ No session available - please check your connection configuration")


⚠️  No active session found: (1409): More than one active session is detected. When you call function 'udf' or use decorator '@udf', you must specify the 'session' parameter if you created multiple sessions.Alternatively, you can use 'session.udf.register' to register UDFs
Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/3a7077d2-14ae-4001-a736-75e0437e2b89/saml2?SAMLRequest=lZLdbuIwEIVfJfJeJzbmJ9QCKlqEiralqIRld%2B%2FcZACrjs16nIby9GtCkboXrbR31vgc%2B5s5M7g%2BlDp6BYfKmiFpJYxEYHJbKLMdklU2jfskQi9NIbU1MCRvgOR6NEBZ6r0YV35nnuBPBeij8JBB0VwMSeWMsBIVCiNLQOFzsRw%2F3AueMLF31tvcavLB8rVDIoLzgfBiKVAFvJ33e0FpXddJ3U6s21LOGKPsigbVSfLtoj%2BEnj7RtyjrnPRBEeSLd7YbZc4j%2BArr%2BSxCcZdli3jxuMxINL6g3lqDVQluCe5V5bB6uj8DYCDYgXReh6HG9bHX5YwnaGy90fIFclvuKx9eTcKJbqCg2m5VaHw2GZL9iyqOK9zO

### Schemata (Confirm Tables)

In [16]:
# Load the classification tables
naisc_lu_spdf = session.table("sandbox.javier.LU_NAICS")
sic_lu_spdf = session.table("sandbox.javier.LU_SIC")
mcc_lu_spdf = session.table("sandbox.javier.LU_MCC")

# Get record counts
naics_count = naisc_lu_spdf.count()
sic_count = sic_lu_spdf.count()
mcc_count = mcc_lu_spdf.count()

print(f"NAICS Records: {naics_count:,} total")
naisc_lu_spdf.limit(10).show()

print(f"\nSIC Records: {sic_count:,} total") 
sic_lu_spdf.limit(3).show()

print(f"\nMCC Records: {mcc_count:,} total")
show_full_df(mcc_lu_spdf.limit(3))

# Get schema information
print("\nSchema Summary:")
print("---------------")
print("NAICS Columns:", len(naisc_lu_spdf.schema.names))
print("NAICS Schema:")
for field in naisc_lu_spdf.schema.fields:
    print(f"  - {field.name}: {field.datatype}")

print("\nSIC Columns:", len(sic_lu_spdf.schema.names))
print("SIC Schema:")
for field in sic_lu_spdf.schema.fields:
    print(f"  - {field.name}: {field.datatype}")

print("\nMCC Columns:", len(mcc_lu_spdf.schema.names))
print("MCC Schema:")
for field in mcc_lu_spdf.schema.fields:
    print(f"  - {field.name}: {field.datatype}")


NAICS Records: 2,125 total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CODE"  |"TITLE"                                     |"DESCRIPTION"                                       |"REFERENCE_CODE"  |"REFERENCE_DESCRIPTION"                             |"DESCRIPTION_FULL"                                  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|11      |Agriculture, Forestry, Fishing and Hunting  |The Sector as a Whole    The Agriculture, Fores...  |NULL              |NULL                                                |The Sector as a Whole    The Agriculture, Fores...  |
|111     |Crop Production            

In [None]:
# Step 3: AI Classification for Final Selection

print("🤖 Preparing data for AI_CLASSIFY to select best SIC match for each NAICS...")

# First, create arrays of SIC categories for each NAICS
# We'll collect all 3 SIC options as an array of categories
naics_with_sic_categories = naics_sic_cross_join_top3_spdf.select(
    "NAICS_CODE",
    "NAICS_TITLE", 
    "NAICS_DESCRIPTION",
    "SIC_CODE",
    "SIC_TITLE",
    "SIMILARITY_RANK"
).group_by(
    "NAICS_CODE", "NAICS_TITLE", "NAICS_DESCRIPTION"
).agg(
    # Create array of SIC codes as classification categories
    F.array_agg(F.col("SIC_CODE")).alias("SIC_CATEGORIES"),
    # Also keep SIC titles for reference
    F.array_agg(F.col("SIC_TITLE")).alias("SIC_TITLES")
)

print("✅ SIC categories prepared! Sample:")
naics_with_sic_categories.limit(2).show()

print(f"\n🤖 Using AI_CLASSIFY to select best SIC matches for {humanize.intcomma(naics_with_sic_categories.count())} NAICS codes...")

# Create input text combining NAICS title and description for classification
# Use AI_CLASSIFY to classify each NAICS into one of its 3 SIC categories
ai_classifications = naics_with_sic_categories.select(
    "NAICS_CODE",
    "NAICS_TITLE",
    "NAICS_DESCRIPTION",
    "SIC_CATEGORIES",
    "SIC_TITLES",
    
    # Create classification input text
    F.concat(
        F.col("NAICS_TITLE"), F.lit(". "), F.col("NAICS_DESCRIPTION")
    ).alias("CLASSIFICATION_TEXT"),
    
    # Use AI_CLASSIFY to select best SIC match from the 3 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_TITLE"), F.lit(" - "), F.col("NAICS_DESCRIPTION")),
        F.col("SIC_CATEGORIES")
    ).alias("AI_CLASSIFICATION")
)

print("🔍 Sample AI classifications:")
ai_classifications.limit(3).select("NAICS_CODE", "NAICS_TITLE", "SIC_CATEGORIES", "AI_CLASSIFICATION").show()

print("✅ AI classifications complete! Extracting selected SIC codes...")


In [None]:
# Step 3: AI Classification for Final Selection

print("🤖 Preparing data for AI_CLASSIFY to select best SIC match for each NAICS...")

# First, create arrays of SIC categories for each NAICS
# We'll collect all 3 SIC options as an array of categories
naics_with_sic_categories = naics_sic_cross_join_top3_spdf.select(
    "NAICS_CODE",
    "NAICS_TITLE", 
    "NAICS_DESCRIPTION",
    "SIC_CODE",
    "SIC_TITLE",
    "SIMILARITY_RANK"
).group_by(
    "NAICS_CODE", "NAICS_TITLE", "NAICS_DESCRIPTION"
).agg(
    # Create array of SIC codes as classification categories
    F.array_agg(F.col("SIC_CODE")).alias("SIC_CATEGORIES"),
    # Also keep SIC titles for reference
    F.array_agg(F.col("SIC_TITLE")).alias("SIC_TITLES")
)

print("✅ SIC categories prepared! Sample:")
naics_with_sic_categories.limit(2).show()

print(f"\n🤖 Using AI_CLASSIFY to select best SIC matches for {humanize.intcomma(naics_with_sic_categories.count())} NAICS codes...")

# Create input text combining NAICS title and description for classification
# Use AI_CLASSIFY to classify each NAICS into one of its 3 SIC categories
ai_classifications = naics_with_sic_categories.select(
    "NAICS_CODE",
    "NAICS_TITLE",
    "NAICS_DESCRIPTION",
    "SIC_CATEGORIES",
    "SIC_TITLES",
    
    # Create classification input text
    F.concat(
        F.col("NAICS_TITLE"), F.lit(". "), F.col("NAICS_DESCRIPTION")
    ).alias("CLASSIFICATION_TEXT"),
    
    # Use AI_CLASSIFY to select best SIC match from the 3 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_TITLE"), F.lit(". "), F.col("NAICS_DESCRIPTION")),
        F.col("SIC_CATEGORIES")
    ).alias("AI_CLASSIFICATION")
)

print("🔍 Sample AI classifications:")
ai_classifications.limit(3).select("NAICS_CODE", "NAICS_TITLE", "SIC_CATEGORIES", "AI_CLASSIFICATION").show()

print("✅ AI classifications complete! Extracting selected SIC codes...")


In [None]:
# Step 3: AI Classification for Final Selection

print("🤖 Preparing data for AI_CLASSIFY to select best SIC match for each NAICS...")

# First, create arrays of SIC categories for each NAICS
# We'll collect all 3 SIC options as an array of categories
naics_with_sic_categories = naics_sic_cross_join_top3_spdf.select(
    "NAICS_CODE",
    "NAICS_TITLE", 
    "NAICS_DESCRIPTION",
    "SIC_CODE",
    "SIC_TITLE",
    "SIMILARITY_RANK"
).group_by(
    "NAICS_CODE", "NAICS_TITLE", "NAICS_DESCRIPTION"
).agg(
    # Create array of SIC codes as classification categories
    F.array_agg(F.col("SIC_CODE")).alias("SIC_CATEGORIES"),
    # Also keep SIC titles for reference
    F.array_agg(F.col("SIC_TITLE")).alias("SIC_TITLES")
)

print("✅ SIC categories prepared! Sample:")
naics_with_sic_categories.limit(2).show()

print(f"\n🤖 Using AI_CLASSIFY to select best SIC matches for {humanize.intcomma(naics_with_sic_categories.count())} NAICS codes...")

# Create input text combining NAICS title and description for classification
# Use AI_CLASSIFY to classify each NAICS into one of its 3 SIC categories
ai_classifications = naics_with_sic_categories.select(
    "NAICS_CODE",
    "NAICS_TITLE",
    "NAICS_DESCRIPTION",
    "SIC_CATEGORIES",
    "SIC_TITLES",
    
    # Create classification input text
    F.concat(
        F.col("NAICS_TITLE"), F.lit(". "), F.col("NAICS_DESCRIPTION")
    ).alias("CLASSIFICATION_TEXT"),
    
    # Use AI_CLASSIFY to select best SIC match from the 3 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_TITLE"), F.lit(". "), F.col("NAICS_DESCRIPTION")),
        F.col("SIC_CATEGORIES")
    ).alias("AI_CLASSIFICATION")
)

print("🔍 Sample AI classifications:")
ai_classifications.limit(3).select("NAICS_CODE", "NAICS_TITLE", "SIC_CATEGORIES", "AI_CLASSIFICATION").show()

print("✅ AI classifications complete! Extracting selected SIC codes...")


# 🏷️ NAICS → SIC Mapping


## Objective
Create systematic mappings between NAICS and SIC classification systems using Snowflake's AI functions.

## Step-by-Step Approach

### Step 1: Cross Join and AI Similarity Calculation
- **Goal**: Cross join NAICS (left) with SIC (right) tables
- **AI Function**: Use `AI_SIMILARITY()` to compare names and descriptions between all NAICS-SIC pairs
- **Output**: Every NAICS record paired with every SIC record, including similarity scores

### Step 2: Ranking and Top 3 Selection  
- **Goal**: Use ranking function to keep top 3 SIC matches per NAICS code
- **Method**: `ROW_NUMBER()` or `RANK()` partitioned by NAICS, ordered by similarity score DESC
- **Output**: Maximum 3 SIC candidates per NAICS record

### Step 3: AI Classification for Final Selection
- **Goal**: Use `AI_CLASSIFY()` to pick the best match among the top 3 candidates
- **Method**: Let AI choose the most appropriate SIC match for each NAICS
- **Output**: Final `naisc_sic_lu_spdf` with one-to-one NAICS→SIC mapping

---

## 🚀 Step 1: Cross Join and AI Similarity Calculation


In [35]:
# Step 1: Cross Join NAICS and SIC with AI Similarity Calculation

print("\n🔍 First, let's examine the table structures:")
print("\nNAICS Table Columns:")
print(naisc_lu_spdf.columns)
print("\nSIC Table Columns:")
print(sic_lu_spdf.columns)

print(f"\n📊 Table Sizes:")
print(f"NAICS Records: {humanize.intcomma(naisc_lu_spdf.count())}")
print(f"SIC Records: {humanize.intcomma(sic_lu_spdf.count())}")
print(f"Cross Join Size: {humanize.intcomma(naisc_lu_spdf.count() * sic_lu_spdf.count())} total combinations")

# Create cross join with AI similarity calculation
print("\n🤖 Creating cross join with AI_SIMILARITY scores...")

import time
start_time = time.time()

# Time the cross join creation
naics_sic_cross_join = (
    naisc_lu_spdf.alias("naics").cross_join(
      sic_lu_spdf.alias("sic")
    )
)

# Time the column selection and AI similarity calculation
naics_sic_cross_join = (
    naics_sic_cross_join.select(
    # NAICS fields
    F.col("CODE").alias("NAICS_CODE"),
    F.concat(
        F.col("TITLE"),
        F.lit(" - "),
        F.col("DESCRIPTION_FULL")
    ).alias("NAICS_TEXT"),
    
    # SIC fields  
    F.col("SIC_INDUSTRY_CODE").alias("SIC_CODE"),
    F.concat(
        F.col("SIC_INDUSTRY_DESCRIPTION"),
        F.lit(" - "),
        F.col("SIC_MAJOR_GROUP_DESCRIPTION"),
        F.lit(" - "),
        F.col("SIC_DIVISION_DESCRIPTION")
    ).alias("SIC_TEXT"),
    
    # AI Similarity score
    F.call_function("AI_SIMILARITY",
        F.col("NAICS_TEXT"),
        F.col("SIC_TEXT")
    ).alias("SIMILARITY_SCORE")
)
)

# Write results to Snowflake table
print("\n💾 Writing results to Snowflake table...")
naics_sic_cross_join.write.mode("overwrite").saveAsTable("sandbox.javier.naics_sic_cross_join")

print("\n✅ Cross join with AI similarity created and saved successfully!")

end_time = time.time()
print(f"⏱️ Total execution time: {round(end_time - start_time, 2)} seconds")




🔍 First, let's examine the table structures:

NAICS Table Columns:
['CODE', 'TITLE', 'DESCRIPTION', 'REFERENCE_CODE', 'REFERENCE_DESCRIPTION', 'DESCRIPTION_FULL']

SIC Table Columns:
['SIC_INDUSTRY_CODE', 'SIC_INDUSTRY_DESCRIPTION', 'SIC_MAJOR_GROUP_DESCRIPTION', 'SIC_DIVISION_DESCRIPTION']

📊 Table Sizes:
NAICS Records: 2,125
SIC Records: 1,005
Cross Join Size: 2,135,625 total combinations

🤖 Creating cross join with AI_SIMILARITY scores...

💾 Writing results to Snowflake table...

✅ Cross join with AI similarity created and saved successfully!
⏱️ Total execution time: 366.63 seconds


## 🏆 Step 2: Ranking and Top 3 Selection

**Goal**: Use window functions to rank SIC matches by similarity score and keep only the top 3 candidates per NAICS code.

**Method**: 
- Load existing cross join table: `sandbox.javier.naics_sic_cross_join`
- Apply `ROW_NUMBER()` partitioned by `NAICS_CODE`, ordered by similarity score DESC
- Filter to keep only ranks 1, 2, and 3 (top 3 matches per NAICS)

**Output**: Reduced dataset with maximum 3 SIC candidates per NAICS record, ready for final AI classification.


In [None]:
# Load Step 2 Results - Top 3 SIC Matches per NAICS

print("📂 Loading top 3 results table...")
naics_sic_cross_join_top3_spdf = session.table("sandbox.javier.naics_sic_cross_join_top3")

print("🔍 Examining top 3 results table:")
print("Columns:", naics_sic_cross_join_top3_spdf.columns)
print(f"Total records: {humanize.intcomma(naics_sic_cross_join_top3_spdf.count())}")

print(f"\n📊 Records per NAICS (should be max 3 per NAICS code):")
records_per_naics = naics_sic_cross_join_top3_spdf.group_by("NAICS_CODE").count().select(F.col("COUNT").alias("RECORDS_PER_NAICS"))
records_per_naics.group_by("RECORDS_PER_NAICS").count().order_by("RECORDS_PER_NAICS").show()

print(f"\n🏆 Sample of top 3 matches for a few NAICS codes:")
sample_naics = naics_sic_cross_join_top3_spdf.select("NAICS_CODE").distinct().limit(2)
for row in sample_naics.collect():
    naics_code = row['NAICS_CODE']
    print(f"\n📋 NAICS {naics_code} - Top 3 SIC matches:")
    naics_sic_cross_join_top3_spdf.filter(F.col("NAICS_CODE") == naics_code).select(
        "NAICS_CODE", "NAICS_TITLE", "SIC_CODE", "SIC_TITLE", "SIMILARITY_SCORE", "SIMILARITY_RANK"
    ).order_by("SIMILARITY_RANK").show()

print(f"\n✅ Step 2 results loaded and verified!")
print(f"📈 Ready for Step 3: AI_CLASSIFY to select final best match from top 3 candidates")


In [40]:
# Step 2: Load Cross Join Table and Apply Ranking

print("📂 Loading existing cross join table...")
naics_sic_cross_join = session.table("sandbox.javier.naics_sic_cross_join")

print("🔍 Examining cross join table structure:")
print("Columns:", naics_sic_cross_join.columns)
print(f"Total records: {humanize.intcomma(naics_sic_cross_join.count())}")

print("\n📋 Sample of cross join data (top and bottom 5 matches by similarity score):")
show_full_df(
    naics_sic_cross_join.orderBy(F.col("SIMILARITY_SCORE").desc()).limit(3)
    .unionAll(
        naics_sic_cross_join.orderBy(F.col("SIMILARITY_SCORE").asc()).limit(3)
    )
)


📂 Loading existing cross join table...
🔍 Examining cross join table structure:
Columns: ['NAICS_CODE', 'NAICS_TEXT', 'SIC_CODE', 'SIC_TEXT', 'SIMILARITY_SCORE']
Total records: 2,135,625

📋 Sample of cross join data (top and bottom 5 matches by similarity score):


Unnamed: 0,NAICS_CODE,NAICS_TEXT,SIC_CODE,SIC_TEXT,SIMILARITY_SCORE
0,483111,Deep Sea Freight Transportation - This U.S. industry comprises establishments primarily engaged in providing deep sea transportation of cargo to or from foreign ports. Cross-References.,4412,DEEP SEA FOREIGN TRANSPORTAION OF FREIGHT - WATER TRANSPORTATION - TRANSPORTATION,0.787876
1,926,Administration of Economic Programs - The Administration of Economic Programs subsector groups government establishments primarily engaged in the administration of economic programs.,9611,ADMINISTRATION OF GENERAL ECONOMIC PROGRAMS - ADMINISTRATION OF ECONOMIC PROGRAMS - PUBLIC ADMINISTRATION,0.781339
2,622110,"General Medical and Surgical Hospitals - This industry comprises establishments known and licensed as general medical and surgical hospitals primarily engaged in providing diagnostic and medical treatment (both surgical and nonsurgical) to inpatients with any of a wide variety of medical conditions. These establishments maintain inpatient beds and provide patients with food services that meet their nutritional requirements. These hospitals have an organized staff of physicians and other medical staff to provide patient care services. These establishments usually provide other services, such as outpatient services, anatomical pathology services, diagnostic X-ray services, clinical laboratory services, operating room services for a variety of procedures, and pharmacy services.",8062,GENERAL MEDICAL AND SURGICAL HOSPITALS - HEALTH SERVICES - SERVICES,0.778612
3,327,"Nonmetallic Mineral Product Manufacturing - The Nonmetallic Mineral Product Manufacturing subsector is based on the transformation of mined or quarried nonmetallic minerals, such as sand, gravel, stone, clay, and refractory materials, into products for intermediate or final consumption. Processes used include grinding, mixing, cutting, shaping, and honing. Heat often is used in the process and chemicals are frequently mixed to change the composition, purity, and chemical properties for the intended product. For example, glass is produced by heating silica sand to the melting point (sometimes combined with cullet or recycled glass) and then drawn, floated, or blow molded to the desired shape or thickness. Refractory materials are heated and then formed into bricks or other shapes for use in industrial applications. The Nonmetallic Mineral Product Manufacturing subsector includes establishments that manufacture bricks, refractories, ceramic products, and glass and glass products, such as plate glass and containers. Also included are cement and concrete products, lime, gypsum, and other nonmetallic mineral products including abrasive products, ceramic plumbing fixtures, statuary, cut stone products, and mineral wool. The products are used in a wide range of activities from construction and heavy and light manufacturing to articles for personal use. Mining, beneficiating, and manufacturing activities often occur in a single location. Separate receipts will be collected for these activities whenever possible. When receipts cannot be broken out between mining and manufacturing, establishments that mine or quarry nonmetallic minerals, beneficiate the nonmetallic minerals, and further process the nonmetallic minerals into a more finished manufactured product are classified based on the primary activity of the establishment. A mine that manufactures a small amount of finished products is classified in Sector 21, Mining, Quarrying, and Oil and Gas Extraction. An establishment that mines whose primary output is a more finished manufactured product is classified in the Manufacturing sector. Excluded from the Nonmetallic Mineral Product Manufacturing subsector are establishments that primarily beneficiate mined nonmetallic minerals. Beneficiation is the process whereby the extracted material is reduced to particles that can be separated into mineral and waste, the former suitable for further processing or direct use. Beneficiation establishments are included in Sector 21, Mining, Quarrying, and Oil and Gas Extraction.",6311,"LIFE INSURANCE - INSURANCE CARRIERS - FINANCE, INSURANCE, & REAL ESTATE",-0.010281
4,327,"Nonmetallic Mineral Product Manufacturing - The Nonmetallic Mineral Product Manufacturing subsector is based on the transformation of mined or quarried nonmetallic minerals, such as sand, gravel, stone, clay, and refractory materials, into products for intermediate or final consumption. Processes used include grinding, mixing, cutting, shaping, and honing. Heat often is used in the process and chemicals are frequently mixed to change the composition, purity, and chemical properties for the intended product. For example, glass is produced by heating silica sand to the melting point (sometimes combined with cullet or recycled glass) and then drawn, floated, or blow molded to the desired shape or thickness. Refractory materials are heated and then formed into bricks or other shapes for use in industrial applications. The Nonmetallic Mineral Product Manufacturing subsector includes establishments that manufacture bricks, refractories, ceramic products, and glass and glass products, such as plate glass and containers. Also included are cement and concrete products, lime, gypsum, and other nonmetallic mineral products including abrasive products, ceramic plumbing fixtures, statuary, cut stone products, and mineral wool. The products are used in a wide range of activities from construction and heavy and light manufacturing to articles for personal use. Mining, beneficiating, and manufacturing activities often occur in a single location. Separate receipts will be collected for these activities whenever possible. When receipts cannot be broken out between mining and manufacturing, establishments that mine or quarry nonmetallic minerals, beneficiate the nonmetallic minerals, and further process the nonmetallic minerals into a more finished manufactured product are classified based on the primary activity of the establishment. A mine that manufactures a small amount of finished products is classified in Sector 21, Mining, Quarrying, and Oil and Gas Extraction. An establishment that mines whose primary output is a more finished manufactured product is classified in the Manufacturing sector. Excluded from the Nonmetallic Mineral Product Manufacturing subsector are establishments that primarily beneficiate mined nonmetallic minerals. Beneficiation is the process whereby the extracted material is reduced to particles that can be separated into mineral and waste, the former suitable for further processing or direct use. Beneficiation establishments are included in Sector 21, Mining, Quarrying, and Oil and Gas Extraction.",6351,"SURETY INSURANCE - INSURANCE CARRIERS - FINANCE, INSURANCE, & REAL ESTATE",0.003763
5,335931,"Current-Carrying Wiring Device Manufacturing - This U.S. industry comprises establishments primarily engaged in manufacturing current-carrying wiring devices. Illustrative Examples: Bus bars, electrical conductors (except switchgear-type), manufacturing GFCI (ground fault circuit interrupters) manufacturing Lamp holders manufacturing Lightning arrestors and coils manufacturing Receptacles (i.e., outlets), electrical, manufacturing Switches for electrical wiring (e.g., pressure, pushbutton, snap, tumbler) manufacturing Cross-References. Establishments primarily engaged in--",9431,ADMINISTRATION OF PUBLIC HEALTH PROGRAMS - ADMINISTRATION-HUMAN RESOURCE PROGRAMS - PUBLIC ADMINISTRATION,0.00492


In [45]:
print("\n🏆 Applying ranking to get top 3 SIC matches per NAICS...")

# Apply ROW_NUMBER() window function to rank SIC matches by similarity score
# Assuming we have similarity score columns - let's check which one to use for ranking
top_3_matches = naics_sic_cross_join.select(
    "*",
    F.row_number().over(
        W.Window.partition_by("NAICS_CODE").orderBy(F.col("SIMILARITY_SCORE").desc())
    ).alias("SIMILARITY_RANK")
).filter(
    F.col("SIMILARITY_RANK") <= 3
)

print("✅ Ranking applied! Keeping top 3 SIC matches per NAICS code.")
print(f"📊 Reduced from {humanize.intcomma(naics_sic_cross_join.count())} to {humanize.intcomma(top_3_matches.count())} records")

print("\n🔍 Sample of ranked results (showing top matches for first few NAICS codes):")
top_3_matches.filter(F.col("SIMILARITY_RANK") == 1).limit(5).show()

print("\n📈 Distribution of similarity ranks:")
top_3_matches.group_by("SIMILARITY_RANK").count().order_by("SIMILARITY_RANK").show()

# Write results to temporary table for Step 3
top_3_matches.write.mode("overwrite").save_as_table("sandbox.javier.naics_sic_cross_join_top3")
naics_sic_cross_join_top3_spdf = session.table("sandbox.javier.naics_sic_cross_join_top3")
print("\n💾 Results saved to table for Step 3!")



🏆 Applying ranking to get top 3 SIC matches per NAICS...
✅ Ranking applied! Keeping top 3 SIC matches per NAICS code.
📊 Reduced from 2,135,625 to 6,375 records

🔍 Sample of ranked results (showing top matches for first few NAICS codes):
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"SIC_CODE"  |"SIC_TEXT"                                          |"SIMILARITY_SCORE"  |"SIMILARITY_RANK"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|62321         |Residential Intellectual and Developmental Disa...  |8361        |RESIDENTIAL CARE - SOCIAL SERVICES - SERVICES       |0.5289557155384758  |1                  |
|523999        |Miscellaneous Financial Investment Act

In [None]:
print("📂 Loading top 3 results table...")
naics_sic_cross_join_top3_spdf = session.table("sandbox.javier.naics_sic_cross_join_top3")

print("🔍 Examining top 3 results table:")
print("Columns:", naics_sic_cross_join_top3_spdf.columns)
print(f"Total records: {humanize.intcomma(naics_sic_cross_join_top3_spdf.count())}")

print(f"\n📊 Records per NAICS (should be max 3 per NAICS code):")
records_per_naics = naics_sic_cross_join_top3_spdf.group_by("NAICS_CODE").count().select(F.col("COUNT").alias("RECORDS_PER_NAICS"))
records_per_naics.group_by("RECORDS_PER_NAICS").count().order_by("RECORDS_PER_NAICS").show()

print(f"\n🏆 Sample of top 3 matches for a few NAICS codes:")
sample_naics = naics_sic_cross_join_top3_spdf.select("NAICS_CODE").distinct().limit(2)
for row in sample_naics.collect():
    naics_code = row['NAICS_CODE']
    print(f"\n📋 NAICS {naics_code} - Top 3 SIC matches:")
    naics_sic_cross_join_top3_spdf.filter(F.col("NAICS_CODE") == naics_code).select(
        "NAICS_TEXT", "SIC_TEXT", "SIMILARITY_SCORE"
    ).order_by(F.col("SIMILARITY_SCORE").desc()).show(max_width=1000)

print(f"\n✅ Step 2 results loaded and verified!")
print(f"📈 Ready for Step 3: AI_CLASSIFY to select final best match from top 3 candidates")

📂 Loading top 3 results table...
🔍 Examining top 3 results table:
Columns: ['NAICS_CODE', 'NAICS_TEXT', 'SIC_CODE', 'SIC_TEXT', 'SIMILARITY_SCORE', 'SIMILARITY_RANK']
Total records: 6,375

📊 Records per NAICS (should be max 3 per NAICS code):
---------------------------------
|"RECORDS_PER_NAICS"  |"COUNT"  |
---------------------------------
|3                    |2125     |
---------------------------------


🏆 Sample of top 3 matches for a few NAICS codes:

📋 NAICS 533110 - Top 3 SIC matches:
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_TEXT"                                   

In [59]:
# Step 3: AI Classification for Final Selection using AI_CLASSIFY

print("📂 Loading top 3 results table...")
naics_sic_cross_join_top3_spdf = session.table("sandbox.javier.naics_sic_cross_join_top3")

naics_sic_cross_join_top3_spdf.print_schema()

📂 Loading top 3 results table...
root
 |-- "NAICS_CODE": StringType(16777216) (nullable = True)
 |-- "NAICS_TEXT": StringType(33554435) (nullable = True)
 |-- "SIC_CODE": StringType(15) (nullable = True)
 |-- "SIC_TEXT": StringType(756) (nullable = True)
 |-- "SIMILARITY_SCORE": DoubleType() (nullable = True)
 |-- "SIMILARITY_RANK": LongType() (nullable = False)


In [131]:
print("🤖 Preparing data for AI_CLASSIFY to select best SIC match for each NAICS...")

# Group the top 3 SIC codes per NAICS into arrays for AI_CLASSIFY
naics_with_sic_categories = naics_sic_cross_join_top3_spdf.select(
    "NAICS_CODE",
    "NAICS_TEXT", 
    "SIC_CODE",
    "SIC_TEXT",
    "SIMILARITY_SCORE",
    "SIMILARITY_RANK"
).group_by(
    "NAICS_CODE", "NAICS_TEXT"
).agg(
    # Create array of SIC codes as classification categories
    F.array_agg(F.expr("SIC_CODE")).within_group("SIMILARITY_RANK").alias("SIC_CODES"),
    # Also keep SIC titles for reference
    F.array_agg(F.expr("SIC_TEXT")).within_group("SIMILARITY_RANK").alias("SIC_TEXTS"),
    F.array_agg(F.expr("SIMILARITY_RANK")).within_group("SIMILARITY_RANK").alias("SIMILARITY_RANKS")

)

print("✅ SIC categories prepared! Sample:")
naics_with_sic_categories.limit(2).show()

🤖 Preparing data for AI_CLASSIFY to select best SIC match for each NAICS...
✅ SIC categories prepared! Sample:
-------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"SIC_CODES"  |"SIC_TEXTS"                                         |"SIMILARITY_RANKS"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
|458320        |Luggage and Leather Goods Retailers - This indu...  |[            |[                                                   |[                   |
|              |                                                    |  "5948",    |  "LUGGAGE & LEATHER GOODS STORES - MISCELLANEO...  |  1,                |
|              |                                                    |  "3161",    |  "LUGGAGE - LEA

In [132]:
print(f"\n🤖 Using AI_CLASSIFY to select best SIC matches for {humanize.intcomma(naics_with_sic_categories.count())} NAICS codes...")

# Use AI_CLASSIFY to classify each NAICS into one of its 3 SIC categories
naics_sic_classified = naics_with_sic_categories.select(
    "NAICS_CODE",
    "NAICS_TEXT",
    "SIC_CODES",
    "SIC_TEXTS",
   
    # Use AI_CLASSIFY to select best SIC match from the 3 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_CODE"), F.lit(". "), F.col("NAICS_TEXT")),    # input
        F.col("SIC_TEXTS"),                                                 # list_of_categories
    ).alias("AI_CLASSIFIED_SIC_TEXT")
)

print("🔍 Sample AI classifications:")
naics_sic_classified.selectExpr("""NAICS_CODE
                                , NAICS_TEXT
                                , SIC_TEXTS
                                , AI_CLASSIFIED_SIC_TEXT:labels[0]::string                   as AI_CLASSIFIED_SIC_TEXT_STR
                                , COALESCE(AI_CLASSIFIED_SIC_TEXT_STR, SIC_TEXTS[0]::string) as MOST_LIKELY_SIC_TEXT
                                , AI_CLASSIFIED_SIC_TEXT_STR is not null as CLASSIFY_HAD_RETURN 
                                ,CLASSIFY_HAD_RETURN AND  AI_CLASSIFIED_SIC_TEXT_STR = SIC_TEXTS[0]::string as CLASSIFY_RETURNED_MOST_SIMILAR_SIC
                              """
).where((~F.col("CLASSIFY_RETURNED_MOST_SIMILAR_SIC")) & (F.col("CLASSIFY_HAD_RETURN"))).limit(2).show(max_width=1000)

print("✅ AI classifications complete! Ready for extraction...")



🤖 Using AI_CLASSIFY to select best SIC matches for 2,125 NAICS codes...
🔍 Sample AI classifications:
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                           

In [133]:
import time
start_time = time.time()

print(f"\n🤖 Using AI_CLASSIFY to select best SIC matches for {humanize.intcomma(naics_with_sic_categories.count())} NAICS codes...")

# Use AI_CLASSIFY to classify each NAICS into one of its 3 SIC categories
naics_sic_classified = naics_with_sic_categories.select(
    "NAICS_CODE",
    "NAICS_TEXT", 
    "SIC_CODES",
    "SIC_TEXTS",
   
    # Use AI_CLASSIFY to select best SIC match from the 3 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_CODE"), F.lit(". "), F.col("NAICS_TEXT")),    # input
        F.col("SIC_TEXTS"),                                                 # list_of_categories
    ).alias("AI_CLASSIFIED_SIC_TEXT")
)

naics_sic_classified.selectExpr("""NAICS_CODE
                                , NAICS_TEXT
                                , SIC_TEXTS
                                , AI_CLASSIFIED_SIC_TEXT:labels[0]::string                   as AI_CLASSIFIED_SIC_TEXT_STR
                                , COALESCE(AI_CLASSIFIED_SIC_TEXT_STR, SIC_TEXTS[0]::string) as MOST_LIKELY_SIC_TEXT
                                , AI_CLASSIFIED_SIC_TEXT_STR is not null as CLASSIFY_HAD_RETURN 
                                ,CLASSIFY_HAD_RETURN AND  AI_CLASSIFIED_SIC_TEXT_STR = SIC_TEXTS[0]::string as CLASSIFY_RETURNED_MOST_SIMILAR_SIC
                              """
).write.mode("overwrite").save_as_table("naics_sic_classified")

end_time = time.time()
duration = end_time - start_time

print(f"✅ AI classifications complete and saved to table 'naics_sic_classified'! (took {duration:.1f} seconds)")



🤖 Using AI_CLASSIFY to select best SIC matches for 2,125 NAICS codes...
✅ AI classifications complete and saved to table 'naics_sic_classified'! (took 13.7 seconds)


In [144]:
naics_sic_classified = session.table("naics_sic_classified")
naics_sic_classified.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"SIC_TEXTS"                                         |"AI_CLASSIFIED_SIC_TEXT_STR"                        |"MOST_LIKELY_SIC_TEXT"                              |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_SIC"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|52221         |Credit Card Issuing - This industry comprises e...  |[                                                   |NU

In [109]:
sic_code_to_sic_text = naics_sic_cross_join.select(
    F.col("SIC_CODE"),
    F.col("SIC_TEXT")
).distinct().orderBy("SIC_CODE")

sic_code_to_sic_text.write.mode("overwrite").save_as_table("sandbox.javier.sic_code_to_sic_text")
sic_code_to_sic_text = session.table("sandbox.javier.sic_code_to_sic_text")
sic_code_to_sic_text.show()

-------------------------------------------------------------------
|"SIC_CODE"  |"SIC_TEXT"                                          |
-------------------------------------------------------------------
|3648        |LIGHTING EQUIPMENT, NOT ELSEWHERE CLASSIFIED - ...  |
|3651        |HOUSEHOLD AUDIO & VIDEO EQUIPMENT - ELECTRONIC ...  |
|3652        |PRE-RECORDED RECORDS & TAPES - ELECTRONIC & OTH...  |
|3661        |TELEPHONE & TELEGRAPH APPARATUS - ELECTRONIC & ...  |
|3663        |RADIO & TV COMMUNICATIONS EQUIPMENT - ELECTRONI...  |
|3669        |COMMUNICATIONS EQUIPMENT, NOT ELSEWHERE CLASSIF...  |
|3671        |ELECTRON TUBES - ELECTRONIC & OTHER ELECTRICAL ...  |
|3672        |PRINTED CIRCUIT BOARDS - ELECTRONIC & OTHER ELE...  |
|3674        |SEMI-CONDUCTORS & RELATED DEVICES - ELECTRONIC ...  |
|3675        |ELECTRONIC CAPACITORS - ELECTRONIC & OTHER ELEC...  |
-------------------------------------------------------------------



TODO:  TRIM ALL FIELDS BEFORE RUNNING ANYTHING!

In [None]:
sic_code_to_sic_text = sic_code_to_sic_text.withColumn("SIC_TEXT", F.trim(F.col("SIC_TEXT"))) ############# REMOVE THIS after TODO: TRIM ALL FIELDS BEFORE RUNNING ANYTHING!

In [145]:
# Join back SIC codes by matching on SIC text
naics_sic_classified_with_sic_codes = naics_sic_classified.join(
    sic_code_to_sic_text,
    naics_sic_classified["MOST_LIKELY_SIC_TEXT"] == sic_code_to_sic_text["SIC_TEXT"],
    "left"
).select(
    naics_sic_classified["*"],
    sic_code_to_sic_text["SIC_CODE"].alias("MOST_LIKELY_SIC_CODE")
)

naics_sic_classified_with_sic_codes.write.mode("overwrite").save_as_table("naics_sic_classified_with_sic_codes")
naics_sic_classified_with_sic_codes.show()



------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"SIC_TEXTS"                                         |"AI_CLASSIFIED_SIC_TEXT_STR"  |"MOST_LIKELY_SIC_TEXT"                              |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_SIC"  |"MOST_LIKELY_SIC_CODE"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|336120        |Heavy Duty Truck Manufacturing - This industry ...  |[                                             

In [146]:
naics_sic_classified_with_sic_codes.select(
    "NAICS_CODE",
    "NAICS_TEXT",
    "MOST_LIKELY_SIC_CODE",
    "MOST_LIKELY_SIC_TEXT",
    "CLASSIFY_HAD_RETURN",
    "CLASSIFY_RETURNED_MOST_SIMILAR_SIC"
).show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MOST_LIKELY_SIC_CODE"  |"MOST_LIKELY_SIC_TEXT"                              |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_SIC"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|52221         |Credit Card Issuing - This industry comprises e...  |7323                    |CREDIT REPORTING SERVICES - BUSINESS SERVICES -...  |False                  |False                                 |
|11293         |Fur-Bearing Animal and Rabbit Production - This...  |0271                    |FUR-BEARING ANIMALS & RABBITS - AGRICULTURAL PR...  |True     

In [152]:
code_map_naics_sic = naics_sic_classified_with_sic_codes.select(
    "NAICS_CODE",
    "MOST_LIKELY_SIC_CODE"
).distinct().orderBy("NAICS_CODE")

code_map_naics_sic.write.mode("overwrite").save_as_table("sandbox.javier.code_map_naics_sic")
code_map_naics_sic = session.table("sandbox.javier.code_map_naics_sic")
code_map_naics_sic.orderBy("NAICS_CODE").show()

-----------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_SIC_CODE"  |
-----------------------------------------
|11            |0921                    |
|111           |0191                    |
|1111          |0191                    |
|11111         |0116                    |
|111110        |0116                    |
|11112         |0116                    |
|111120        |0116                    |
|11113         |0161                    |
|111130        |0161                    |
|11114         |0111                    |
-----------------------------------------



In [148]:
# Count distinct NAICS codes in our classified results
naics_count_classified = naics_sic_classified_with_sic_codes.select("NAICS_CODE").distinct().count()

# Count distinct NAICS codes in original lookup table
naics_count_original = naisc_lu_spdf.select("CODE").distinct().count()

print(f"Number of NAICS codes in classified results: {naics_count_classified}")
print(f"Number of NAICS codes in original lookup: {naics_count_original}")
print(f"Difference: {naics_count_original - naics_count_classified}")


Number of NAICS codes in classified results: 2125
Number of NAICS codes in original lookup: 2125
Difference: 0


# 🏷️ NAICS → MCC Mapping



## Step 1: Cross Join and AI Similarity Calculation

Creating NAICS-MCC cross join with AI similarity scores using the correct schemas.

In [251]:
# First, let's examine the table structures
print("\n🔍 First, let's examine the table structures:")
print("\nNAICS Table Columns:")
print(naisc_lu_spdf.columns)
print("\nMCC Table Columns:")
print(mcc_lu_spdf.columns)

print(f"\n📊 Table Sizes:")
print(f"NAICS Records: {humanize.intcomma(naisc_lu_spdf.count())}")
print(f"MCC Records: {humanize.intcomma(mcc_lu_spdf.count())}")
print(f"Cross Join Size: {humanize.intcomma(naisc_lu_spdf.count() * mcc_lu_spdf.count())} total combinations")

# Create cross join with AI similarity calculation
print("\n🤖 Creating cross join with AI_SIMILARITY scores...")

import time
start_time = time.time()

# Time the cross join creation
naics_mcc_cross_join = (
    naisc_lu_spdf.alias("naics").cross_join(
        mcc_lu_spdf.alias("mcc")
    )
)

# Time the column selection and AI similarity calculation
naics_mcc_cross_join = (
    naics_mcc_cross_join.select(
        # NAICS fields
        F.col("CODE").alias("NAICS_CODE"),
        F.concat(
            F.col("TITLE"),
            F.lit(" - "),
            F.col("DESCRIPTION_FULL")
        ).alias("NAICS_TEXT"),

        # MCC fields
        F.col("MCC").alias("MCC_CODE"),
        F.concat(
            F.col("MCC_DESCRIPTIVE_TITLE"),
            F.lit(" - "),
            F.col("INCLUDED_IN_THIS_MCC")
        ).alias("MCC_TEXT"),

        # AI Similarity score
        F.call_function("AI_SIMILARITY",
            F.col("NAICS_TEXT"),
            F.col("MCC_TEXT")
        ).alias("SIMILARITY_SCORE")
    )
)

# Write results to Snowflake table
print("\n💾 Writing results to Snowflake table...")
naics_mcc_cross_join.write.mode("overwrite").saveAsTable("sandbox.javier.naics_mcc_cross_join")

print("\n✅ Cross join with AI similarity created and saved successfully!")

end_time = time.time()
print(f"⏱️ Total execution time: {round(end_time - start_time, 2)} seconds")


🔍 First, let's examine the table structures:

NAICS Table Columns:
['CODE', 'TITLE', 'DESCRIPTION', 'REFERENCE_CODE', 'REFERENCE_DESCRIPTION', 'DESCRIPTION_FULL']

MCC Table Columns:
['MCC', 'MCC_DESCRIPTIVE_TITLE', 'INCLUDED_IN_THIS_MCC', 'SIMILAR_MCC_CODES', 'SIMILAR_MCC_CODE_READABLE', 'MCC_ARRAY', 'MCC_ARR_MAPS']

📊 Table Sizes:
Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/3a7077d2-14ae-4001-a736-75e0437e2b89/saml2?SAMLRequest=lVLBbuIwFPyVyHtO7JiEUAuoWBAqUrtFQFfq3kzyAKuOnbWdBvj6OgGk7qGV9mbZM555M294fyxl8A7GCq1GKI4ICkDluhBqP0Ivm3k4QIF1XBVcagUjdAKL7sdDy0tZsUntDmoFf2uwLvAfKcvahxGqjWKaW2GZ4iVY5nK2njw9MhoRxq0F47wculIKK7zWwbmKYdw0TdT0Im32mBJCMLnDHtVCfqBPEtX3GpXRTuda3ihHP9MXEjEmSSvhEV5heSX%2BFOoSwXcq2wvIsofNZhkun9cbFExu0021snUJZg3mXeTwsnq8GLDewQG4cdKHGjbn

## Step 2: Rank and Keep Top 5 MCC Matches per NAICS

Apply ROW_NUMBER() ranking to keep the top 5 MCC matches for each NAICS code.

In [267]:
# Step 2: Load cross join and apply ranking to keep top 5 MCC matches per NAICS
print("📂 Loading cross join results...")
naics_mcc_cross_join_spdf = session.table("sandbox.javier.naics_mcc_cross_join")

print("🏆 Applying ranking to keep top 5 MCC matches per NAICS...")

# Create window spec for ranking by similarity score within each NAICS
window_spec = W.Window.partition_by("NAICS_CODE").order_by(F.col("SIMILARITY_SCORE").desc())

naics_mcc_cross_join_top5 = naics_mcc_cross_join_spdf.select(
    "*",
    F.row_number().over(window_spec).alias("SIMILARITY_RANK")
).filter(
    F.col("SIMILARITY_RANK") <= 5
)

print("✅ Top 5 MCC matches per NAICS selected!")
print(f"📊 Reduced from {humanize.intcomma(naics_mcc_cross_join_spdf.count())} to {humanize.intcomma(naics_mcc_cross_join_top5.count())} records")

naics_mcc_cross_join_top5.show()

print(f"\n💾 Writing to sandbox.javier.naics_mcc_cross_join_top5...")
naics_mcc_cross_join_top5.write.mode("overwrite").save_as_table("sandbox.javier.naics_mcc_cross_join_top5")
print("✅ Top 5 matches table saved!")

📂 Loading cross join results...
🏆 Applying ranking to keep top 5 MCC matches per NAICS...
✅ Top 5 MCC matches per NAICS selected!
📊 Reduced from 1,874,250 to 10,625 records
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_CODE"  |"MCC_TEXT"                                          |"SIMILARITY_SCORE"   |"SIMILARITY_RANK"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|62321         |Residential Intellectual and Developmental Disa...  |8050        |Nursing, Home Healthcare and Personal Care Faci...  |0.4752067384182132   |1                  |
|62321         |Residential Intellectual and Developmental Disa...  |8062        |Hospitals - Merchants classified 

## Step 3: AI_CLASSIFY Final Selection

Use AI_CLASSIFY to intelligently select the single best MCC match from the top 3 candidates for each NAICS.

In [402]:
# Step 3: AI Classification for Final MCC Selection
print("📂 Loading top 5 results table...")
naics_mcc_cross_join_top5_spdf = session.table("sandbox.javier.naics_mcc_cross_join_top5")

print("🤖 Preparing data for AI_CLASSIFY to select best MCC match for each NAICS...")

# Group the top 5 MCC codes per NAICS into arrays for AI_CLASSIFY
naics_with_mcc_categories = naics_mcc_cross_join_top5_spdf.select(
    "NAICS_CODE",
    "NAICS_TEXT", 
    "MCC_CODE",
    "MCC_TEXT",
    "SIMILARITY_RANK",
    "SIMILARITY_SCORE"
).group_by(
    "NAICS_CODE", "NAICS_TEXT"
).agg(
    # Create array of MCC codes as classification categories - convert to string
    F.array_agg(F.col("MCC_CODE").cast("string")).within_group("SIMILARITY_RANK").alias("MCC_CATEGORIES"),
    # Also keep MCC titles for reference
    F.array_agg(F.col("MCC_TEXT")).within_group("SIMILARITY_RANK").alias("MCC_TEXTS"),
    F.array_agg(F.expr("SIMILARITY_RANK")).within_group("SIMILARITY_RANK").alias("SIMILARITY_RANKS"),   
    F.array_agg(F.expr("SIMILARITY_SCORE")).within_group("SIMILARITY_RANK").alias("SIMILARITY_SCORES")
)

print("✅ MCC categories prepared! Sample:")
naics_with_mcc_categories.limit(5).show()


📂 Loading top 5 results table...
🤖 Preparing data for AI_CLASSIFY to select best MCC match for each NAICS...
✅ MCC categories prepared! Sample:
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_CATEGORIES"  |"MCC_TEXTS"                                         |"SIMILARITY_RANKS"  |"SIMILARITY_SCORES"       |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|488999        |All Other Support Activities for Transportation...  |[                 |[                                                   |[                   |[                         |
|              |                                                    |  "4789",         |  "Trans

In [None]:
print(f"\n🤖 Using AI_CLASSIFY to select best MCC matches for {humanize.intcomma(naics_with_mcc_categories.count())} NAICS codes...")

# Use AI_CLASSIFY to classify each NAICS into one of its 5 MCC categories
naics_mcc_classified = naics_with_mcc_categories.select(
    "NAICS_CODE", 
    "NAICS_TEXT",
    "MCC_CATEGORIES",
    "MCC_TEXTS",
    "SIMILARITY_SCORES", 
    # Use AI_CLASSIFY to select best MCC match from the 5 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_CODE"), F.lit(". "), F.col("NAICS_TEXT")),    # input
        F.col("MCC_TEXTS")                                                  # list_of_categories
    ).alias("AI_CLASSIFIED_MCC_TEXT")
)

print("🔍 Sample AI classifications:")
naics_mcc_classified.selectExpr("""NAICS_CODE
                                , NAICS_TEXT
                                , MCC_TEXTS
                                , AI_CLASSIFIED_MCC_TEXT:labels[0]::string                   as AI_CLASSIFIED_MCC_TEXT_STR
                                , COALESCE(AI_CLASSIFIED_MCC_TEXT_STR, MCC_TEXTS[0]::string) as MOST_LIKELY_MCC_TEXT
                                , AI_CLASSIFIED_MCC_TEXT_STR is not null as CLASSIFY_HAD_RETURN 
                                , CLASSIFY_HAD_RETURN AND AI_CLASSIFIED_MCC_TEXT_STR = MCC_TEXTS[0]::string as CLASSIFY_RETURNED_MOST_SIMILAR_MCC
                                , SIMILARITY_SCORES
                              """
).where((~F.col("CLASSIFY_RETURNED_MOST_SIMILAR_MCC")) & (F.col("CLASSIFY_HAD_RETURN"))).limit(2).show(max_width=1000)

print("✅ AI classifications complete! Ready for extraction...")


🤖 Using AI_CLASSIFY to select best MCC matches for 2,125 NAICS codes...
🔍 Sample AI classifications:
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                               |"MCC_TEXTS"                       |"AI_CLASSIFIED_MCC_TEXT_STR"  |"MOST_LIKELY_MCC_TEXT"        |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_MCC"  |"SIMILARITY_SCORES"       |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|4885          |Freight Transportation Arrangement - NULL  |[                                 |AIRTRAN AIRWAYS - AIRTRANAIR  |AIRTRAN AIRWAYS - AIRTRANAIR

#### Expiriment with Similarity Scores vs `AI_FILTER`  

In [256]:
mcc_lu_spdf.select(
    F.lit("Freight Transportation Arrangement").alias("ref_naics_text"),
    F.concat(F.col("MCC_DESCRIPTIVE_TITLE"), F.lit(" "), F.col("INCLUDED_IN_THIS_MCC")).alias("MCC_FULL_DESC"),
    F.call_function("AI_SIMILARITY", 
        F.col("MCC_FULL_DESC"),
        F.lit("Freight Transportation Arrangement")
    ).alias("SIMILARITY_SCORE")
).sort(F.col("SIMILARITY_SCORE").desc()).show()

------------------------------------------------------------------------------------------------------------
|"REF_NAICS_TEXT"                    |"MCC_FULL_DESC"                                |"SIMILARITY_SCORE"   |
------------------------------------------------------------------------------------------------------------
|Freight Transportation Arrangement  |MERCHANTS RENT-A-CAR MERCHANTS RENT-A-CAR      |0.5132171842018974   |
|Freight Transportation Arrangement  |AFFILIATED AUTO RENTAL AFFILIATED AUTO RENTAL  |0.510001543979834    |
|Freight Transportation Arrangement  |FRONTIER AIRLINES FRONTIER AIR                 |0.5059433010952094   |
|Freight Transportation Arrangement  |AIRTRAN AIRWAYS AIRTRANAIR                     |0.4989260419694972   |
|Freight Transportation Arrangement  |US AIRWAYS USAIRWAYS                           |0.49606668527315634  |
|Freight Transportation Arrangement  |COMMAND AIRWAYS COMMAND AIR                    |0.48533914762693425  |
|Freight Transporta

In [257]:
mcc_lu_spdf.where(
    F.lower(F.concat(
        F.col("MCC_DESCRIPTIVE_TITLE"), 
        F.lit(" "), 
        F.col("INCLUDED_IN_THIS_MCC")
    )).rlike("logistics|freight|transport|shipping|cargo|moving|delivery|courier|warehouse")
).show()

-----------------------------------------------------------------------------------------------------------------------------------------------
|"MCC"  |"MCC_DESCRIPTIVE_TITLE"  |"INCLUDED_IN_THIS_MCC"  |"SIMILAR_MCC_CODES"  |"SIMILAR_MCC_CODE_READABLE"  |"MCC_ARRAY"  |"MCC_ARR_MAPS"  |
-----------------------------------------------------------------------------------------------------------------------------------------------
|       |                         |                        |                     |                             |             |                |
-----------------------------------------------------------------------------------------------------------------------------------------------



In [258]:
naisc_lu_spdf.select(
    F.col("*"),
    F.ai_filter(
                F.prompt("Is the following text related to Freight Transportation Arrangement? \n text: {0}"
                        , F.col("DESCRIPTION_FULL")
                )
              ).alias("IS_TRAVEL_OR_AIRLINES")
).where(F.col("IS_TRAVEL_OR_AIRLINES")).show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"CODE"  |"TITLE"                                            |"DESCRIPTION"                                       |"REFERENCE_CODE"  |"REFERENCE_DESCRIPTION"  |"DESCRIPTION_FULL"                                  |"IS_TRAVEL_OR_AIRLINES"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|322211  |Corrugated and Solid Fiber Box Manufacturing       |This U.S. industry comprises establishments pri...  |NULL              |NULL                     |This U.S. industry comprises establishments pri...  |True                     |
|48-49   |Transportation and Warehousing

In [259]:
mcc_lu_spdf.where(
    (~F.col("MCC").cast("string").startswith(F.lit("3")))
).select(
    F.col("*"),
    F.ai_filter(
                F.prompt("Is the following text related to Freight Transportation Arrangement? \n text: {0}"
                        , F.concat(F.col("MCC_DESCRIPTIVE_TITLE"), F.lit(" "), F.col("INCLUDED_IN_THIS_MCC"))
                )
              ).alias("IS_FREIGHT_TRANSPORTATION_ARRANGEMENT")
).where(
    (F.col("IS_FREIGHT_TRANSPORTATION_ARRANGEMENT"))  
).show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MCC"  |"MCC_DESCRIPTIVE_TITLE"                             |"INCLUDED_IN_THIS_MCC"                              |"SIMILAR_MCC_CODES"                                 |"SIMILAR_MCC_CODE_READABLE"                         |"MCC_ARRAY"                                         |"MCC_ARR_MAPS"                                      |"IS_FREIGHT_TRANSPORTATION_ARRANGEMENT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [271]:
naics_mcc_cross_join_spdf.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_CODE"  |"MCC_TEXT"                                          |"SIMILARITY_SCORE"     |
---------------------------------------------------------------------------------------------------------------------------------------------------------------
|11            |Agriculture, Forestry, Fishing and Hunting - Th...  |3783        |TOWN AND COUNTRY RESORT & CONVENTION - CENTER       |0.2461053630801569     |
|11            |Agriculture, Forestry, Fishing and Hunting - Th...  |3791        |STAYBRIDGE SUITES - STAYBRIDGE SUITES               |0.02745284785997664    |
|11            |Agriculture, Forestry, Fishing and Hunting - Th...  |3799        |HALE KOA HOTEL - HALE KOA HOTEL                     |0.014326015355619729   |
|11            |Agriculture, Forestry, F

In [276]:
(
    naics_mcc_cross_join_spdf
    .where(F.col("NAICS_TEXT").rlike("Freight Transportation Arrangement.*"))
    .orderBy(F.col("SIMILARITY_SCORE").desc())
    .limit(100)
).show(max_width=1000)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [301]:
non_null_freight_transportation_arrangement_naics_codes = ["4885"
                                                          ,"48851"
                                                          ,"488510"
                                                          ]
                                                          
def get_naics_mcc_matches(naics_codes, num_rows_for_each_code):
    # Calculate total expected rows
    total_expected_rows = len(naics_codes) * num_rows_for_each_code

    # Start with first code
    base_df = (naics_mcc_cross_join_spdf
        .where(F.col("NAICS_CODE") == naics_codes[0])   
        .orderBy(F.col("SIMILARITY_SCORE").desc())
        .select(
            F.col("*"),
            F.ai_filter(
                F.prompt("Is the following text related to Freight Transportation Arrangement? \n text: {0}"
                        , F.col("MCC_TEXT")
                )
            ).alias("IS_FREIGHT_TRANSPORTATION_ARRANGEMENT")
        ).orderBy(F.col("NAICS_CODE"),F.col("SIMILARITY_SCORE").desc())
        .limit(num_rows_for_each_code))

    # Union remaining codes
    for code in naics_codes[1:]:
        next_df = (naics_mcc_cross_join_spdf
            .where(F.col("NAICS_CODE") == code)   
            .orderBy(F.col("SIMILARITY_SCORE").desc())
            .select(
                F.col("*"),
                F.ai_filter(
                    F.prompt("Is the following text related to Freight Transportation Arrangement? \n text: {0}"
                            , F.col("MCC_TEXT")
                    )
            ).alias("IS_FREIGHT_TRANSPORTATION_ARRANGEMENT")
            ).orderBy(F.col("NAICS_CODE"),F.col("SIMILARITY_SCORE").desc())
            .limit(num_rows_for_each_code))
        base_df = base_df.unionAll(next_df)

    return base_df, total_expected_rows

# Call function with the codes and number of rows
result_df, total_rows = get_naics_mcc_matches(non_null_freight_transportation_arrangement_naics_codes, 5)
show_full_df(result_df, total_rows)

Unnamed: 0,NAICS_CODE,NAICS_TEXT,MCC_CODE,MCC_TEXT,SIMILARITY_SCORE,IS_FREIGHT_TRANSPORTATION_ARRANGEMENT
0,4885,Freight Transportation Arrangement - NULL,3063,US AIRWAYS - USAIRWAYS,0.471121,True
1,4885,Freight Transportation Arrangement - NULL,3061,CONTINENTAL - CONTINENTAL,0.467287,True
2,4885,Freight Transportation Arrangement - NULL,3089,TRANSAERO - TRANSAERO,0.464449,True
3,4885,Freight Transportation Arrangement - NULL,3016,SAS - SAS,0.449084,False
4,4885,Freight Transportation Arrangement - NULL,3177,AIRTRAN AIRWAYS - AIRTRANAIR,0.438593,True
5,48851,"Freight Transportation Arrangement - This industry comprises establishments primarily engaged in arranging transportation of freight between shippers and carriers. These establishments are usually known as freight forwarders, marine shipping agents, or customs brokers and offer a combination of services spanning transportation modes but do not directly provide shipping services. Cross-References.",4214,"Motor Freight Carriers and Trucking - Local and Long Distance, Moving and Storage Companies, and Local Delivery Services - Merchants classified with this MCC provide local or long -distance trucking services and may or may not also provide storage services. Delivery Services - Local Freight Carriers, Trucking and Storage Local Delivery Service Long Distance Trucking Services Motor Freight Carriers - Local, Long-Distance Trucking Moving Companies Moving, Storage Companies Storage, Moving Companies - Local, Long Distance",0.542151,True
6,48851,"Freight Transportation Arrangement - This industry comprises establishments primarily engaged in arranging transportation of freight between shippers and carriers. These establishments are usually known as freight forwarders, marine shipping agents, or customs brokers and offer a combination of services spanning transportation modes but do not directly provide shipping services. Cross-References.",4011,Railroads - Merchants classified with this MCC are railroads engaged in freight transport operations. Freight - Railroad and Train Transportation,0.514024,True
7,48851,"Freight Transportation Arrangement - This industry comprises establishments primarily engaged in arranging transportation of freight between shippers and carriers. These establishments are usually known as freight forwarders, marine shipping agents, or customs brokers and offer a combination of services spanning transportation modes but do not directly provide shipping services. Cross-References.",4722,"Travel Agencies and Tour Operators - Merchants classified with this MCC provide travel information and booking services. They act as agents on behalf of travelers in booking and ticketing air, land, or sea transportation and/or accommodation. They also arrange and assemble tours for sale through a travel agent or directly to the consumer. Bus charters and tour bus operators are included in this MCC. Charter Buses Package Tour Operators Tour Buses Travel Packages Travel Wholesalers",0.50949,False
8,48851,"Freight Transportation Arrangement - This industry comprises establishments primarily engaged in arranging transportation of freight between shippers and carriers. These establishments are usually known as freight forwarders, marine shipping agents, or customs brokers and offer a combination of services spanning transportation modes but do not directly provide shipping services. Cross-References.",4789,"Transportation Services (Not Elsewhere Classified) - Merchants classified with this MCC provide passenger transportation services that are not classified with a more specific MCC. This MCC includes Merchants that provide transportation via horse-drawn cabs and carriages, bicycle-taxis, aerial tramways, airport shuttles, and cable cars. Ferry service, bus transportation, cruise lines, passenger railways, and taxi and limousine service Merchants are not included in this category. Airport Shuttle Transportation Miscellaneous Transport Services Shuttle Transportation",0.500171,False
9,48851,"Freight Transportation Arrangement - This industry comprises establishments primarily engaged in arranging transportation of freight between shippers and carriers. These establishments are usually known as freight forwarders, marine shipping agents, or customs brokers and offer a combination of services spanning transportation modes but do not directly provide shipping services. Cross-References.",4215,"Courier Services - Air and Ground, and Freight Forwarders - Merchants classified with this MCC deliver individually addressed letters, parcels, and packages, but excludes the Postal Services - Government Only (MCC 9402). Air or Ground Courier Services Freight Forwarders, Courier Services",0.485541,True


--> Conclusion:  Do not trust similarity matching on 4 digit NAICS codes. Just null them / leave them out of the cross join & match 

#### Save `AI_CLASSIFY` results

In [303]:
print(f"\n🤖 Using AI_CLASSIFY to select best MCC matches for {humanize.intcomma(naics_with_mcc_categories.count())} NAICS codes...")

# Use AI_CLASSIFY to classify each NAICS into one of its 5 MCC categories
naics_mcc_classified = naics_with_mcc_categories.select(
    "NAICS_CODE", 
    "NAICS_TEXT",
    "MCC_CATEGORIES",
    "MCC_TEXTS",
    "SIMILARITY_SCORES", 
    # Use AI_CLASSIFY to select best MCC match from the 5 options
    F.call_function("AI_CLASSIFY",
        F.concat(F.col("NAICS_CODE"), F.lit(". "), F.col("NAICS_TEXT")),    # input
        F.col("MCC_TEXTS")                                                  # list_of_categories
    ).alias("AI_CLASSIFIED_MCC_TEXT")
)

print("🔍 Writing to sandbox.javier.naics_mcc_classified...")
naics_mcc_classified.selectExpr("""NAICS_CODE
                                , NAICS_TEXT
                                , MCC_TEXTS
                                , AI_CLASSIFIED_MCC_TEXT:labels[0]::string                   as AI_CLASSIFIED_MCC_TEXT_STR
                                , COALESCE(AI_CLASSIFIED_MCC_TEXT_STR, MCC_TEXTS[0]::string) as MOST_LIKELY_MCC_TEXT
                                , AI_CLASSIFIED_MCC_TEXT_STR is not null as CLASSIFY_HAD_RETURN 
                                , CLASSIFY_HAD_RETURN AND AI_CLASSIFIED_MCC_TEXT_STR = MCC_TEXTS[0]::string as CLASSIFY_RETURNED_MOST_SIMILAR_MCC
                                , SIMILARITY_SCORES
                              """
).write.mode("overwrite").save_as_table("sandbox.javier.naics_mcc_classified")



🤖 Using AI_CLASSIFY to select best MCC matches for 2,125 NAICS codes...
🔍 Writing to sandbox.javier.naics_mcc_classified...


In [304]:
# Reload the AI classifications from saved table
print("📥 Loading AI classifications from sandbox.javier.naics_mcc_classified...")
ai_classifications = session.table("sandbox.javier.naics_mcc_classified")

print(f"\n📊 Loaded {humanize.intcomma(ai_classifications.count())} AI classifications")
print("\n🔍 Sample of loaded data:")
ai_classifications.limit(5).show()



📥 Loading AI classifications from sandbox.javier.naics_mcc_classified...
Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/3a7077d2-14ae-4001-a736-75e0437e2b89/saml2?SAMLRequest=lZLdbuIwEIVfJfJeJ3YMNMUCKha2arT9QZB2pd6ZZALeOjbYDil9%2BnVCkboXrdQ7a3yO%2Fc2cGV29VjI4gLFCqzGKI4ICULkuhNqM0WN2HV6iwDquCi61gjE6gkVXk5Hlldyxae22agn7GqwL%2FEPKsu5ijGqjmOZWWKZ4BZa5nK2md7eMRoTtjHY61xJ9sHzt4NaCcZ7wbCms8Hhb53YM46ZpoqYXabPBlBCCyRB7VSv5cda%2F%2Bp4%2B0ceY9Fu9V3j54p3tp1CnEXyFtT6JLLvJskW4eFhlKJieUWda2boCswJzEDk8Lm9PANYTbIEbJ%2F1Qw%2BbtYkAJjazSTSn5C%2BS62tXOvxr5Ey6hwFJvhG88nY%2FR7kUUx2W5f94v1WHD%2F9z8LldwP1xBGfNsnaz%2F6mzN52RWZulD%2BvYrR8HTOVnaJptaW0Oq2jydLxE6CEkS0mFGYkYT1htEMUmeUTD3eQrFXec8Q3ccUSVyo60unVZSKOgoezwhSVLQMO5zCPt%2BpiFPehdhMgDS7yVA15dD3KZM0WlzWAdiJt%2Bdxwh%2FdL8v4b3PJ

Confirming if we should null out all 4 digit NAICS code results

In [312]:
(
    ai_classifications.select(
        "NAICS_CODE",
        "NAICS_TEXT",
        "MOST_LIKELY_MCC_TEXT",
        "CLASSIFY_HAD_RETURN", 
        "CLASSIFY_RETURNED_MOST_SIMILAR_MCC",
        F.get(F.col("SIMILARITY_SCORES"), 0).alias("SIMILARITY_SCORE")
    )
    .where(
        F.length(F.col("NAICS_CODE")) == 4
    )
).show(20)

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MOST_LIKELY_MCC_TEXT"                              |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_MCC"  |"SIMILARITY_SCORE"     |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|6215          |Medical and Diagnostic Laboratories - NULL          |Medical and Dental Laboratories - Merchants cla...  |False                  |False                                 |5.772550433000783e-01  |
|3141          |Textile Furnishings Mills - This industry group...  |Furniture, Home Furnishings, and Equipment Stor...  |False                  |False         

##### ⚠️ 🚨 Redoing all 4-digit NAICS codes and excluding hotel/airline MCCs (3000-series) since they are a special case

In [None]:
print("\n⚠️ Creating NAICS-MCC cross join using only 4-digit NAICS codes...")
num_4_digit_naics = naisc_lu_spdf.where(F.length(F.col('CODE')) == 4).count()
print(f"4-digit NAICS codes: {humanize.intcomma(num_4_digit_naics)}")

num_non_3000_mccs = mcc_lu_spdf.where(~F.col("MCC").startswith("3")).count()
print(f"Non-3000 MCCs: {humanize.intcomma(num_non_3000_mccs)}")

# First, let's examine the table structures
print("\n🔍 First, let's examine the table structures:")
print("\nNAICS Table Columns:")
print(naisc_lu_spdf.columns)
print("\nMCC Table Columns:")
print(mcc_lu_spdf.columns)

print(f"\n📊 Table Sizes:")
print(f"NAICS Records (4-digit): {humanize.intcomma(num_4_digit_naics)}")
print(f"MCC Records (Non-3000): {humanize.intcomma(num_non_3000_mccs)}")
print(f"Cross Join Size: {humanize.intcomma(num_4_digit_naics * num_non_3000_mccs)} total combinations")

naics_mcc_short_cross_join = (
    naisc_lu_spdf.where(F.length(F.col("CODE")) == 4).cross_join(
        mcc_lu_spdf.where(~F.col("MCC").startswith("3"))
    )
)


⚠️ Creating NAICS-MCC cross join using only 4-digit NAICS codes...
4-digit NAICS codes: 308
Non-3000 MCCs: 286

🔍 First, let's examine the table structures:

NAICS Table Columns:
['CODE', 'TITLE', 'DESCRIPTION', 'REFERENCE_CODE', 'REFERENCE_DESCRIPTION', 'DESCRIPTION_FULL']

MCC Table Columns:
['MCC', 'MCC_DESCRIPTIVE_TITLE', 'INCLUDED_IN_THIS_MCC', 'SIMILAR_MCC_CODES', 'SIMILAR_MCC_CODE_READABLE', 'MCC_ARRAY', 'MCC_ARR_MAPS']

📊 Table Sizes:
NAICS Records (4-digit): 308
MCC Records (Non-3000): 286
Cross Join Size: 88,088 total combinations


In [363]:
naics_mcc_short_cross_join.printSchema()

root
 |-- "CODE": StringType(16777216) (nullable = True)
 |-- "TITLE": StringType(16777216) (nullable = True)
 |-- "DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "REFERENCE_CODE": LongType() (nullable = True)
 |-- "REFERENCE_DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "DESCRIPTION_FULL": StringType(16777216) (nullable = True)
 |-- "MCC": LongType() (nullable = True)
 |-- "MCC_DESCRIPTIVE_TITLE": StringType(16777216) (nullable = True)
 |-- "INCLUDED_IN_THIS_MCC": StringType(16777216) (nullable = True)
 |-- "SIMILAR_MCC_CODES": StringType(16777216) (nullable = True)
 |-- "SIMILAR_MCC_CODE_READABLE": StringType() (nullable = True)
 |-- "MCC_ARRAY": ArrayType (nullable = True)
 |   |-- element: StringType()
 |-- "MCC_ARR_MAPS": ArrayType (nullable = True)
 |   |-- element: StringType()


In [364]:
naics_mcc_short_cross_join_with_texts = (
    naics_mcc_short_cross_join.select(
        # NAICS fields
        F.col("CODE").alias("NAICS_CODE"),
        F.concat(
            F.col("TITLE"),
            F.lit(" - "),
            F.col("DESCRIPTION_FULL")
        ).alias("NAICS_TEXT"),

        # MCC fields
        F.col("MCC").alias("MCC_CODE"),
        F.concat(
            F.col("MCC_DESCRIPTIVE_TITLE"),
            F.lit(" - "),
            F.col("INCLUDED_IN_THIS_MCC")
        ).alias("MCC_TEXT")
    )
)

naics_mcc_short_cross_join_with_texts.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_CODE"  |"MCC_TEXT"                                          |
---------------------------------------------------------------------------------------------------------------------------------------
|1111          |Oilseed and Grain Farming - This industry group...  |742         |Veterinary Services - Merchants classified with...  |
|1111          |Oilseed and Grain Farming - This industry group...  |1761        |Roofing, Siding, and Sheet Metal Work Contracto...  |
|1111          |Oilseed and Grain Farming - This industry group...  |4112        |Passenger Railways - Merchants classified with ...  |
|1111          |Oilseed and Grain Farming - This industry group...  |4457        |Boat Rentals and Leasing - Merchants classified...  |
|1111          |Oilseed and Grain Farming - This

In [370]:
# Create cross join with AI filter
print("\n🤖 Creating cross join with AI_FILTER...")

naics_mcc_cross_join_with_ai_filter = (
naics_mcc_short_cross_join_with_texts.select(
    F.col("*"),
    F.ai_filter(
                F.prompt("Is the following text related to {0} \n text: {1}"
                        , F.col("NAICS_TEXT")
                        , F.col("MCC_TEXT")
                )
              ).alias("NAICS_AND_MCC_ARE_RELATED")
).where(
    (F.col("NAICS_AND_MCC_ARE_RELATED"))  
))
naics_mcc_cross_join_with_ai_filter.show(2,max_width=1000)



🤖 Creating cross join with AI_FILTER...
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [372]:
# Write results to Snowflake table
print("\n💾 Writing results to Snowflake table...")
import time
start_time = time.time()

naics_mcc_cross_join_with_ai_filter.write.mode("overwrite").saveAsTable("sandbox.javier.naics_mcc_cross_join_with_ai_filter")

print("\n✅ Cross join with AI_FILTER created and saved successfully!")

end_time = time.time()
print(f"⏱️ Total execution time: {round(end_time - start_time, 2)} seconds")


💾 Writing results to Snowflake table...

✅ Cross join with AI_FILTER created and saved successfully!
⏱️ Total execution time: 83.93 seconds


In [380]:
# Reload cross join with AI filter from Snowflake and show stats
print("🔄 Reloading cross join with AI_FILTER from Snowflake...")

naics_mcc_cross_join_with_ai_filter = session.table("sandbox.javier.naics_mcc_cross_join_with_ai_filter")

# Get record count
record_count = naics_mcc_cross_join_with_ai_filter.count()
print(f"\n📊 Statistics:")
print(f"Total records: {record_count:,}")

# Show distribution of matches per NAICS code
print("\nMatches per NAICS code:")
# Get top 5
top_5 = naics_mcc_cross_join_with_ai_filter.groupBy("NAICS_CODE").count().orderBy("count", ascending=False).limit(5)

# Get bottom 5 
bottom_5 = naics_mcc_cross_join_with_ai_filter.groupBy("NAICS_CODE").count().orderBy("count", ascending=True).limit(5)

# Union the results and show
top_5.union(bottom_5).orderBy("count", ascending=False).show(10)

# Get count distribution statistics
count_stats = naics_mcc_cross_join_with_ai_filter.groupBy("NAICS_CODE").count().agg(
    F.count("count").alias("number_of_different_counts"),
    F.min("count").alias("min_matches_per_naics"),
    F.max("count").alias("max_matches_per_naics"),
    F.avg("count").alias("avg_matches_per_naics")
)

print("\nCount distribution statistics:")
count_stats.show()


🔄 Reloading cross join with AI_FILTER from Snowflake...

📊 Statistics:
Total records: 1,124

Matches per NAICS code:
--------------------------
|"NAICS_CODE"  |"COUNT"  |
--------------------------
|4599          |90       |
|5419          |43       |
|4249          |26       |
|8114          |19       |
|4561          |19       |
|1119          |1        |
|1113          |1        |
|1131          |1        |
|1124          |1        |
|1152          |1        |
--------------------------


Count distribution statistics:
--------------------------------------------------------------------------------------------------------------
|"NUMBER_OF_DIFFERENT_COUNTS"  |"MIN_MATCHES_PER_NAICS"  |"MAX_MATCHES_PER_NAICS"  |"AVG_MATCHES_PER_NAICS"  |
--------------------------------------------------------------------------------------------------------------
|246                           |1                        |90                       |4.569106                 |
----------------------------

In [381]:
naics_mcc_cross_join_with_ai_filter.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_CODE"  |"MCC_TEXT"                                          |"NAICS_AND_MCC_ARE_RELATED"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
|1113          |Fruit and Tree Nut Farming - This industry grou...  |763         |Agricultural Co-operatives - Merchants classifi...  |True                         |
|1114          |Greenhouse, Nursery, and Floriculture Productio...  |5992        |Florists - Merchants classified with this MCC s...  |True                         |
|1114          |Greenhouse, Nursery, and Floriculture Productio...  |5261        |Nurseries and Lawn and Garden Supply Stores - M...  |True                         |
|111

In [386]:
naics_mcc_cross_join_with_ai_filter.groupBy("NAICS_CODE", "NAICS_TEXT").agg(
    F.collect_list("MCC_TEXT").alias("MCC_TEXTS"),
    F.collect_list("MCC_CODE").alias("MCC_CODES")
).show(max_width=1000)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [399]:
(
    naics_mcc_cross_join_with_ai_filter.groupBy("NAICS_CODE", "NAICS_TEXT")
    .agg( F.collect_list("MCC_TEXT").alias("MCC_TEXTS"),
          F.collect_list("MCC_CODE").alias("MCC_CODES")
        ).select( F.col("NAICS_CODE"),    
                  F.col("NAICS_TEXT"),
                  F.col("MCC_TEXTS"),
                  F.col("MCC_CODES"),
                  F.call_function("AI_CLASSIFY",
                                  F.col("NAICS_TEXT"),
                                  F.col("MCC_TEXTS")
                                 ).alias("AI_CLASSIFICATION"),
        F.col("AI_CLASSIFICATION").getItem("labels")[0].alias("LABEL")
    )
).where(F.col("LABEL").isNotNull()).show(max_width=1000)

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                |"MCC_TEXTS"                                                                                     

only one good match found this way.

**Just NULL OUT the results for all 4 digit NAICS codes**

##### ⚠️ 🚨 Delete all MCC Matches for 4 digit NAICS Codes 

##### Complete NAICS to MCC Lookup 

In [400]:
# Reload the AI classifications from saved table
print("📥 Loading AI classifications from sandbox.javier.naics_mcc_classified...")
ai_classifications = session.table("sandbox.javier.naics_mcc_classified")

print(f"\n📊 Loaded {humanize.intcomma(ai_classifications.count())} AI classifications")
print("\n🔍 Sample of loaded data:")
ai_classifications.limit(5).show()



📥 Loading AI classifications from sandbox.javier.naics_mcc_classified...

📊 Loaded 2,125 AI classifications

🔍 Sample of loaded data:
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MCC_TEXTS"                                         |"AI_CLASSIFIED_MCC_TEXT_STR"  |"MOST_LIKELY_MCC_TEXT"                              |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_MCC"  |"SIMILARITY_SCORES"       |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [401]:
(
    ai_classifications.select(
        "NAICS_CODE",
        "NAICS_TEXT",
        "MOST_LIKELY_MCC_TEXT",
        "CLASSIFY_HAD_RETURN", 
        "CLASSIFY_RETURNED_MOST_SIMILAR_MCC",
    )
).show(max_width=1000)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Get distinct MCC code-text pairs from cross join table
print("📊 Getting distinct MCC code-text pairs...")
mcc_pairs = (
    session.table("sandbox.javier.naics_mcc_cross_join")
    .select("MCC_CODE", "MCC_TEXT")
    .distinct()
)

print(f"\n📈 Found {humanize.intcomma(mcc_pairs.count())} unique MCC pairs")
print("\n🔍 Sample of MCC pairs:")
mcc_pairs.limit(5).show()


📊 Getting distinct MCC code-text pairs...

📈 Found 882 unique MCC pairs

🔍 Sample of MCC pairs:
-------------------------------------------------------------------
|"MCC_CODE"  |"MCC_TEXT"                                          |
-------------------------------------------------------------------
|3716        |CARLTON HOTELS - CARLTON HOTELS                     |
|3732        |OPRYLAND HOTEL - OPRYLAND HOTEL                     |
|3741        |MILLENNIUM HOTELS - MILLENNIUM HOTELS               |
|3749        |THE BEVERLY HILLS HOTEL - THE BEVERLY HILLS HOTEL   |
|5074        |Plumbing and Heating Equipment and Supplies - M...  |
-------------------------------------------------------------------



In [413]:
# Join MCC codes back to AI classifications based on matched text
print("🔄 Joining MCC codes to AI classifications...")
naics_mcc_lookup = (
    ai_classifications.join(
        mcc_pairs,
        ai_classifications["MOST_LIKELY_MCC_TEXT"] == mcc_pairs["MCC_TEXT"],
        "left"
    )
    .select(
        ai_classifications["NAICS_CODE"],
        ai_classifications["NAICS_TEXT"], 
        ai_classifications["MOST_LIKELY_MCC_TEXT"],
        mcc_pairs["MCC_CODE"].alias("MOST_LIKELY_MCC_CODE"),
        ai_classifications["CLASSIFY_HAD_RETURN"],
        ai_classifications["CLASSIFY_RETURNED_MOST_SIMILAR_MCC"]
    )
)

print("\n🔍 Sample of joined data:")
naics_mcc_lookup.limit(5).show()


🔄 Joining MCC codes to AI classifications...

🔍 Sample of joined data:
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"NAICS_TEXT"                                        |"MOST_LIKELY_MCC_TEXT"                              |"MOST_LIKELY_MCC_CODE"  |"CLASSIFY_HAD_RETURN"  |"CLASSIFY_RETURNED_MOST_SIMILAR_MCC"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|5313          |Activities Related to Real Estate - This indust...  |AFFILIATED AUTO RENTAL - AFFILIATED AUTO RENTAL     |3351                    |False                  |False                                 |
|923           |Administration of Human Resource Programs - The...  |AFFILIATED AUTO 

In [414]:
naisc_lu_spdf.filter(~naisc_lu_spdf["CODE"].rlike("^[0-9]+$")).show(max_width=1000)

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [426]:
code_map_naics_mcc = naics_mcc_lookup.select(
    "NAICS_CODE",
    "MOST_LIKELY_MCC_CODE"
).distinct().orderBy("NAICS_CODE")

code_map_naics_mcc.write.mode("overwrite").save_as_table("sandbox.javier.code_map_naics_mcc")
code_map_naics_mcc = session.table("sandbox.javier.code_map_naics_mcc")
code_map_naics_mcc.orderBy("NAICS_CODE").show()

# Count distinct NAICS codes in our classified results
naics_count_classified = naics_mcc_lookup.select("NAICS_CODE").distinct().count()

# Count distinct NAICS codes in original lookup table
naics_count_original = naisc_lu_spdf.select("CODE").distinct().count()

print(f"Number of NAICS codes in original lookup:     {naics_count_original}")
print(f"Number of NAICS codes in classified results: {naics_count_classified}")
print(f"Difference: {naics_count_classified - naics_count_original}")

-----------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_MCC_CODE"  |
-----------------------------------------
|11            |763                     |
|111           |763                     |
|1111          |763                     |
|11111         |763                     |
|111110        |763                     |
|11112         |763                     |
|111120        |763                     |
|11113         |763                     |
|111130        |763                     |
|11114         |763                     |
-----------------------------------------

Number of NAICS codes in original lookup:     2125
Number of NAICS codes in classified results: 2125
Difference: 0


In [428]:
code_map_naics_sic.show()
code_map_naics_mcc.show()

-----------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_SIC_CODE"  |
-----------------------------------------
|11            |0921                    |
|111           |0191                    |
|1111          |0191                    |
|11111         |0116                    |
|111110        |0116                    |
|11112         |0116                    |
|111120        |0116                    |
|11113         |0161                    |
|111130        |0161                    |
|11114         |0111                    |
-----------------------------------------

-----------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_MCC_CODE"  |
-----------------------------------------
|441           |5511                    |
|4411          |5521                    |
|44111         |5511                    |
|441110        |5511                    |
|44112         |5521                    |
|441120        |5521                    |
|4412          |5521             

In [439]:
# Join the NAICS-SIC and NAICS-MCC mappings
joined_mappings = code_map_naics_sic.join(
    code_map_naics_mcc,
    on="NAICS_CODE", 
    how="left"
).orderBy("NAICS_CODE")

# Get statistics on the joined results
total_rows = joined_mappings.count()
null_mcc = joined_mappings.filter(joined_mappings.MOST_LIKELY_MCC_CODE.isNull()).count()
null_sic = joined_mappings.filter(joined_mappings.MOST_LIKELY_SIC_CODE.isNull()).count()
distinct_naics = joined_mappings.select("NAICS_CODE").distinct().count()
distinct_sic = joined_mappings.select("MOST_LIKELY_SIC_CODE").distinct().count()
distinct_mcc = joined_mappings.select("MOST_LIKELY_MCC_CODE").distinct().count()

print(f"\nMapping Statistics:")
print(f"Total NAICS codes mapped:          {distinct_naics:>6}")
print(f"Total SIC codes mapped:            {distinct_sic:>6}")
print(f"Total MCC codes mapped:            {distinct_mcc:>6}")
print()
print(f"NAICS codes without MCC mapping:   {null_mcc:>6} ({(null_mcc/total_rows*100):>5.1f}%)")
print(f"NAICS codes without SIC mapping:   {null_sic:>6} ({(null_sic/total_rows*100):>5.1f}%)")

# Show sample of results

print("\nRandom Sample of Mappings:")
joined_mappings.sample(n=10).show()


Mapping Statistics:
Total NAICS codes mapped:            2125
Total SIC codes mapped:               771
Total MCC codes mapped:               268

NAICS codes without MCC mapping:        0 (  0.0%)
NAICS codes without SIC mapping:        0 (  0.0%)

Random Sample of Mappings:
------------------------------------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_SIC_CODE"  |"MOST_LIKELY_MCC_CODE"  |
------------------------------------------------------------------
|6111          |8211                    |8211                    |
|541715        |8731                    |7395                    |
|71392         |7948                    |7999                    |
|21            |1389                    |5172                    |
|33361         |3568                    |5085                    |
|311212        |2044                    |2842                    |
|311411        |2037                    |5411                    |
|456191        |5411                    |5499       

In [458]:
# Rename columns, order by NAICS, and write to table
joined_mappings.orderBy("NAICS_CODE").write.mode("overwrite").saveAsTable("industry_code_mappings")



In [467]:

# Reload the saved table to verify
industry_code_mappings = session.table("industry_code_mappings")
industry_code_mappings.order_by(F.col("NAICS_CODE").asc()).show()


------------------------------------------------------------------
|"NAICS_CODE"  |"MOST_LIKELY_SIC_CODE"  |"MOST_LIKELY_MCC_CODE"  |
------------------------------------------------------------------
|11            |0921                    |763                     |
|111           |0191                    |763                     |
|1111          |0191                    |763                     |
|11111         |0116                    |763                     |
|111110        |0116                    |763                     |
|11112         |0116                    |763                     |
|111120        |0116                    |763                     |
|11113         |0161                    |763                     |
|111130        |0161                    |763                     |
|11114         |0111                    |763                     |
------------------------------------------------------------------



In [461]:
# Show schemas of lookup tables
print("NAICS Lookup Schema:")
naisc_lu_spdf.printSchema()

print("\nSIC Lookup Schema:")
sic_lu_spdf.printSchema()

print("\nMCC Lookup Schema:") 
mcc_lu_spdf.printSchema()


NAICS Lookup Schema:
root
 |-- "CODE": StringType(16777216) (nullable = True)
 |-- "TITLE": StringType(16777216) (nullable = True)
 |-- "DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "REFERENCE_CODE": LongType() (nullable = True)
 |-- "REFERENCE_DESCRIPTION": StringType(16777216) (nullable = True)
 |-- "DESCRIPTION_FULL": StringType(16777216) (nullable = True)

SIC Lookup Schema:
root
 |-- "SIC_INDUSTRY_CODE": StringType(15) (nullable = True)
 |-- "SIC_INDUSTRY_DESCRIPTION": StringType(250) (nullable = True)
 |-- "SIC_MAJOR_GROUP_DESCRIPTION": StringType(250) (nullable = True)
 |-- "SIC_DIVISION_DESCRIPTION": StringType(250) (nullable = True)

MCC Lookup Schema:
root
 |-- "MCC": LongType() (nullable = True)
 |-- "MCC_DESCRIPTIVE_TITLE": StringType(16777216) (nullable = True)
 |-- "INCLUDED_IN_THIS_MCC": StringType(16777216) (nullable = True)
 |-- "SIMILAR_MCC_CODES": StringType(16777216) (nullable = True)
 |-- "SIMILAR_MCC_CODE_READABLE": StringType() (nullable = True)
 |--

In [480]:
# Join with lookup tables to get additional attributes

industry_code_mappings_with_desc = \
(industry_code_mappings
    .join(naisc_lu_spdf, industry_code_mappings.NAICS_CODE == naisc_lu_spdf.CODE, "left")
    .join(sic_lu_spdf, industry_code_mappings.MOST_LIKELY_SIC_CODE == sic_lu_spdf.SIC_INDUSTRY_CODE, "left") 
    .join(mcc_lu_spdf, industry_code_mappings.MOST_LIKELY_MCC_CODE == mcc_lu_spdf.MCC, "left")
    .selectExpr(
        """ NAICS_CODE,
            MOST_LIKELY_SIC_CODE AS SIC_CODE,
            MOST_LIKELY_MCC_CODE AS MCC_CODE,
            -- CODE,
            TITLE AS NAICS_TITLE,
            -- DESCRIPTION,
            -- REFERENCE_CODE,
            -- REFERENCE_DESCRIPTION AS NAICS_REFERENCE_DESCRIPTION,
            DESCRIPTION_FULL AS NAICS_DESCRIPTION,
            -- SIC_INDUSTRY_CODE,
            SIC_MAJOR_GROUP_DESCRIPTION AS SIC_MAJOR_GROUP_TITLE,
            SIC_DIVISION_DESCRIPTION AS SIC_DIVISION_TITLE,
            SIC_INDUSTRY_DESCRIPTION AS SIC_TITLE,
            -- MCC,
            MCC_DESCRIPTIVE_TITLE AS MCC_TITLE,
            INCLUDED_IN_THIS_MCC AS MCC_INCLUDED_MERCHANTS,
            -- SIMILAR_MCC_CODES,
            -- SIMILAR_MCC_CODE_READABLE,
            -- MCC_ARRAY AS SIMILAR_MCC_CODES_ARRAY,
            -- MCC_ARR_MAPS AS SIMILAR_MCC_CODES_MAPS
        """
    )
)

industry_code_mappings_with_desc.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"NAICS_CODE"  |"SIC_CODE"  |"MCC_CODE"  |"NAICS_TITLE"                                       |"NAICS_DESCRIPTION"                                 |"SIC_MAJOR_GROUP_TITLE"          |"SIC_DIVISION_TITLE"  |"SIC_TITLE"                                       |"MCC_TITLE"                                         |"MCC_INCLUDED_MERCHANTS"                            |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [481]:
industry_code_mappings_with_desc.orderBy("NAICS_CODE").write.mode("overwrite").saveAsTable("industry_classifications")

In [483]:
# Join with lookup tables to get additional attributes

industry_code_mappings_with_desc_ext = \
(industry_code_mappings
    .join(naisc_lu_spdf, industry_code_mappings.NAICS_CODE == naisc_lu_spdf.CODE, "left")
    .join(sic_lu_spdf, industry_code_mappings.MOST_LIKELY_SIC_CODE == sic_lu_spdf.SIC_INDUSTRY_CODE, "left") 
    .join(mcc_lu_spdf, industry_code_mappings.MOST_LIKELY_MCC_CODE == mcc_lu_spdf.MCC, "left")
    .selectExpr(
        """ NAICS_CODE,
            MOST_LIKELY_SIC_CODE AS SIC_CODE,
            MOST_LIKELY_MCC_CODE AS MCC_CODE,
            -- CODE,
            TITLE AS NAICS_TITLE,
            -- DESCRIPTION,
            -- REFERENCE_CODE,
            -- REFERENCE_DESCRIPTION AS NAICS_REFERENCE_DESCRIPTION,
            DESCRIPTION_FULL AS NAICS_DESCRIPTION,
            -- SIC_INDUSTRY_CODE,
            SIC_MAJOR_GROUP_DESCRIPTION AS SIC_MAJOR_GROUP_TITLE,
            SIC_DIVISION_DESCRIPTION AS SIC_DIVISION_TITLE,
            SIC_INDUSTRY_DESCRIPTION AS SIC_TITLE,
            -- MCC,
            MCC_DESCRIPTIVE_TITLE AS MCC_TITLE,
            INCLUDED_IN_THIS_MCC AS MCC_INCLUDED_MERCHANTS,
            -- SIMILAR_MCC_CODES,
            -- SIMILAR_MCC_CODE_READABLE,
            MCC_ARRAY AS SIMILAR_MCC_CODES_ARRAY,
            MCC_ARR_MAPS AS SIMILAR_MCC_CODES_MAPS
        """
    )
)

industry_code_mappings_with_desc_ext.orderBy("NAICS_CODE").write.mode("overwrite").saveAsTable("industry_classifications_ext")