# Splink with Spark Backend - Iceberg Tables Demo

This notebook demonstrates how to use Splink with the Spark backend to read data from Iceberg tables stored in MinIO via a Nessie catalog.


In [None]:
# Install required packages
!pip install splink[spark] pyiceberg


In [None]:
import os
from pyspark.sql import SparkSession
from splink.spark.linker import SparkLinker
from splink.spark.comparison_library import exact_match, levenshtein_at_thresholds
from splink.spark.comparison_template_library import name_comparison

# Set AWS environment variables for MinIO access.
# NOTE: these are throwaway credentials for a local demo stack only; never put
# real credentials in a notebook.
# Both region variables are set on purpose: AWS_DEFAULT_REGION is the name used
# by the Python/CLI tooling, while the AWS SDK for Java v2 (used by Iceberg's
# S3FileIO inside the Spark JVM) resolves the region from AWS_REGION.
os.environ.update({
    'AWS_ACCESS_KEY_ID': 'minio',
    'AWS_SECRET_ACCESS_KEY': 'minio12345',
    'AWS_DEFAULT_REGION': 'us-east-1',
    'AWS_REGION': 'us-east-1',
    'AWS_ENDPOINT_URL': 'http://minio:9000',
    'AWS_ENDPOINT_URL_S3': 'http://minio:9000',
})


In [None]:
# Initialize Spark session with Iceberg support.
#
# Connection settings are read from the AWS_* environment variables set earlier
# in the notebook (falling back to the local demo values) so they are defined
# in one place instead of being repeated as literals.
s3_endpoint = os.environ.get("AWS_ENDPOINT_URL_S3", "http://minio:9000")
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "minio")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "minio12345")
s3_region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION", "us-east-1")

spark = (
    SparkSession.builder
    .appName("SplinkIcebergDemo")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Iceberg catalog named `nessie`, backed by the Nessie server.
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.type", "nessie")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v2")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://warehouse/")
    # Iceberg's S3FileIO uses the AWS SDK, not the Hadoop s3a settings below, so
    # give it the endpoint, region and credentials explicitly rather than
    # relying on the JVM inheriting them from the notebook's environment.
    .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.nessie.s3.endpoint", s3_endpoint)
    .config("spark.sql.catalog.nessie.s3.path-style-access", "true")
    .config("spark.sql.catalog.nessie.s3.access-key-id", s3_access_key)
    .config("spark.sql.catalog.nessie.s3.secret-access-key", s3_secret_key)
    .config("spark.sql.catalog.nessie.client.region", s3_region)
    # Hadoop s3a settings, used for any non-Iceberg s3a:// access.
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

print("Spark session initialized successfully!")


In [None]:
# Show what the session can see: catalogs, then the databases and tables
# inside the `nessie` catalog. Each entry is (heading to print, SQL to run).
catalog_queries = [
    ("Available catalogs:", "SHOW CATALOGS"),
    ("\nAvailable databases in nessie catalog:", "SHOW DATABASES IN nessie"),
    ("\nAvailable tables in nessie catalog:", "SHOW TABLES IN nessie"),
]

for heading, query in catalog_queries:
    print(heading)
    spark.sql(query).show()


In [None]:
# To work on a real Iceberg table instead of the sample below, read it with
# (replace 'your_table' with the actual table name):
# df = spark.sql("SELECT * FROM nessie.your_database.your_table LIMIT 10")
# df.show()

# For demonstration purposes a small in-memory dataset is built instead.
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Column names and raw records for the Splink test data; several records are
# deliberate near-duplicates of each other.
sample_columns = ("id", "first_name", "last_name", "email", "phone")
sample_records = [
    (1, "John", "Smith", "john.smith@email.com", "555-1234"),
    (2, "Jon", "Smith", "jon.smith@email.com", "555-1234"),
    (3, "John", "Smyth", "john.smyth@email.com", "555-1235"),
    (4, "Jane", "Doe", "jane.doe@email.com", "555-5678"),
    (5, "Jane", "Doe", "jane.doe@email.com", "555-5678"),
]
sample_data = [Row(**dict(zip(sample_columns, record))) for record in sample_records]

df = spark.createDataFrame(sample_data)
print("Sample dataset:")
df.show()


In [None]:
# Configure Splink with Spark backend
settings = {
    "link_type": "dedupe_only",
    # The sample data identifies records with `id`; without this Splink looks
    # for its default identifier column, `unique_id`.
    "unique_id_column_name": "id",
    "blocking_rules_to_generate_predictions": [
        "l.first_name = r.first_name",
        "l.last_name = r.last_name",
    ],
    # One comparison per column. The Levenshtein comparisons already include an
    # exact-match level, so adding a separate exact_match on the same column
    # would duplicate the comparison's output column and count the same
    # evidence twice.
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2),
        levenshtein_at_thresholds("last_name", 2),
        exact_match("email"),
        exact_match("phone"),
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

# Initialize Splink linker
linker = SparkLinker(df, settings)
print("Splink linker initialized successfully!")


In [None]:
# Train the model

# 1. Prior: estimate the probability that two random records match, from
#    deterministic rules assumed to find ~70% of true matches.
deterministic_rules = ["l.first_name = r.first_name", "l.last_name = r.last_name"]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

# 2. u-probabilities from randomly sampled record pairs.
linker.estimate_u_using_random_sampling(max_pairs=1e6)

# 3. m-probabilities via expectation maximisation. A training session cannot
#    estimate parameters for the column(s) used in its own blocking rule, so
#    two sessions with different blocking columns are run: the first covers
#    every comparison except first_name, the second fills in first_name.
linker.estimate_parameters_using_expectation_maximisation("l.first_name = r.first_name")
linker.estimate_parameters_using_expectation_maximisation("l.last_name = r.last_name")

print("Model training completed!")


In [None]:
# Get predictions
# `predict()` returns a Splink dataframe wrapper rather than a native Spark
# DataFrame, so convert it before calling Spark methods such as `.show()`.
predictions = linker.predict()
print("Predictions:")
predictions.as_spark_dataframe().show()


In [None]:
# Get clusters
# Pairs scoring at or above the threshold are linked into the same cluster.
# The result is a Splink dataframe wrapper, so convert it to a native Spark
# DataFrame before calling `.show()`.
clusters = linker.cluster_pairwise_predictions_at_threshold(predictions, threshold_match_probability=0.5)
print("Clusters:")
clusters.as_spark_dataframe().show()


In [None]:
# Example: Save results back to Iceberg table.
# `clusters` is a Splink dataframe wrapper, so convert it to a native Spark
# DataFrame before using the DataFrameWriterV2 API:
# clusters.as_spark_dataframe().writeTo("nessie.your_database.clustered_results").createOrReplace()

print("Splink with Spark backend demo completed successfully!")
print("\nTo use with your actual Iceberg tables:")
print("1. Replace the sample data with: df = spark.sql('SELECT * FROM nessie.your_database.your_table')")
print("2. Adjust the blocking rules and comparisons based on your data schema")
# Step 3 refers to the `clusters` variable produced by the clustering cell and
# includes the conversion to a Spark DataFrame that `.writeTo` requires.
print("3. Save results back to Iceberg using: clusters.as_spark_dataframe().writeTo('nessie.your_database.results_table').createOrReplace()")
