In [None]:
# Spark cluster connection smoke test.
# Opens a SparkSession against the standalone master, builds a five-row
# DataFrame and runs a few actions (show/count/agg/filter) on it.

import os
import traceback

from pyspark.sql import SparkSession

# Use the same SPARK_MASTER_URL override and default as the connection
# settings cell of this notebook, so every cell targets the same master
# instead of a URL hard-coded in two places.
_spark_master_url = os.getenv('SPARK_MASTER_URL', 'spark://spark-master:7077')

print("üéØ Connecting to Spark Cluster...")
print(f"‚ö° Cluster: {_spark_master_url}")

try:
    # Create SparkSession with cluster connection
    spark = SparkSession.builder \
        .appName("DataForge-JupyterLab") \
        .master(_spark_master_url) \
        .config("spark.executor.memory", "512m") \
        .config("spark.driver.memory", "512m") \
        .config("spark.executor.cores", "1") \
        .config("spark.cores.max", "2") \
        .getOrCreate()

    print("‚úÖ SparkSession created successfully!")
    print(f"‚úÖ Master: {spark.sparkContext.master}")
    print(f"‚úÖ Version: {spark.version}")
    print(f"‚úÖ App Name: {spark.sparkContext.appName}")

    # Test with sample data
    print("\nüß™ Testing distributed processing...")
    sample_data = [
        ("Alice", 25, "Engineer"),
        ("Bob", 30, "Manager"),
        ("Charlie", 35, "Analyst"),
        ("Diana", 28, "Developer"),
        ("Eve", 32, "Scientist")
    ]

    df = spark.createDataFrame(sample_data, ["name", "age", "role"])

    # Show sample data
    print("üìä Sample Dataset:")
    df.show()

    # Three separate Spark actions: a count, an aggregate and a filtered collect
    total_records = df.count()
    avg_age = df.agg({"age": "avg"}).collect()[0][0]
    senior_staff = df.filter(df.age >= 30).select("name", "role").collect()

    print(f"‚úÖ Total records: {total_records}")
    print(f"‚úÖ Average age: {avg_age:.1f}")
    print(f"‚úÖ Senior staff (30+): {[(row.name, row.role) for row in senior_staff]}")

    print("\nüéâ SPARK CLUSTER CONNECTION SUCCESSFUL!")
    print("üöÄ Ready for distributed data processing!")

    # The session is intentionally left running so later cells can reuse `spark`
    print("\nüí° SparkSession is ready for use in subsequent cells!")

except Exception as e:
    # Best-effort cell: report the failure with its traceback and let the
    # rest of the notebook keep running.
    print(f"‚ùå Connection failed: {e}")
    traceback.print_exc()

# Quick Data Forge Connections

Ready-to-use connection snippets for all Data Forge services.

## 🚀 Quick Start
Just run the cell for the service you need!

## Environment Variables
All connection credentials are read from the Docker environment variables; when a variable is not set, the development defaults shown in the cell below are used instead.

In [1]:
import os
from urllib.parse import quote_plus


def build_service_url(scheme, user, password, host, port, database):
    """Return ``scheme://user:password@host:port/database`` with escaped credentials.

    The user name and password are percent-encoded so that characters such as
    ``@``, ``/`` or ``:`` inside a credential cannot be mistaken for URL
    delimiters. Credentials without special characters are left unchanged.
    """
    return f"{scheme}://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{database}"


# Database connections (values come from the environment, with development defaults)
POSTGRES_URL = build_service_url(
    "postgresql",
    os.getenv('POSTGRES_USER', 'admin'),
    os.getenv('POSTGRES_PASSWORD', 'admin'),
    "postgres",
    5432,
    os.getenv('POSTGRES_DB', 'metastore'),
)
CLICKHOUSE_URL = build_service_url(
    "clickhouse",
    os.getenv('CLICKHOUSE_USER', 'admin'),
    os.getenv('CLICKHOUSE_PASSWORD', 'admin'),
    "clickhouse",
    8123,
    os.getenv('CLICKHOUSE_DB', 'analytics'),
)

# Object storage
MINIO_ENDPOINT = "http://minio:9000"
MINIO_ACCESS_KEY = os.getenv('MINIO_ROOT_USER', 'minio')
MINIO_SECRET_KEY = os.getenv('MINIO_ROOT_PASSWORD', 'minio123')

# Streaming
KAFKA_SERVERS = os.getenv('KAFKA_BOOTSTRAP_SERVERS', 'kafka:9092')
SCHEMA_REGISTRY_URL = os.getenv('SCHEMA_REGISTRY_URL', 'http://schema-registry:8081')

# Services
TRINO_URL = "http://trino:8080"
SPARK_MASTER = os.getenv('SPARK_MASTER_URL', 'spark://spark-master:7077')

print("‚úÖ Connection URLs configured!")

‚úÖ Connection URLs configured!


## 📊 PostgreSQL Connection

In [2]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Option A: SQLAlchemy engine + pandas -- the convenient route for analysis.
# `pg_engine` stays defined for later cells (the health check reuses it).
pg_engine = create_engine(POSTGRES_URL)
df = pd.read_sql("SELECT current_timestamp as now", pg_engine)
print("üìä PostgreSQL via pandas:")
print(df)

# Option B: raw psycopg2 connection, configured from the same environment
# variables (and the same fallbacks) that POSTGRES_URL is built from.
_pg_connect_kwargs = {
    'host': 'postgres',
    'port': 5432,
    'database': os.getenv('POSTGRES_DB', 'metastore'),
    'user': os.getenv('POSTGRES_USER', 'admin'),
    'password': os.getenv('POSTGRES_PASSWORD', 'admin'),
}
pg_conn = psycopg2.connect(**_pg_connect_kwargs)

print("‚úÖ PostgreSQL connections ready!")
# The direct connection is only opened as a connectivity check; close it right away.
pg_conn.close()

## 🚀 ClickHouse Connection

In [3]:
import clickhouse_connect

# Connect to ClickHouse with credentials from the environment
# (development defaults are used when a variable is unset).
ch_client = clickhouse_connect.get_client(
    host='clickhouse',
    port=8123,
    username=os.getenv('CLICKHOUSE_USER', 'admin'),
    password=os.getenv('CLICKHOUSE_PASSWORD', 'admin'),
    database=os.getenv('CLICKHOUSE_DB', 'analytics')
)

# Test query.
# NOTE(review): the previous code called `result_as_dataframe()` on the object
# returned by `ch_client.query(...)`; that method does not appear in the
# clickhouse-connect API. `Client.query_df` returns a pandas DataFrame directly.
df_ch = ch_client.query_df("SELECT 'Hello ClickHouse!' as message, now() as timestamp")
print("üìä ClickHouse Result:")
print(df_ch)

print("‚úÖ ClickHouse connection ready!")

## ☁️ MinIO S3 Connection

In [4]:
import boto3
import pandas as pd
from io import StringIO

# boto3 S3 client pointed at the MinIO endpoint, using the MinIO root
# credentials resolved in the connection settings cell.
s3_client = boto3.client(
    service_name='s3',
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
)

# Listing the buckets doubles as a connectivity check.
buckets = s3_client.list_buckets()
_bucket_names = [entry['Name'] for entry in buckets['Buckets']]
print(f"üìÅ Available buckets: {_bucket_names}")

# Example: Save DataFrame to MinIO
def save_dataframe_to_minio(df, bucket, key):
    """Upload ``df`` as a CSV object to ``s3://bucket/key``.

    The frame is serialised without its index and sent in a single
    ``put_object`` call on the module-level ``s3_client``. Prints a
    confirmation line and returns ``None``.
    """
    # With no target buffer, to_csv returns the CSV text itself.
    csv_text = df.to_csv(index=False)
    s3_client.put_object(Bucket=bucket, Key=key, Body=csv_text)
    print(f"üì§ DataFrame saved to s3://{bucket}/{key}")

# Example: Load DataFrame from MinIO
def load_dataframe_from_minio(bucket, key):
    """Fetch the object at ``s3://bucket/key`` and parse it as CSV.

    Uses the module-level ``s3_client``; the object's ``Body`` stream is
    handed straight to ``pandas.read_csv``. Returns the resulting DataFrame.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    body_stream = response['Body']
    return pd.read_csv(body_stream)

print("‚úÖ MinIO S3 connection ready!")

## 📨 Kafka Connection

In [5]:
from kafka import KafkaProducer, KafkaConsumer
import json
from datetime import datetime

# Producer setup
# KAFKA_SERVERS comes from an environment variable and may hold several
# comma-separated brokers ("host1:9092,host2:9092"). Wrapping the raw string
# in a list would pass that as ONE malformed address, so split it first.
producer = KafkaProducer(
    bootstrap_servers=[server.strip() for server in KAFKA_SERVERS.split(',') if server.strip()],
    # Messages are JSON-encoded and sent as UTF-8 bytes
    value_serializer=lambda x: json.dumps(x).encode('utf-8')
)

# Send message function
def send_message(topic, message):
    """Publish ``message`` to ``topic`` inside a timestamped envelope.

    The payload sent is ``{'timestamp': <datetime.now() ISO string>,
    'message': message}``. Waits up to 10 seconds for the send to complete,
    prints the partition and offset, and returns the record metadata.
    """
    envelope = dict(timestamp=datetime.now().isoformat(), message=message)
    # .get() blocks until the send completes (or 10 s elapse)
    metadata = producer.send(topic, envelope).get(timeout=10)
    print(f"üì§ Message sent to {topic}: partition {metadata.partition}, offset {metadata.offset}")
    return metadata

# Consumer setup
def create_consumer(topic, group_id='jupyter-consumer'):
    """Create a KafkaConsumer subscribed to ``topic``.

    Args:
        topic: Topic name to subscribe to.
        group_id: Consumer group id (default ``'jupyter-consumer'``).

    Returns:
        A ``KafkaConsumer`` that JSON-decodes message values and, when the
        group has no committed offset, starts from the latest messages.
    """
    # KAFKA_SERVERS may be a comma-separated broker list; split it so each
    # broker is passed as its own entry instead of one malformed address.
    servers = [server.strip() for server in KAFKA_SERVERS.split(',') if server.strip()]
    return KafkaConsumer(
        topic,
        bootstrap_servers=servers,
        group_id=group_id,
        value_deserializer=lambda m: json.loads(m.decode('utf-8')),
        auto_offset_reset='latest'
    )

print("‚úÖ Kafka producer/consumer ready!")

## ⚡ Trino SQL Engine

In [6]:
from trino.dbapi import connect as trino_connect
import pandas as pd

# Shared Trino DB-API connection used by query_trino() below.
# The default catalog/schema point at Trino's built-in system tables.
trino_conn = trino_connect(
    user='admin',
    host='trino',
    port=8080,
    schema='runtime',
    catalog='system',
)

# Query function
def query_trino(sql):
    """Run ``sql`` on the shared ``trino_conn`` and return the rows as a DataFrame.

    Args:
        sql: SQL text to execute.

    Returns:
        ``pandas.DataFrame`` whose columns are named from the cursor
        description; an empty frame when the statement yields no columns.

    The cursor is always closed, even when execution or fetching fails.
    """
    cursor = trino_conn.cursor()
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        # `description` is None when a statement returns no result columns;
        # treat that as "no columns" instead of failing on iteration.
        # NOTE(review): read after fetchall() on the assumption that some
        # client versions only fill it in once results arrive -- confirm.
        columns = [desc[0] for desc in (cursor.description or [])]
    finally:
        cursor.close()
    return pd.DataFrame(rows, columns=columns)

# Smoke test: listing the catalogs exercises the connection end to end.
catalogs_df = query_trino("SHOW CATALOGS")
print("üìö Available Catalogs:", catalogs_df, sep="\n")

print("‚úÖ Trino connection ready!")

## 🔥 Spark Connection

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp

# Build (or reuse) the notebook's Spark session against SPARK_MASTER with
# adaptive query execution and partition coalescing switched on.
# NOTE(review): the saved output of this cell is
# "TypeError: 'JavaPackage' object is not callable". That looks like an
# environment problem (possibly a pyspark / Spark / Java mismatch) rather than
# a defect in this code -- confirm against the container image.
spark = (
    SparkSession.builder
    .appName("DataForgeJupyter")
    .master(SPARK_MASTER)
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Tiny in-memory DataFrame as a smoke test
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Charlie", 35),
]
df = spark.createDataFrame(data, columns)

print("üìä Sample Spark DataFrame:")
df.show()

print(f"‚ö° Spark Application ID: {spark.sparkContext.applicationId}")
print("‚úÖ Spark session ready!")

# The session is deliberately left running so later cells can reuse `spark`.

TypeError: 'JavaPackage' object is not callable

### 🔧 Spark Cluster Connection (Advanced)

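⚠️ **Note**: The cell above connects to whatever `SPARK_MASTER` points at (by default the cluster at `spark://spark-master:7077`); it does not use local mode. For a local-mode session that avoids network issues, use `create_local_spark()` from the Spark Troubleshooting section below. If you need cluster mode, use the code below carefully.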

In [None]:
# Informational cell only: it prints guidance and creates no Spark session.
# Cluster connection is disabled here to prevent Netty errors.
_NOTICE_LINES = (
    "üö´ Cluster connection is disabled to prevent network issues",
    "üí° For development, local Spark mode above is perfect!",
    "",
    "? If you REALLY need cluster mode for production:",
    "   1. Ensure proper Docker networking is configured",
    "   2. Configure Spark driver host settings correctly",
    "   3. Set up proper port mappings",
    "   4. Consider using external Spark cluster instead",
    "",
    "üìö Benefits of local mode:",
    "   ‚úÖ No network configuration needed",
    "   ‚úÖ Faster startup and execution",
    "   ‚úÖ Perfect for data exploration",
    "   ‚úÖ Handles datasets up to several GB easily",
    "   ‚úÖ Full Spark SQL and DataFrame capabilities",
    "",
    "üéØ For most data engineering tasks in Jupyter, local mode is ideal!",
    # A disabled `try_cluster_connection()` stub used to sit here inside a
    # string literal; it never executed. Cluster mode needs extensive network
    # configuration before such a helper could be enabled.
    "‚úÖ Using safe local-only Spark configuration",
)

for _notice_line in _NOTICE_LINES:
    print(_notice_line)

### ✅ Quick Spark Verification

Run this cell to verify Spark is working correctly:

In [None]:
# Smoke test for whichever Spark session an earlier cell left in `spark`.
if 'spark' not in globals() or spark is None:
    print("‚ö†Ô∏è Spark not initialized. Run the Spark connection cell above first.")
else:
    try:
        # Build a two-row DataFrame and count it
        test_df = spark.createDataFrame(
            [(1, "test"), (2, "data")],
            ["id", "value"],
        )
        row_count = test_df.count()

        print("‚úÖ Spark verification successful!")
        print(f"üìä Test DataFrame has {row_count} rows")
        print(f"üéØ Spark master: {spark.sparkContext.master}")
        print("üöÄ Ready for data processing!")
    except Exception as e:
        # Report instead of raising so the notebook keeps running
        print(f"‚ùå Spark verification failed: {e}")

### 🔧 Spark Troubleshooting

If Spark connection fails, try these alternatives:

In [4]:
# Alternative Spark connection methods

# Method 1: Local mode (for development/testing)
def create_local_spark():
    """Build a ``local[*]`` Spark session named "DataForge-Local".

    Enables adaptive query execution and sets the SQL warehouse directory to
    ``/tmp/spark-warehouse``. Returns the session, or ``None`` (after
    printing the error) if the session cannot be created.
    """
    settings = (
        ("spark.sql.adaptive.enabled", "true"),
        ("spark.sql.warehouse.dir", "/tmp/spark-warehouse"),
    )
    try:
        builder = SparkSession.builder.appName("DataForge-Local").master("local[*]")
        for option, value in settings:
            builder = builder.config(option, value)
        session = builder.getOrCreate()
        print("‚úÖ Local Spark session created")
        return session
    except Exception as e:
        print(f"‚ùå Local Spark failed: {e}")
        return None

# Method 2: Simple cluster connection
def create_cluster_spark(master_url=None):
    """Create a Spark session named "DataForge-Cluster" on a standalone master.

    Args:
        master_url: Spark master URL. When omitted, the ``SPARK_MASTER_URL``
            environment variable is used, falling back to
            ``spark://spark-master:7077`` (the value that was previously
            hard-coded here).

    Returns:
        The session, or ``None`` (after printing the error) if it cannot be
        created. Executor and driver memory are both set to 512m.
    """
    import os  # local import: this cell does not import os itself

    if master_url is None:
        master_url = os.getenv('SPARK_MASTER_URL', 'spark://spark-master:7077')
    try:
        cluster_spark = SparkSession.builder \
            .appName("DataForge-Cluster") \
            .master(master_url) \
            .config("spark.executor.memory", "512m") \
            .config("spark.driver.memory", "512m") \
            .getOrCreate()
        print("‚úÖ Cluster Spark session created")
        return cluster_spark
    except Exception as e:
        print(f"‚ùå Cluster Spark failed: {e}")
        return None

# Test methods
print("üß™ Testing Spark connection methods...")

# Only build a session when no earlier cell left a usable one in `spark`.
# Local mode is attempted first, then the cluster; the first success wins.
if globals().get('spark') is None:
    for _attempt_message, _factory in (
        ("üîÑ Trying local Spark...", create_local_spark),
        ("üîÑ Trying cluster Spark...", create_cluster_spark),
    ):
        print(_attempt_message)
        spark = _factory()
        if spark is not None:
            break

if not spark:
    print("‚ùå No Spark session available")
    print("üí° Check Docker services: docker compose ps | grep spark")
else:
    print(f"‚úÖ Active Spark session: {spark.sparkContext.master}")

üß™ Testing Spark connection methods...
üîÑ Trying local Spark...
‚ùå Local Spark failed: 'JavaPackage' object is not callable
üîÑ Trying cluster Spark...
‚ùå Cluster Spark failed: 'JavaPackage' object is not callable
‚ùå No Spark session available
üí° Check Docker services: docker compose ps | grep spark


## 🔴 Redis Cache

In [8]:
import redis
import json

# Redis client shared by the cache helpers and the health check.
# decode_responses=True makes reads return str rather than bytes.
r = redis.Redis(
    host='redis',
    port=6379,
    decode_responses=True,
)

# Cache functions
def cache_dataframe(key, df, expire_seconds=3600):
    """Store ``df`` in Redis under ``key`` as record-oriented JSON.

    The entry expires after ``expire_seconds`` (one hour by default).
    Prints a confirmation line and returns ``None``.
    """
    r.setex(key, expire_seconds, df.to_json(orient='records'))
    print(f"üìù DataFrame cached with key: {key}")

def get_cached_dataframe(key):
    """Retrieve a DataFrame previously stored with ``cache_dataframe``.

    Args:
        key: Redis key to look up.

    Returns:
        The decoded ``pandas.DataFrame``, or ``None`` when the key is missing
        (or holds an empty value).
    """
    from io import StringIO  # local import: this cell does not import io

    json_data = r.get(key)
    if not json_data:
        return None
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; wrapping it in a file-like object works on old and new
    # versions alike.
    return pd.read_json(StringIO(json_data), orient='records')

# Round-trip a throwaway key to show that writes and reads both work.
_probe_key = 'test:message'
r.set(_probe_key, 'Hello from Redis!')
message = r.get(_probe_key)
print(f"üí¨ Cached message: {message}")

print("‚úÖ Redis connection ready!")

## 🔧 Connection Status Check

In [None]:
# Quick health check for all services
def check_all_connections():
    """Probe every service client created earlier in the notebook.

    Services are checked in a fixed order (PostgreSQL, ClickHouse, MinIO,
    Redis, Spark, Trino). A probe that raises -- including a NameError because
    its connection cell was never run -- is marked failed and the first 50
    characters of the error are printed. Spark is reported as
    "Session not initialized" when no ``spark`` global exists.

    Prints a per-service status list and a summary line.

    Returns:
        dict mapping service name to its status string.
    """

    def _ok_after(action):
        """Wrap ``action`` so it reports the success marker once it returns."""
        def run():
            action()
            return '‚úÖ'
        return run

    def _spark_status():
        """Run a trivial query when a session exists, else report it missing."""
        if 'spark' in globals() and spark is not None:
            spark.sql("SELECT 1").collect()
            return '‚úÖ'
        return '‚ùå (Session not initialized)'

    # Lambdas defer global lookups until the probe runs inside the try below.
    probes = (
        ('PostgreSQL', _ok_after(lambda: pd.read_sql("SELECT 1", pg_engine))),
        ('ClickHouse', _ok_after(lambda: ch_client.query("SELECT 1"))),
        ('MinIO', _ok_after(lambda: s3_client.list_buckets())),
        ('Redis', _ok_after(lambda: r.ping())),
        ('Spark', _spark_status),
        ('Trino', _ok_after(lambda: query_trino("SELECT 1 as test"))),
    )

    status = {}
    for service, probe in probes:
        try:
            status[service] = probe()
        except Exception as e:
            status[service] = '‚ùå'
            print(f"   {service} error: {str(e)[:50]}...")

    print("üîç Connection Status:")
    for service, stat in status.items():
        print(f"  {stat} {service}")

    # Summary
    total = len(status)
    successful = len([stat for stat in status.values() if '‚úÖ' in stat])
    print(f"\nüìä Overall Status: {successful}/{total} services connected ({successful/total*100:.1f}%)")

    return status

check_all_connections()

## 📚 Ready-to-Use Code Snippets

### Load data from PostgreSQL to Spark
```python
# Spark's JDBC source needs a "jdbc:postgresql://host:port/db" URL with the
# credentials passed as separate options; the SQLAlchemy-style POSTGRES_URL
# ("postgresql://user:password@...") is not a valid JDBC URL.
# The PostgreSQL JDBC driver jar must be available on the Spark classpath.
df_spark = (
    spark.read.format("jdbc")
    .option("url", f"jdbc:postgresql://postgres:5432/{os.getenv('POSTGRES_DB', 'metastore')}")
    .option("user", os.getenv('POSTGRES_USER', 'admin'))
    .option("password", os.getenv('POSTGRES_PASSWORD', 'admin'))
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "your_table")
    .load()
)
```

### Save Spark DataFrame to ClickHouse
```python
# Convert the Spark DataFrame (df_spark, as loaded in the PostgreSQL snippet)
# to pandas, then insert it into ClickHouse. toPandas() collects every row
# into the driver's memory, so keep this for results that fit comfortably there.
pandas_df = df_spark.toPandas()
ch_client.insert_df('your_table', pandas_df)
```

### Stream data with Kafka
```python
# Send data
send_message('your-topic', {'key': 'value'})

# Consume data
# The consumer starts at the latest offset, so this loop waits until a new
# message arrives on the topic.
consumer = create_consumer('your-topic')
try:
    for message in consumer:
        print(message.value)
        break  # Process one message
finally:
    consumer.close()  # always release the consumer's broker connections
```