# Quick Data Forge Connections

Ready-to-use connection snippets for all Data Forge services.

## Quick Start
Just run the cell for the service you need!

## Environment Setup

**Run this cell first!** Connection settings are read from Docker environment variables where available; any variable that is not set falls back to the built-in default shown in the cell.

In [None]:
import os
from urllib.parse import quote


def _env_quoted(name, default):
    """Return environment variable *name* (or *default*) percent-encoded for use in a URL.

    Every reserved character is escaped (``safe=""``) so that credentials
    containing ``@``, ``/`` or ``:`` cannot be mistaken for URL delimiters.
    """
    return quote(os.getenv(name, default), safe="")


# Database URLs: user, password and database name are percent-encoded before
# interpolation. With the default values the resulting URLs are unchanged.
POSTGRES_URL = (
    f"postgresql://{_env_quoted('POSTGRES_USER', 'admin')}:"
    f"{_env_quoted('POSTGRES_PASSWORD', 'admin')}@postgres:5432/"
    f"{_env_quoted('POSTGRES_DB', 'metastore')}"
)
CLICKHOUSE_URL = (
    f"clickhouse://{_env_quoted('CLICKHOUSE_USER', 'admin')}:"
    f"{_env_quoted('CLICKHOUSE_PASSWORD', 'admin')}@clickhouse:8123/"
    f"{_env_quoted('CLICKHOUSE_DB', 'analytics')}"
)

# Object storage: the endpoint is fixed, the credentials come from the environment.
MINIO_ENDPOINT = "http://minio:9000"
MINIO_ACCESS_KEY = os.getenv('MINIO_ROOT_USER', 'minio')
MINIO_SECRET_KEY = os.getenv('MINIO_ROOT_PASSWORD', 'minio123')

# Streaming.
KAFKA_SERVERS = os.getenv('KAFKA_BOOTSTRAP_SERVERS', 'kafka:9092')
SCHEMA_REGISTRY_URL = os.getenv('SCHEMA_REGISTRY_URL', 'http://schema-registry:8081')

# Query / compute engines.
TRINO_URL = "http://trino:8080"
SPARK_MASTER = os.getenv('SPARK_MASTER_URL', 'spark://spark-master:7077')

print("[SUCCESS] Connection URLs configured!")

In [None]:
from pyspark.sql import SparkSession

print("Connecting to Spark Cluster...")

try:
    # SPARK_MASTER is defined by the Environment Setup cell. It is first read
    # inside the try so that a missing definition is reported by the handler
    # below (together with its setup hint) instead of escaping as a NameError.
    print(f"Cluster: {SPARK_MASTER}")

    # Build (or reuse) a session against the cluster with small, fixed
    # per-application resource settings.
    spark = (
        SparkSession.builder
        .appName("DataForge-JupyterLab")
        .master(SPARK_MASTER)
        .config("spark.executor.memory", "512m")
        .config("spark.driver.memory", "512m")
        .config("spark.executor.cores", "1")
        .config("spark.cores.max", "2")
        .getOrCreate()
    )

    print("[SUCCESS] SparkSession created successfully!")
    print(f"[SUCCESS] Master: {spark.sparkContext.master}")
    print(f"[SUCCESS] Version: {spark.version}")

    # Round-trip a tiny in-memory dataset to check the session can run a job.
    sample_data = [
        ("Alice", 25, "Engineer"),
        ("Bob", 30, "Manager"),
        ("Charlie", 35, "Analyst"),
    ]

    df = spark.createDataFrame(sample_data, ["name", "age", "role"])
    print("Sample Dataset:")
    df.show()

    print("\nSPARK CLUSTER CONNECTION SUCCESSFUL!")
    print("SparkSession is ready for use!")

except Exception as e:
    # Best-effort cell: report the failure and keep the notebook usable.
    print(f"[ERROR] Connection failed: {e}")
    print("Make sure to run the Environment Setup cell first!")

## Spark Cluster Connection

## PostgreSQL Connection

In [3]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Build a SQLAlchemy engine from POSTGRES_URL and run a trivial query through
# pandas (the recommended access path in this notebook).
pg_engine = create_engine(POSTGRES_URL)
df = pd.read_sql(sql="SELECT current_timestamp as now", con=pg_engine)
print("PostgreSQL Result:", df, sep="\n")

print("[SUCCESS] PostgreSQL connection ready!")

## ClickHouse Connection

In [None]:
import clickhouse_connect
import pandas as pd

# Client for host 'clickhouse' on port 8123; credentials and database name are
# taken from the environment with the same defaults as the setup cell.
ch_client = clickhouse_connect.get_client(
    host='clickhouse', port=8123,
    username=os.getenv('CLICKHOUSE_USER', 'admin'),
    password=os.getenv('CLICKHOUSE_PASSWORD', 'admin'),
    database=os.getenv('CLICKHOUSE_DB', 'analytics'),
)

# Run a one-row query, then label the returned rows with the result's column
# names to obtain a DataFrame.
result = ch_client.query("SELECT 'Hello ClickHouse!' as message, now() as timestamp")
df_ch = pd.DataFrame(data=result.result_rows, columns=result.column_names)
print("ClickHouse Result:", df_ch, sep="\n")

print("[SUCCESS] ClickHouse connection ready!")

## MinIO S3 Connection

In [5]:
import boto3
from io import StringIO

# S3 client pointed at the MinIO endpoint configured in the setup cell.
s3_client = boto3.client(
    's3', endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=MINIO_ACCESS_KEY, aws_secret_access_key=MINIO_SECRET_KEY,
)

# Listing buckets doubles as the connectivity check.
buckets = s3_client.list_buckets()
print(f"Available buckets: {[bucket['Name'] for bucket in buckets['Buckets']]}")

print("[SUCCESS] MinIO S3 connection ready!")

## Trino SQL Engine

In [None]:
from trino.dbapi import connect as trino_connect
import pandas as pd

# Shared Trino connection used by query_trino(); defaults to the
# system.runtime catalog/schema.
trino_conn = trino_connect(host='trino', port=8080, user='admin', catalog='system', schema='runtime')

# Query function
def query_trino(sql):
    """Run *sql* on the shared Trino connection and return the rows as a DataFrame.

    Args:
        sql: Statement text handed to ``cursor.execute``.

    Returns:
        A pandas DataFrame whose columns are named after the first field of
        each ``cursor.description`` entry.

    Raises:
        Whatever the cursor raises while executing or fetching; the cursor is
        closed before the exception propagates.
    """
    cursor = trino_conn.cursor()
    try:
        cursor.execute(sql)
        columns = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
    finally:
        # Always release the cursor, including when execute/fetch fails, so
        # repeated calls from the notebook do not accumulate open cursors.
        cursor.close()
    return pd.DataFrame(data, columns=columns)

# Smoke test: list the catalogs visible through the connection.
catalogs_df = query_trino("SHOW CATALOGS")
print("Available Catalogs:", catalogs_df, sep="\n")

print("[SUCCESS] Trino connection ready!")

## Redis Cache

In [None]:
import redis
import json

# Redis client for host 'redis'; decode_responses=True makes reads return str
# instead of bytes.
r = redis.Redis(
    host='redis',
    port=6379,
    decode_responses=True,
)

# Cache functions
def cache_dataframe(key, df, expire_seconds=3600):
    """Store *df* in Redis under *key* as record-oriented JSON.

    Args:
        key: Redis key to write.
        df: DataFrame to serialise with ``to_json(orient='records')``.
        expire_seconds: Time-to-live passed to ``SETEX`` (default 3600).
    """
    r.setex(key, expire_seconds, df.to_json(orient='records'))
    print(f"DataFrame cached with key: {key}")

def get_cached_dataframe(key):
    """Load a DataFrame previously stored by ``cache_dataframe``.

    Args:
        key: Redis key to read.

    Returns:
        The DataFrame parsed from the record-oriented JSON stored under
        *key*, or ``None`` when the key is missing or holds an empty value.
    """
    # Imported locally so this helper does not depend on another cell's imports.
    from io import StringIO

    json_data = r.get(key)
    if not json_data:
        return None
    # Hand pandas a file-like object: passing the JSON text itself to
    # read_json is deprecated in recent pandas releases.
    return pd.read_json(StringIO(json_data), orient='records')

# Smoke test: write a value and read it straight back.
r.set('test:message', 'Hello from Redis!')
message = r.get('test:message')
print("Cached message:", message)

print("[SUCCESS] Redis connection ready!")

## Kafka Connection

In [None]:
from kafka import KafkaProducer, KafkaConsumer
import json
from datetime import datetime

# Shared producer: every value is JSON-encoded and sent as UTF-8 bytes.
producer = KafkaProducer(
    bootstrap_servers=[KAFKA_SERVERS],
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
)

# Send message function
def send_message(topic, message):
    """Publish *message* to *topic*, wrapped with the current local timestamp.

    Blocks for up to 10 seconds waiting for the send to complete.

    Args:
        topic: Destination topic name.
        message: Payload stored under the ``'message'`` key.

    Returns:
        The record metadata returned by the send future.
    """
    payload = {'timestamp': datetime.now().isoformat(), 'message': message}
    metadata = producer.send(topic, payload).get(timeout=10)
    print(f"Message sent to {topic}: partition {metadata.partition}, offset {metadata.offset}")
    return metadata

# Consumer setup
def create_consumer(topic, group_id='jupyter-consumer'):
    """Create a KafkaConsumer subscribed to *topic*.

    Args:
        topic: Topic to subscribe to.
        group_id: Consumer group name (default ``'jupyter-consumer'``).

    Returns:
        A KafkaConsumer that JSON-decodes each message value and is
        configured with ``auto_offset_reset='latest'``.
    """
    def _decode_json(raw):
        # Inverse of the producer's serializer: UTF-8 bytes -> Python object.
        return json.loads(raw.decode('utf-8'))

    options = {
        'bootstrap_servers': [KAFKA_SERVERS],
        'group_id': group_id,
        'value_deserializer': _decode_json,
        'auto_offset_reset': 'latest',
    }
    return KafkaConsumer(topic, **options)

print("[SUCCESS] Kafka producer/consumer ready!")

## Connection Status Check

In [None]:
def check_all_connections():
    """Probe PostgreSQL, ClickHouse, MinIO and Spark and print a summary.

    Each probe is best-effort: any ``Exception`` (including a NameError when
    the corresponding connection cell has not been run) marks the service as
    ``'[ERROR]'``. Interrupts such as ``KeyboardInterrupt`` are not swallowed.

    Returns:
        dict mapping service name to a status string (``'[SUCCESS]'``,
        ``'[ERROR]'`` or ``'[ERROR] (Not initialized)'`` for Spark).
    """
    # Lambdas defer the global lookups so a missing client fails inside the try.
    probes = [
        ('PostgreSQL', lambda: pd.read_sql("SELECT 1", pg_engine)),
        ('ClickHouse', lambda: ch_client.query("SELECT 1")),
        ('MinIO', lambda: s3_client.list_buckets()),
    ]

    status = {}
    for service, probe in probes:
        try:
            probe()
            status[service] = '[SUCCESS]'
        except Exception:
            status[service] = '[ERROR]'

    # Spark is optional: distinguish "never created" from "created but failing".
    try:
        if 'spark' in globals() and spark is not None:
            spark.sql("SELECT 1").collect()
            status['Spark'] = '[SUCCESS]'
        else:
            status['Spark'] = '[ERROR] (Not initialized)'
    except Exception:
        status['Spark'] = '[ERROR]'

    print("Connection Status:")
    for service, stat in status.items():
        print(f"  {stat} {service}")

    successful = sum(1 for s in status.values() if '[SUCCESS]' in s)
    total = len(status)
    print(f"\nOverall: {successful}/{total} services connected ({successful/total*100:.1f}%)")

    return status

check_all_connections()

In [None]:
def check_all_connections():
    """Probe every Data Forge service and print a status summary.

    Services checked, in order: PostgreSQL, ClickHouse, MinIO, Redis, Kafka,
    Trino and Spark. Each probe is best-effort: any ``Exception`` (including
    a NameError when the corresponding connection cell has not been run)
    marks the service as ``'[ERROR]'`` and prints the first 50 characters of
    the error message.

    Returns:
        dict mapping service name to a status string (``'[SUCCESS]'``,
        ``'[ERROR]'`` or ``'[ERROR] (Not initialized)'`` for Spark).
    """
    def _kafka_probe():
        # bootstrap_connected() reports the state through its return value
        # rather than by raising, so a False result must be turned into a
        # failure explicitly.
        if not producer.bootstrap_connected():
            raise ConnectionError("bootstrap servers not connected")

    # Lambdas defer the global lookups so a missing client fails inside the try.
    probes = [
        ('PostgreSQL', lambda: pd.read_sql("SELECT 1", pg_engine)),
        ('ClickHouse', lambda: ch_client.query("SELECT 1")),
        ('MinIO', lambda: s3_client.list_buckets()),
        ('Redis', lambda: r.ping()),
        ('Kafka', _kafka_probe),
        ('Trino', lambda: query_trino("SELECT 1 as test")),
    ]

    status = {}
    for service, probe in probes:
        try:
            probe()
            status[service] = '[SUCCESS]'
        except Exception as e:
            status[service] = '[ERROR]'
            print(f"   {service} error: {str(e)[:50]}...")

    # Spark is optional: distinguish "never created" from "created but failing".
    try:
        if 'spark' in globals() and spark is not None:
            spark.sql("SELECT 1").collect()
            status['Spark'] = '[SUCCESS]'
        else:
            status['Spark'] = '[ERROR] (Not initialized)'
    except Exception as e:
        status['Spark'] = '[ERROR]'
        print(f"   Spark error: {str(e)[:50]}...")

    print("Connection Status:")
    for service, stat in status.items():
        print(f"  {stat} {service}")

    successful = sum(1 for s in status.values() if '[SUCCESS]' in s)
    total = len(status)
    print(f"\nOverall: {successful}/{total} services connected ({successful/total*100:.1f}%)")

    return status

check_all_connections()

## Ready-to-Use Code Snippets

### Load data from PostgreSQL to Spark
```python
# The PostgreSQL JDBC URL format is jdbc:postgresql://host:port/database and
# does not take "user:password@", so build a credential-free URL here and
# pass the credentials through the separate user/password options.
df_spark = spark.read \
    .format("jdbc") \
    .option("url", f"jdbc:postgresql://postgres:5432/{os.getenv('POSTGRES_DB', 'metastore')}") \
    .option("dbtable", "your_table") \
    .option("user", os.getenv('POSTGRES_USER', 'admin')) \
    .option("password", os.getenv('POSTGRES_PASSWORD', 'admin')) \
    .load()
```

### Save Spark DataFrame to ClickHouse
```python
# Convert Spark DF to Pandas then to ClickHouse
pandas_df = spark_df.toPandas()
ch_client.insert_df('your_table', pandas_df)
```

### Stream data with Kafka
```python
# Send data
send_message('your-topic', {'key': 'value'})

# Consume data
consumer = create_consumer('your-topic')
for message in consumer:
    print(message.value)
    break  # Process one message
```

### Cache DataFrame in Redis
```python
# Cache a DataFrame
cache_dataframe('my_data', df, expire_seconds=3600)

# Retrieve cached DataFrame
cached_df = get_cached_dataframe('my_data')
```

### Query across multiple databases with Trino
```python
# Query data from different sources
query = '''
SELECT pg.*, ch.analytics_column 
FROM postgresql.public.user_data pg
JOIN clickhouse.analytics.user_events ch 
ON pg.user_id = ch.user_id
'''
result_df = query_trino(query)
```