# In [None]:
import hopsworks
import pandas as pd
import numpy as np
import random
import json
import time
import threading
import logging
from datetime import datetime, timedelta
from collections import deque
from faker import Faker
from geopy.distance import geodesic
from confluent_kafka import Producer

# --- Logging setup ---
# Configure the root logger once at import time; every record is rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Random seeds for reproducibility ---
# Pin the three random sources seeded here (stdlib random, NumPy, Faker) to
# the same fixed value so repeated runs draw the same sequences.
fake = Faker()
random.seed(42)
np.random.seed(42)
Faker.seed(42)

# Avro record schema registered for the transactions topic.
# Field order is significant and is kept exactly: t_id, cc_num, merchant_id,
# amount, ip_address, card_present, ts.
avro_schema = {
    "type": "record",
    "name": "credit_card_transactions",
    "namespace": "com.example",
    "doc": "Details about credit card transactions.",
    "fields": [
        # Identifiers
        {"name": "t_id", "type": "string",
         "doc": "Unique identifier for this credit card transaction (primary key)."},
        {"name": "cc_num", "type": "string",
         "doc": "Foreign key to credit card."},
        {"name": "merchant_id", "type": "string",
         "doc": "Foreign key to a merchant."},
        # Amount is a fixed-point decimal (10 digits, 2 after the point).
        {"name": "amount",
         "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2},
         "doc": "Credit card transaction amount."},
        # Context of the payment
        {"name": "ip_address", "type": "string",
         "doc": "IP address of the merchant. Format: XXX.XXX.XXX.XXX"},
        {"name": "card_present", "type": "boolean",
         "doc": "Credit card was used in a physical terminal (true) or online payment (false)."},
        # Event time, milliseconds since the epoch.
        {"name": "ts",
         "type": {"type": "long", "logicalType": "timestamp-millis"},
         "doc": "Timestamp for this credit card transaction."},
    ],
}

# --- Helpers ---
def generate_distant_ip_pairs(min_distance_km=1000):
    """Return pairs of IP addresses whose associated cities are far apart.

    Each entry of the built-in table couples a fixed IP literal with a city's
    latitude/longitude. Every unordered pair of entries is measured with
    ``geopy.distance.geodesic`` and kept when the cities are more than
    ``min_distance_km`` kilometres apart.

    Args:
        min_distance_km: Exclusive lower bound, in kilometres, on the distance
            between the two cities of a returned pair. Defaults to 1000.

    Returns:
        A list of ``(ip_a, ip_b, distance_km)`` tuples, ordered by the position
        of the first and then the second city in the table.
    """
    cities = [
        ("1.1.1.1", 40.7128, -74.0060),          # New York
        ("80.80.80.80", 34.0522, -118.2437),     # Los Angeles
        ("120.120.120.120", 51.5074, -0.1278),   # London
        ("160.160.160.160", 35.6762, 139.6503),  # Tokyo
        ("200.200.200.200", -33.8688, 151.2093),  # Sydney
        ("50.50.50.50", 41.8781, -87.6298),      # Chicago
        ("90.90.90.90", 48.8566, 2.3522),        # Paris
        ("130.130.130.130", 55.7558, 37.6173),   # Moscow
        ("170.170.170.170", -22.9068, -43.1729),  # Rio de Janeiro
        ("210.210.210.210", 31.2304, 121.4737),  # Shanghai
    ]

    distant_pairs = []
    for index, (ip_a, lat_a, lon_a) in enumerate(cities):
        # Only look at later entries so each unordered pair is measured once.
        for ip_b, lat_b, lon_b in cities[index + 1:]:
            distance = geodesic((lat_a, lon_a), (lat_b, lon_b)).kilometers
            if distance > min_distance_km:
                distant_pairs.append((ip_a, ip_b, distance))
    return distant_pairs


# --- Main generator class ---
class ContinuousTransactionGenerator:
    """Stream synthetic credit card transactions, with occasional fraud, to Kafka.

    Card numbers and merchant IDs are read once from version 1 of the
    ``card_details`` and ``merchant_details`` feature groups. After that,
    ``run_continuous_client`` builds a batch about once per second and
    publishes each transaction as a JSON document to ``self.topic``.

    Fraud cases that consist of several transactions are not sent at once: the
    first transaction is emitted immediately and the remaining ones wait in
    ``self.pending_attacks`` until their own ``ts`` has passed.

    NOTE(review): every message carries a ``fraud_type`` key, a float
    ``amount`` and an ISO-8601 string ``ts``; confirm that consumers of the
    topic expect this JSON shape.
    """

    def __init__(self, project, topic="cc_transactions"):
        """Load reference data and create the Kafka producer.

        Args:
            project: Hopsworks project handle; must provide
                ``get_feature_store()`` and ``get_kafka_api()``.
            topic: Name of the Kafka topic to publish to.
        """
        self.project = project

        # Load valid card + merchant IDs from feature store
        fs = project.get_feature_store()
        logger.info("Loading valid card numbers and merchant IDs...")
        card_details_fg = fs.get_feature_group("card_details", version=1)
        merchant_details_fg = fs.get_feature_group("merchant_details", version=1)

        card_df = card_details_fg.read()
        self.card_numbers = card_df['cc_num'].tolist()

        merchant_df = merchant_details_fg.read()
        self.merchant_ids = merchant_df['merchant_id'].tolist()

        logger.info(f"Loaded {len(self.card_numbers)} card numbers and {len(self.merchant_ids)} merchant IDs")

        # Kafka setup
        kafka_api = project.get_kafka_api()
        kafka_conf = kafka_api.get_default_config()
        self.producer = Producer(kafka_conf)
        self.topic = topic

        # State
        self.next_t_id = 1             # first transaction id handed out
        self.transaction_counter = 0   # transactions created so far
        self.fraud_counter = 0         # fraud cases created so far (used in logs)
        self.distant_pairs = generate_distant_ip_pairs()
        self.pending_attacks = deque()  # follow-up fraud transactions not yet due
        self.running = False

        # Statistics
        self.start_time = None
        self.total_transactions = 0
        self.total_fraud_cases = 0

    # --- Internal helpers ---
    def _next_transaction_id(self):
        """Return the next transaction id as a string and advance the counter."""
        t_id = str(self.next_t_id + self.transaction_counter)
        self.transaction_counter += 1
        return t_id

    @staticmethod
    def _random_ip():
        """Return a random dotted-quad string whose first octet is at least 1."""
        return f"{random.randint(1,255)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(0,255)}"

    @staticmethod
    def _log_delivery_failure(err, msg):
        """Kafka delivery callback: log messages the broker did not accept."""
        if err is not None:
            logger.error(f"Kafka delivery failed for topic {msg.topic()}: {err}")

    # --- Transaction generators ---
    def generate_single_transaction(self):
        """Return one ordinary transaction dict stamped with the current time.

        Card and merchant are drawn uniformly from the loaded lists, the amount
        from a log-normal distribution rounded to two decimals, and
        ``fraud_type`` is ``None``.
        """
        return {
            "t_id": self._next_transaction_id(),
            "cc_num": random.choice(self.card_numbers),
            "merchant_id": random.choice(self.merchant_ids),
            "amount": round(np.random.lognormal(3.5, 1.2), 2),
            "ip_address": self._random_ip(),
            "card_present": random.choice([True, False]),
            "ts": datetime.now().isoformat(),
            "fraud_type": None
        }

    def generate_fraud_case(self):
        """Create one fraud case and return the transactions to send right away.

        One of three patterns is picked at random:

        * ``geographic`` - the same card is used from two distant IPs 5-30
          minutes apart; the second transaction is queued.
        * ``chain`` - 5-15 small transactions on one card; all but the first
          are queued.
        * ``single`` - one card-not-present transaction of 1000-5000.

        Returns:
            A list containing the single transaction to emit immediately.
            Follow-up transactions are appended to ``self.pending_attacks``.
        """
        self.fraud_counter += 1
        fraud_type = random.choice(['geographic', 'chain', 'single'])

        if fraud_type == 'geographic':
            card_num = random.choice(self.card_numbers)
            ip1, ip2, distance = random.choice(self.distant_pairs)
            fraud_time = datetime.now()

            txn1 = {
                "t_id": self._next_transaction_id(),
                "cc_num": card_num,
                "merchant_id": random.choice(self.merchant_ids),
                "amount": round(np.random.lognormal(3.5, 1.2), 2),
                "ip_address": ip1,
                "card_present": True,
                "ts": fraud_time.isoformat(),
                "fraud_type": "geographic"
            }

            txn2 = {
                "t_id": self._next_transaction_id(),
                "cc_num": card_num,
                "merchant_id": random.choice(self.merchant_ids),
                "amount": round(np.random.lognormal(3.5, 1.2), 2),
                "ip_address": ip2,
                "card_present": True,
                "ts": (fraud_time + timedelta(minutes=random.randint(5, 30))).isoformat(),
                "fraud_type": "geographic"
            }

            self.pending_attacks.append(txn2)
            logger.info(f"Generated geographic fraud case #{self.fraud_counter}: {distance:.0f} km apart")
            return [txn1]

        elif fraud_type == 'chain':
            card_num = random.choice(self.card_numbers)
            chain_size = random.randint(5, 15)
            chain_start = datetime.now()

            transactions = []
            for j in range(chain_size):
                txn = {
                    "t_id": self._next_transaction_id(),
                    "cc_num": card_num,
                    "merchant_id": random.choice(self.merchant_ids),
                    "amount": round(random.uniform(10.00, 99.99), 2),
                    "ip_address": self._random_ip(),
                    "card_present": random.choice([True, False]),
                    # The offset is j times a fresh random step, so later links
                    # of the chain can carry an earlier timestamp than earlier
                    # ones; the pending queue is therefore not sorted by ts.
                    "ts": (chain_start + timedelta(minutes=j * random.randint(2, 10))).isoformat(),
                    "fraud_type": "chain"
                }
                if j == 0:
                    transactions.append(txn)
                else:
                    self.pending_attacks.append(txn)

            logger.info(f"Generated chain fraud case #{self.fraud_counter}: {chain_size} transactions")
            return transactions

        else:  # single high-value
            txn = {
                "t_id": self._next_transaction_id(),
                "cc_num": random.choice(self.card_numbers),
                "merchant_id": random.choice(self.merchant_ids),
                "amount": round(random.uniform(1000.00, 5000.00), 2),
                "ip_address": self._random_ip(),
                "card_present": False,
                "ts": datetime.now().isoformat(),
                "fraud_type": "single"
            }
            logger.info(f"Generated single high-value fraud case #{self.fraud_counter}: ${txn['amount']}")
            return [txn]

    def get_pending_transactions(self):
        """Remove and return every queued fraud transaction that is now due.

        The queue is not ordered by timestamp (chain offsets are not monotonic
        and separate fraud cases interleave), so the whole queue is examined
        rather than only its head; otherwise a later-dated head would hold
        back transactions that are already due.

        Returns:
            The due transactions, oldest ``ts`` first. Transactions whose
            ``ts`` is still in the future stay queued in their original order.
        """
        current_time = datetime.now()
        ready = []
        still_waiting = deque()
        for txn in self.pending_attacks:
            if datetime.fromisoformat(txn['ts']) <= current_time:
                ready.append(txn)
            else:
                still_waiting.append(txn)
        self.pending_attacks = still_waiting
        ready.sort(key=lambda txn: datetime.fromisoformat(txn['ts']))
        return ready

    def generate_transaction_batch(self, target_count=10):
        """Build one batch of due fraud follow-ups, maybe a new fraud case, and filler.

        All due pending transactions are always included, even if that makes
        the batch larger than ``target_count``. With probability 1/1000 (and
        only if there is room) a new fraud case is started. The batch is then
        topped up with ordinary transactions to ``target_count``.

        Args:
            target_count: Desired batch size.

        Returns:
            The list of transaction dicts for this batch.
        """
        transactions = []
        transactions.extend(self.get_pending_transactions())
        remaining = target_count - len(transactions)

        if remaining > 0 and random.randint(1, 1000) == 1:
            fraud_txns = self.generate_fraud_case()
            transactions.extend(fraud_txns)
            self.total_fraud_cases += 1

        # A negative remainder simply produces no filler transactions.
        remaining = target_count - len(transactions)
        for _ in range(remaining):
            transactions.append(self.generate_single_transaction())

        self.total_transactions += len(transactions)
        return transactions

    # --- Kafka output ---
    def insert_transactions(self, transactions):
        """Publish the transactions to Kafka as UTF-8 JSON and wait for delivery.

        Errors are logged and swallowed so that one failed batch does not stop
        the generator. Per-message delivery failures reported by the broker
        are logged through the delivery callback while ``flush`` runs.
        """
        if not transactions:
            return
        try:
            for txn in transactions:
                value = json.dumps(txn).encode("utf-8")
                self.producer.produce(self.topic, value=value,
                                      on_delivery=self._log_delivery_failure)
            self.producer.flush()
            logger.debug(f"Produced {len(transactions)} transactions to Kafka")
        except Exception as e:
            logger.error(f"Error producing to Kafka: {e}")

    # --- Main loop ---
    def run_continuous_client(self):
        """Generate and publish one batch of 10 per second until interrupted.

        A daemon thread logs throughput statistics every 30 seconds. The loop
        ends on ``KeyboardInterrupt`` or when ``self.running`` is set to False.
        """
        logger.info("Starting continuous generator (Kafka output)")
        self.running = True
        self.start_time = datetime.now()

        def print_stats():
            while self.running:
                time.sleep(30)
                # Re-check after the sleep: the generator may have stopped.
                if self.running:
                    runtime = (datetime.now() - self.start_time).total_seconds()
                    rate = self.total_transactions / runtime if runtime > 0 else 0
                    logger.info(f"Stats — Txns: {self.total_transactions}, Frauds: {self.total_fraud_cases}, "
                                f"Rate: {rate:.2f} txn/sec, Pending: {len(self.pending_attacks)}")

        threading.Thread(target=print_stats, daemon=True).start()

        try:
            while self.running:
                start = time.time()
                txns = self.generate_transaction_batch(10)
                self.insert_transactions(txns)
                elapsed = time.time() - start
                # Sleep only for what is left of the one-second slot.
                time.sleep(max(0, 1.0 - elapsed))
        except KeyboardInterrupt:
            logger.info("Stopping generator...")
        finally:
            self.running = False
            logger.info("Stopped transaction generator.")


# --- Entrypoint ---
def main():
    """Log in to Hopsworks, ensure the Kafka topic exists, and run the generator.

    The topic is named ``<project name>_credit_card_transactions``. If no topic
    with that name exists yet, the Avro schema is registered under the subject
    ``cc_transactions_<project id>`` and the topic is created from it. The
    generator is then pointed at this topic and runs until interrupted.
    """
    project = hopsworks.login()
    kafka_api = project.get_kafka_api()
    topic_name = f"{project.name}_credit_card_transactions"
    existing_topics = {topic.name for topic in kafka_api.get_topics()}
    if topic_name not in existing_topics:
        schema = kafka_api.create_schema("cc_transactions_" + str(project.id), avro_schema)
        # NOTE(review): the third argument looks like the schema version - confirm.
        kafka_topic = kafka_api.create_topic(topic_name, schema.subject, 1)
        print(f"Created topic: {kafka_topic.name} with schema: {schema.subject}")

    generator = ContinuousTransactionGenerator(project)
    # Publish to the topic ensured above rather than the generator's built-in
    # default name, which this function never checks or creates.
    generator.topic = topic_name
    generator.run_continuous_client()


if __name__ == "__main__":
    main()
