In [4]:
from alpaca.data.live import StockDataStream
from datetime import datetime, timedelta
import pytz
import pandas as pd
import nest_asyncio
import asyncio
import os

# Applica patch per notebook o ambienti embedded (opzionale in PyCharm)
nest_asyncio.apply()

# Configura le API Alpaca
API_KEY = "AKMFU3E7GI5IL5C2BI3V"
API_SECRET = "fDrBXt5omT71frwtJrrwlL5fgiiCim6gwrlJ6RpQ"

# Legge tickers S&P500
url = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents.csv"
tic = pd.read_csv(url)
tickers = tic['Symbol'].tolist()
top_30_tickers = [
    "AAPL", "MSFT", "NVDA", "AMZN", "META", "BRK.B", "GOOGL", "AVGO", "TSLA", "GOOG",
    "LLY", "JPM", "V", "XOM", "NFLX", "COST", "UNH", "JNJ", "PG", "MA",
    "CVX", "MRK", "PEP", "ABBV", "ADBE", "WMT", "BAC", "HD", "KO", "TMO"
]
top_30_dict = {
    "AAPL": "Apple Inc.",
    "MSFT": "Microsoft Corp.",
    "NVDA": "NVIDIA Corp.",
    "AMZN": "Amazon.com Inc.",
    "META": "Meta Platforms Inc.",
    "BRK.B": "Berkshire Hathaway Inc.",
    "GOOGL": "Alphabet Inc. Class A",
    "AVGO": "Broadcom Inc.",
    "TSLA": "Tesla Inc.",
    "GOOG": "Alphabet Inc. Class C",
    "LLY": "Eli Lilly and Co.",
    "JPM": "JPMorgan Chase & Co.",
    "V": "Visa Inc.",
    "XOM": "Exxon Mobil Corp.",
    "NFLX": "Netflix Inc.",
    "COST": "Costco Wholesale Corp.",
    "UNH": "UnitedHealth Group Inc.",
    "JNJ": "Johnson & Johnson",
    "PG": "Procter & Gamble Co.",
    "MA": "Mastercard Inc.",
    "CVX": "Chevron Corp.",
    "MRK": "Merck & Co. Inc.",
    "PEP": "PepsiCo Inc.",
    "ABBV": "AbbVie Inc.",
    "ADBE": "Adobe Inc.",
    "WMT": "Walmart Inc.",
    "BAC": "Bank of America Corp.",
    "HD": "Home Depot Inc.",
    "KO": "Coca-Cola Co.",
    "TMO": "Thermo Fisher Scientific Inc."
}


# Dati raccolti
df_all_data = pd.DataFrame()

# Stream Alpaca
stream = StockDataStream(API_KEY, API_SECRET)

# Directory per salvataggi
os.makedirs("dati_stream", exist_ok=True)

# Fuso orario
italy_timezone = pytz.timezone("Europe/Rome")

# Ricezione trade
async def handle_trade_data(data):
    global df_all_data
    row = {
        "Ticker": data.symbol,
        "Timestamp": pd.to_datetime(data.timestamp, utc=True),
        "Price": data.price,
        "Size": data.size,
        "Exchange": data.exchange
    }
    df_all_data = pd.concat([df_all_data, pd.DataFrame([row])], ignore_index=True)
    print(df_all_data.tail(1))

# Salvataggio periodico
async def save_data_periodically(interval_sec=30):
    while True:
        await asyncio.sleep(interval_sec)
        if not df_all_data.empty:
            timestamp = datetime.now(italy_timezone).strftime("%Y%m%d_%H%M%S")
            path = f"dati_stream/stream_{timestamp}.csv"
            df_all_data.to_csv(path, index=False)
            print(f"💾 Salvato {len(df_all_data)} righe in {os.path.abspath(path)}")

# Funzione principale
async def main(duration_seconds=60):
    global df_all_data

    # Abbonati ai primi 30 ticker (Alpaca free limit)
    for symbol in top_30_tickers:
        stream.subscribe_trades(handle_trade_data, symbol)

    # Task salvataggio
    save_task = asyncio.create_task(save_data_periodically(interval_sec=20))

    # Stop dopo X secondi
    stop_time = datetime.now(italy_timezone) + timedelta(seconds=duration_seconds)

    # Avvia streaming
    stream_task = asyncio.create_task(stream.run())

    # Attesa
    while datetime.now(italy_timezone) < stop_time:
        await asyncio.sleep(1)

    # Stop streaming
    await stream.stop()

    # Ferma salvataggio periodico
    save_task.cancel()

    # Salvataggio finale
    if not df_all_data.empty:
        final_timestamp = datetime.now(italy_timezone).strftime("%Y%m%d_%H%M%S")
        final_path = f"dati_stream/stream_{final_timestamp}_FINAL.csv"
        df_all_data.to_csv(final_path, index=False)
        print(f"✅ Salvataggio finale con {len(df_all_data)} righe in {os.path.abspath(final_path)}")

    print("✅ Raccolta dati terminata.")

# Avvio dello script (solo in esecuzione diretta)
if __name__ == "__main__":
    asyncio.run(main(duration_seconds=60))


  Ticker                        Timestamp   Price    Size Exchange
0   NVDA 2025-05-09 13:33:08.232281+00:00  117.58  1038.0        V
  Ticker                        Timestamp   Price  Size Exchange
1    PEP 2025-05-09 13:33:08.241685+00:00  130.53  10.0        V
  Ticker                        Timestamp  Price   Size Exchange
2      V 2025-05-09 13:33:08.892952+00:00  353.2  100.0        V
  Ticker                        Timestamp   Price   Size Exchange
3     HD 2025-05-09 13:33:08.900270+00:00  362.85  100.0        V
  Ticker                        Timestamp   Price   Size Exchange
4     HD 2025-05-09 13:33:08.900399+00:00  362.89  100.0        V
  Ticker                        Timestamp   Price  Size Exchange
5  GOOGL 2025-05-09 13:33:09.067141+00:00  154.27  10.0        V
  Ticker                        Timestamp   Price  Size Exchange
6   GOOG 2025-05-09 13:33:09.086561+00:00  155.69  10.0        V
  Ticker                        Timestamp   Price  Size Exchange
7   META 2025-05-

TimeoutError: 

💾 Salvato 4222 righe in c:\Users\miklo\Desktop\Big data Finance\BDT-project\useless_trial\dati_stream\stream_20250509_153808.csv
💾 Salvato 4222 righe in c:\Users\miklo\Desktop\Big data Finance\BDT-project\useless_trial\dati_stream\stream_20250509_153828.csv
💾 Salvato 4222 righe in c:\Users\miklo\Desktop\Big data Finance\BDT-project\useless_trial\dati_stream\stream_20250509_153848.csv


In [None]:
from alpaca.data.live import StockDataStream
from datetime import datetime, timedelta
from kafka import KafkaProducer
import pytz
import pandas as pd
import nest_asyncio
import asyncio
import json

# Patch per ambienti embedded (notebook, PyCharm)
nest_asyncio.apply()

# Alpaca API config
API_KEY = "AKMFU3E7GI5IL5C2BI3V"
API_SECRET = "fDrBXt5omT71frwtJrrwlL5fgiiCim6gwrlJ6RpQ"

# Kafka config
KAFKA_BROKER = 'localhost:9092'
KAFKA_TOPIC = 'stock_trades'

# Kafka producer
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Lista dei 30 principali ticker S&P 500
top_30_tickers = [
    "AAPL", "MSFT", "NVDA", "AMZN", "META", "BRK.B", "GOOGL", "AVGO", "TSLA", "GOOG",
    "LLY", "JPM", "V", "XOM", "NFLX", "COST", "UNH", "JNJ", "PG", "MA",
    "CVX", "MRK", "PEP", "ABBV", "ADBE", "WMT", "BAC", "HD", "KO", "TMO"
]

# Fuso orario italiano
italy_timezone = pytz.timezone("Europe/Rome")

# Stream Alpaca
stream = StockDataStream(API_KEY, API_SECRET)

# Callback per ogni trade ricevuto
async def handle_trade_data(data):
    row = {
        "Ticker": data.symbol,
        "Timestamp": pd.to_datetime(data.timestamp, utc=True).isoformat(),
        "Price": data.price,
        "Size": data.size,
        "Exchange": data.exchange
    }

    # Stampa e invia su Kafka
    print("📤 Kafka:", row)
    producer.send(KAFKA_TOPIC, value=row)
    producer.flush()

# Funzione principale
async def main(duration_seconds=60):
    for symbol in top_30_tickers:
        stream.subscribe_trades(handle_trade_data, symbol)

    stop_time = datetime.now(italy_timezone) + timedelta(seconds=duration_seconds)
    stream_task = asyncio.create_task(stream.run())

    while datetime.now(italy_timezone) < stop_time:
        await asyncio.sleep(1)

    await stream.stop()
    print("✅ Streaming terminato.")

# Avvio script
if __name__ == "__main__":
    asyncio.run(main(duration_seconds=60))


ModuleNotFoundError: No module named 'kafka.vendor.six.moves'

: 

In [7]:
!pip install kafka-python




[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
