In [2]:
# Project Setup and Imports
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import json
import requests
import time
import os
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional, Union
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import warnings
from dotenv import load_dotenv
import glob

# Silence library warnings so cell output stays readable.
warnings.filterwarnings(action='ignore')

# Display every column of a frame, but cap the number of rows shown at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Seed NumPy's global RNG so random draws repeat across runs.
np.random.seed(42)

# Log at INFO and above, formatted as "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

print(f"Starting gathering more features at {datetime.now()}")

Starting gathering more features at 2025-10-01 13:16:01.752002


In [8]:
# Constants and Configuration
# The project root may be overridden through the SHORT_INTEREST_PROJECT_ROOT
# environment variable so the notebook can run outside the author's machine.
# When the variable is unset or empty, the original location is used.
_DEFAULT_PROJECT_ROOT = '/Users/joshuacroppenstedt/Desktop/Work/JP Morgan/short_interest_prediction'
PROJECT_ROOT = Path(os.environ.get('SHORT_INTEREST_PROJECT_ROOT') or _DEFAULT_PROJECT_ROOT)
DATA_DIR = PROJECT_ROOT / 'data'
CACHE_DIR = DATA_DIR / 'cache'
PRICES_DIR = DATA_DIR / 'prices'
FINRA_DIR = DATA_DIR / 'finra_clean'
FEATURES_DIR = DATA_DIR / 'features'

# Ensure the cache directories exist (one sub-directory per API provider).
# The other data directories are not created here.
for dir_path in [CACHE_DIR, CACHE_DIR / 'alphavantage', CACHE_DIR / 'finnhub', CACHE_DIR / 'eodhd']:
    dir_path.mkdir(parents=True, exist_ok=True)

# Trading day windows (approximations)
WINDOWS = [10, 21, 42, 63, 126, 252]  # 2w, 1m, 2m, 3m, 6m, 12m
WINDOW_NAMES = ['2w', '1m', '2m', '3m', '6m', '12m']
# zip() silently truncates to the shorter list, so fail loudly if the two
# lists ever drift out of sync.
if len(WINDOWS) != len(WINDOW_NAMES):
    raise ValueError("WINDOWS and WINDOW_NAMES must have the same length")
WINDOW_MAPPING = dict(zip(WINDOWS, WINDOW_NAMES))

# Global settings
USE_CACHE = True
MAX_WORKERS = 4  # For parallel API calls
RATE_LIMIT_DELAY = 0.1  # Base delay between API calls

# Generate timestamp for output files (minute resolution)
TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M')

print("Configuration:")
print(f"  - Project root: {PROJECT_ROOT}")
print(f"  - Windows: {WINDOWS}")
print(f"  - Use cache: {USE_CACHE}")
print(f"  - Timestamp: {TIMESTAMP}")

Configuration:
  - Project root: /Users/joshuacroppenstedt/Desktop/Work/JP Morgan/short_interest_prediction
  - Windows: [10, 21, 42, 63, 126, 252]
  - Use cache: True
  - Timestamp: 20251001_1319


In [9]:
# Load API Keys
# Credentials are read from the .env file in the notebooks directory.
load_dotenv(PROJECT_ROOT / 'notebooks' / '.env')

API_KEYS = dict(
    alphavantage=os.getenv('ALPHAVANTAGE_API_KEY'),
    finnhub=os.getenv('FINNHUB_API_KEY'),
    eodhd=os.getenv('EODHD_API_KEY'),
)

# Report which providers have no key configured; key values are never printed.
missing_keys = [provider for provider in API_KEYS if not API_KEYS[provider]]
if not missing_keys:
    logger.info("All API keys loaded successfully")
else:
    logger.warning(f"Missing API keys: {missing_keys}")

print(f"API keys status: {[provider + ('✓' if key else '✗') for provider, key in API_KEYS.items()]}")

2025-10-01 13:19:54,501 - INFO - All API keys loaded successfully


API keys status: ['alphavantage✓', 'finnhub✓', 'eodhd✓']


In [11]:
# Load shares outstanding data to get ticker universe
print("Loading shares outstanding data...")

# Find the available shares outstanding files
shares_outstanding_files = list(FEATURES_DIR.glob("shares_outstanding_multiindex_corrected_*.parquet"))
if not shares_outstanding_files:
    raise FileNotFoundError("No shares outstanding data files found. Please run the price cleaning notebook first.")

# Use the most recently modified file (by filesystem mtime)
shares_outstanding_file = max(shares_outstanding_files, key=lambda x: x.stat().st_mtime)
print(f"Loading: {shares_outstanding_file.name}")

# Load the shares outstanding data
shares_outstanding_data = pd.read_parquet(shares_outstanding_file)

# Validate the column layout before reporting anything derived from the frame:
# the ticker universe is taken from level 0 of the column MultiIndex.
if not isinstance(shares_outstanding_data.columns, pd.MultiIndex):
    raise ValueError("Expected MultiIndex columns in shares outstanding data")

# Store date range for feature alignment
feature_start_date = shares_outstanding_data.index.min()
feature_end_date = shares_outstanding_data.index.max()

print(f"Shares outstanding data shape: {shares_outstanding_data.shape}")
print(f"Date range: {feature_start_date} to {feature_end_date}")

# Extract ticker list
tickers = sorted(shares_outstanding_data.columns.get_level_values(0).unique().tolist())

print(f"Total tickers: {len(tickers)}")
print(f"Sample tickers: {tickers[:10]}")
print(f"Feature date range: {feature_start_date} to {feature_end_date}")

Loading shares outstanding data...
Loading: shares_outstanding_multiindex_corrected_20250929_204539.parquet
Unified price data shape: (1803, 2273)
Date range: 2008-03-28 00:00:00 to 2025-08-24 00:00:00
Total tickers: 2273
Sample tickers: ['A', 'AA', 'AAL', 'AAME', 'AAOI', 'AAON', 'AAP', 'AAPL', 'AAT', 'ABBV']
Feature date range: 2008-03-28 00:00:00 to 2025-08-24 00:00:00


In [80]:
# NOTE(review): APIClient is not used in the cells visible here; the sentiment
# helper in this cell talks to the REST endpoint through `requests` instead.
from eodhd import APIClient

# EODHD token, read as a module-level global by get_sentiment_data.
api_token = API_KEYS['eodhd']

def get_sentiment_data(ticker, from_date, to_date, token=None, timeout=30, session=None):
    """Fetch the raw sentiment payload for one ticker from the EODHD API.

    Args:
        ticker: Symbol as passed to the API's ``s`` parameter, e.g. 'TSLA.US'.
        from_date: Start of the query window, e.g. '2017-12-01'.
        to_date: End of the query window, e.g. '2018-01-10'.
        token: EODHD API token. When None, the module-level ``api_token``
            is used.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.
        session: Optional object exposing ``get(url, params=..., timeout=...)``
            (for example a ``requests.Session`` to reuse connections across
            many tickers). When None, the ``requests`` module is used.

    Returns:
        The decoded JSON response, or None when the server answers with a
        status other than 200 (the status and body are printed in that case).
        A sample response is a dict keyed by ticker whose value is a list of
        records with 'date', 'count' and 'normalized' fields; callers wanting
        a DataFrame can build one from ``payload[ticker]``.
    """
    if token is None:
        # Fall back to the notebook-level token defined next to this function.
        token = api_token
    http = requests if session is None else session

    url = 'https://eodhd.com/api/sentiments'
    query = {'api_token': token, 's': ticker, 'from': from_date, 'to': to_date, 'fmt': 'json'}
    response = http.get(url, params=query, timeout=timeout)
    if response.status_code != 200:
        print(f"Error retrieving sentiment data: {response.status_code}")
        print(response.text)
        return None
    return response.json()
# Smoke test on a single ticker and a short window.
# NOTE(review): despite the `_df` suffix, the value is whatever
# get_sentiment_data returns (parsed JSON, or None on an error status),
# not a DataFrame.
sentiment_df = get_sentiment_data('TSLA.US', '2017-12-01', '2018-01-10')

In [81]:
# Inspect the raw response from the smoke test (a dict keyed by ticker in the recorded run).
sentiment_df

{'TSLA.US': [{'date': '2018-01-03', 'count': 1, 'normalized': 0.994}]}

In [40]:
# Use the %pip magic so the package lands in the running kernel's environment,
# and pin the version so re-runs install the same release.
%pip install finnhub-python==2.4.25

Collecting finnhub-python
  Downloading finnhub_python-2.4.25-py3-none-any.whl.metadata (9.2 kB)
Downloading finnhub_python-2.4.25-py3-none-any.whl (11 kB)
Installing collected packages: finnhub-python
Successfully installed finnhub-python-2.4.25
Note: you may need to restart the kernel to use updated packages.


In [68]:
import finnhub

# Finnhub client authenticated with the key loaded into API_KEYS.
finnhub_client = finnhub.Client(api_key=API_KEYS['finnhub'])

# The window must run from the earlier date to the later one; a `_from` that is
# later than `to` describes an empty range.
print(finnhub_client.stock_social_sentiment('MSFT', _from='2021-04-01', to='2021-06-01'))

{'data': [], 'symbol': 'MSFT'}
