In [4]:
import numpy as np
import pandas as pd
import pywt
from scipy import signal
from sklearn.preprocessing import StandardScaler
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pytz
from datetime import datetime
import json

In [5]:
KEY = "XpqF6xBLLrj6WALk4SS1UlkgphXmHQec"

def convert_timestamp_est(timestamp):
    # Create a naive datetime object from the UNIX timestamp
    dt_naive = datetime.utcfromtimestamp(timestamp)
    # Convert the naive datetime object to a timezone-aware one (UTC)
    dt_utc = pytz.utc.localize(dt_naive)
    # Convert the UTC datetime to EST
    dt_est = dt_utc.astimezone(pytz.timezone('US/Eastern'))
    
    return dt_est

class CustomRetry(Retry):
    def is_retry(self, method, status_code, has_retry_after=False):
        """ Return True if we should retry the request, otherwise False. """
        if status_code != 200:
            return True
        return super().is_retry(method, status_code, has_retry_after)
    
def setup_session_retries(
    retries: int = 3,
    backoff_factor: float = 0.05,
    status_forcelist: tuple = (500, 502, 504),
):
    """
    Sets up a requests Session with retries.
    
    Parameters:
    - retries: Number of retries before giving up. Default is 3.
    - backoff_factor: A factor to use for exponential backoff. Default is 0.3.
    - status_forcelist: A tuple of HTTP status codes that should trigger a retry. Default is (500, 502, 504).

    Returns:
    - A requests Session object with retry configuration.
    """
    retry_strategy = CustomRetry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        allowed_methods=frozenset(["GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS"]),
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    return session

def execute_polygon_call(url):
    session = setup_session_retries()
    response = session.request("GET", url, headers={}, data={})
    return response 

def call_polygon(symbol, from_stamp, to_stamp, timespan, multiplier):
    hours = [9, 10, 11, 12, 13, 14, 15]
    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/{multiplier}/{timespan}/{from_stamp}/{to_stamp}?adjusted=true&sort=asc&limit=50000&apiKey={KEY}"

    response = execute_polygon_call(url)

    response_data = json.loads(response.text)
    results = response_data['results']
    results_df = pd.DataFrame(results)
    results_df['t'] = results_df['t'].apply(lambda x: int(x/1000))
    results_df['date'] = results_df['t'].apply(lambda x: convert_timestamp_est(x))
    results_df['hour'] = results_df['date'].apply(lambda x: x.hour)
    results_df['symbol'] = symbol
    results_df = results_df[results_df['hour'].isin(hours)]

    return results_df   

In [9]:
def generate_sample_data(n_days=60):
    times = pd.date_range(start='2023-01-01', periods=n_days*48, freq='30T')
    base = 1000 + np.random.normal(0, 50, len(times))
    trend = np.linspace(0, 500, len(times))
    daily_pattern = 200 * np.sin(np.pi * (times.hour * 60 + times.minute) / (24 * 60))
    weekly_pattern = 100 * np.sin(2 * np.pi * (times.dayofweek) / 7)
    volume = base + trend + daily_pattern + weekly_pattern
    return pd.Series(volume, index=times)

def extract_cycle(data, cycle_length):
    kernel = np.ones(cycle_length) / cycle_length
    extracted_cycle = np.convolve(data, kernel, mode='same')
    return extracted_cycle

def wavelet_transform(data, wavelet='db8', level=None):
    if level is None:
        level = pywt.dwt_max_level(len(data), pywt.Wavelet(wavelet).dec_len)
    coeffs = pywt.wavedec(data, wavelet, level=level)
    return coeffs

def compute_trend_strength(data, window_size):
    trend = extract_cycle(data, window_size)
    trend_strength = np.abs(data - trend) / np.std(data)
    return 1 / (1 + trend_strength)  # Normalize to [0, 1]

def compute_cycle_strength(data, cycle_length):
    cycle = extract_cycle(data, cycle_length)
    cycle_strength = np.std(cycle) / np.std(data)
    return cycle_strength

def compute_cycle_divergence(short_cycle, long_cycle):
    difference = short_cycle - long_cycle
    divergence = np.abs(difference) / np.std(short_cycle)
    return 1 / (1 + divergence)  # Normalize to [0, 1]

def compute_wavelet_energy(coeffs):
    energy = [np.sum(np.square(c)) for c in coeffs]
    total_energy = np.sum(energy)
    return np.array(energy) / total_energy

def rolling_wavelet_energy(data, window_size, wavelet='db8', level=None):
    energies = []
    max_length = 0
    for i in range(len(data)):
        window = data[max(0, i-window_size+1):i+1]
        coeffs = wavelet_transform(window, wavelet, level)
        energy = compute_wavelet_energy(coeffs)
        energies.append(energy)
        max_length = max(max_length, len(energy))
    
    # Pad shorter arrays with zeros
    padded_energies = [np.pad(e, (0, max_length - len(e)), 'constant') for e in energies]
    return np.array(padded_energies)

def build_features(volume_data, daily_cycle_length=48, weekly_cycle_length=48*7):
    # Extract cycles
    daily_cycle = extract_cycle(volume_data, daily_cycle_length)
    weekly_cycle = extract_cycle(volume_data, weekly_cycle_length)
    
    # Compute basic features
    daily_trend_strength = compute_trend_strength(volume_data, daily_cycle_length)
    weekly_trend_strength = compute_trend_strength(volume_data, weekly_cycle_length)
    daily_cycle_strength = compute_cycle_strength(volume_data, daily_cycle_length)
    weekly_cycle_strength = compute_cycle_strength(volume_data, weekly_cycle_length)
    cycle_divergence = compute_cycle_divergence(daily_cycle, weekly_cycle)
    
    # Compute rolling wavelet energy
    window_size = daily_cycle_length * 2  # Two days
    wavelet_energy = rolling_wavelet_energy(volume_data, window_size)
    
    # Create feature DataFrame
    features = pd.DataFrame({
        'daily_trend_strength': daily_trend_strength,
        'weekly_trend_strength': weekly_trend_strength,
        'daily_cycle_strength': daily_cycle_strength,
        'weekly_cycle_strength': weekly_cycle_strength,
        'cycle_divergence': cycle_divergence,
    })
    
    # Add wavelet energy features
    for i in range(wavelet_energy.shape[1]):
        features[f'wavelet_energy_{i}'] = wavelet_energy[:, i]
    
    # Add time-based features
    features['hour'] = volume_data.index.hour
    features['day_of_week'] = volume_data.index.dayofweek
    features['day_of_month'] = volume_data.index.day
    features['month'] = volume_data.index.month
    
    # Normalize features
    scaler = StandardScaler()
    normalized_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns, index=features.index)
    
    return normalized_features

# Use your data
data = call_polygon("AAPL", "2024-06-01", "2024-08-01", "minute", 30)
volume_data = pd.Series(data['v'].values, index=data['date'])
print(volume_data)
print(type(volume_data))

# Build features
features = build_features(volume_data)

print("Feature DataFrame:")
print(features.head())

print("\nFeature descriptions:")
for column in features.columns:
    print(f"{column}: {features[column].describe()}")

# Plot some key features
import matplotlib.pyplot as plt

fig, axs = plt.subplots(3, 1, figsize=(15, 15))

axs[0].plot(features.index, features['daily_trend_strength'], label='Daily Trend Strength')
axs[0].plot(features.index, features['weekly_trend_strength'], label='Weekly Trend Strength')
axs[0].set_title('Trend Strengths')
axs[0].legend()

axs[1].plot(features.index, features['daily_cycle_strength'], label='Daily Cycle Strength')
axs[1].plot(features.index, features['weekly_cycle_strength'], label='Weekly Cycle Strength')
axs[1].set_title('Cycle Strengths')
axs[1].legend()

axs[2].plot(features.index, features['cycle_divergence'])
axs[2].set_title('Cycle Divergence')

plt.tight_layout()
plt.show()

date
2024-06-03 09:00:00-04:00     148963.0
2024-06-03 09:30:00-04:00    8899895.0
2024-06-03 10:00:00-04:00    5007666.0
2024-06-03 10:30:00-04:00    3841875.0
2024-06-03 11:00:00-04:00    2627520.0
                               ...    
2024-08-01 13:30:00-04:00    3505931.0
2024-08-01 14:00:00-04:00    2884136.0
2024-08-01 14:30:00-04:00    2998257.0
2024-08-01 15:00:00-04:00    3032147.0
2024-08-01 15:30:00-04:00    7414885.0
Length: 588, dtype: float64
<class 'pandas.core.series.Series'>


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (588,) + inhomogeneous part.