In [1]:
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from umap import UMAP

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Polygon.io API key, read from the environment so the secret is never stored in
# (or committed with) the notebook. Export POLYGON_API_KEY before starting the
# kernel. Any key that was previously hardcoded here should be considered leaked
# and rotated. Falls back to an empty string when the variable is unset.
KEY = os.environ.get("POLYGON_API_KEY", "")

def convert_timestamp_est(timestamp):
    """Convert a UNIX timestamp to a timezone-aware US/Eastern datetime.

    Parameters:
    - timestamp: Seconds since the UNIX epoch (int or float).

    Returns:
    - A timezone-aware ``datetime`` in the ``US/Eastern`` zone. Despite the
      function name, the offset follows that zone's rules, so it is EST or EDT
      depending on the date.
    """
    # Build the aware UTC datetime in one step. datetime.utcfromtimestamp()
    # returns a naive value that then has to be localized, and it is deprecated
    # since Python 3.12.
    dt_utc = datetime.fromtimestamp(timestamp, tz=pytz.utc)
    # Convert the UTC datetime to US/Eastern
    return dt_utc.astimezone(pytz.timezone('US/Eastern'))

class CustomRetry(Retry):
    """Retry policy that retries every HTTP error response.

    Any status code >= 400 is retried regardless of ``status_forcelist`` or the
    request method. All other codes are delegated to the standard ``Retry``
    rules, so successful responses other than 200 (e.g. 201, 204) and 3xx
    redirects are no longer retried as if they were failures.
    """

    def is_retry(self, method, status_code, has_retry_after=False):
        """ Return True if we should retry the request, otherwise False. """
        # Only error responses are unconditionally retried; comparing against
        # exactly 200 would also retry other successes and redirects.
        if status_code >= 400:
            return True
        return super().is_retry(method, status_code, has_retry_after)
    
def setup_session_retries(
    retries: int = 3,
    backoff_factor: float = 0.05,
    status_forcelist: tuple = (500, 502, 504),
):
    """
    Sets up a requests Session with retries.

    The retry decision is made by ``CustomRetry.is_retry``, which may retry
    status codes beyond those listed in ``status_forcelist``.

    Parameters:
    - retries: Number of retries before giving up. Default is 3.
    - backoff_factor: A factor to use for exponential backoff. Default is 0.05.
    - status_forcelist: A tuple of HTTP status codes that should trigger a retry. Default is (500, 502, 504).

    Returns:
    - A requests Session object with retry configuration mounted for both
      ``http://`` and ``https://`` URLs. The caller owns the session and is
      responsible for closing it.
    """
    retry_strategy = CustomRetry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        allowed_methods=frozenset(["GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS"]),
        # Hand back the last response once retries are exhausted instead of raising.
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    # The same adapter (and therefore the same retry policy) serves both schemes.
    for prefix in ("http://", "https://"):
        session.mount(prefix, adapter)

    return session

def execute_polygon_call(url, timeout=30):
    """Issue a GET request to ``url`` through a retry-enabled session.

    Parameters:
    - url: Fully built request URL.
    - timeout: Seconds to wait for the server before giving up (passed to
      requests). Default is 30. Without a timeout a stalled connection would
      block the notebook indefinitely.

    Returns:
    - The ``requests.Response`` for the call. The body is fully read before the
      session is closed (the request is not streamed), so it stays usable.

    Raises:
    - requests.exceptions.RequestException on connection failures or timeouts.
    """
    # A fresh session is built per call; the context manager closes it so its
    # connection pool is released instead of leaking.
    with setup_session_retries() as session:
        return session.get(url, timeout=timeout)

def call_polygon(symbol, from_stamp, to_stamp, timespan, multiplier,
                 hours=(9, 10, 11, 12, 13, 14, 15)):
    """Fetch aggregate bars for ``symbol`` from Polygon and return them as a DataFrame.

    Parameters:
    - symbol: Ticker symbol, e.g. "AAPL".
    - from_stamp: Start of the range, as accepted by the aggregates endpoint
      (the notebook passes "YYYY-MM-DD" strings).
    - to_stamp: End of the range, same format as ``from_stamp``.
    - timespan: Bar unit, e.g. "minute".
    - multiplier: Number of ``timespan`` units per bar.
    - hours: US/Eastern hours of day to keep. Default is 9 through 15.

    Returns:
    - A DataFrame (an independent copy, safe to mutate) with the fields returned
      by the API plus:
        * 't'      -- bar timestamp converted from milliseconds to seconds
        * 'date'   -- 't' as a timezone-aware US/Eastern datetime
        * 'hour'   -- hour of day of 'date'
        * 'symbol' -- the requested symbol
      Only rows whose 'hour' is in ``hours`` are kept.

    Raises:
    - requests.HTTPError if the API does not answer with HTTP 200.
    - ValueError if the response contains no 'results'.
    """
    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/{multiplier}/{timespan}/{from_stamp}/{to_stamp}?adjusted=true&sort=asc&limit=50000&apiKey={KEY}"

    response = execute_polygon_call(url)

    # Fail with an explicit error instead of an opaque KeyError below. The
    # message is built by hand (rather than raise_for_status()) so the URL,
    # which embeds the API key, never ends up in notebook output.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Polygon request for {symbol} failed with HTTP {response.status_code}",
            response=response,
        )

    response_data = response.json()
    results = response_data.get('results')
    if not results:
        raise ValueError(
            f"Polygon returned no 'results' for {symbol} "
            f"({from_stamp} to {to_stamp}); "
            f"response status field: {response_data.get('status')!r}"
        )

    results_df = pd.DataFrame(results)
    # Timestamps arrive in milliseconds; integer-divide down to whole seconds.
    results_df['t'] = results_df['t'] // 1000
    results_df['date'] = results_df['t'].apply(convert_timestamp_est)
    results_df['hour'] = results_df['date'].apply(lambda x: x.hour)
    results_df['symbol'] = symbol

    # .copy() so callers can add columns without SettingWithCopyWarning.
    return results_df[results_df['hour'].isin(list(hours))].copy()


In [None]:
df = call_polygon("AAPL", "2024-06-01", "2024-08-01", "minute", 30)

# Index the bars by their timestamp so the plots get a real time axis.
# set_index returns a new frame, so the columns added below never write into a
# slice of call_polygon's result.
df = df.set_index('date')

# Add time as a feature: seconds elapsed since the first bar.
# ('t' is already in seconds after call_polygon; the positional index carries
# no time information, so it cannot be used for this.)
df = df.assign(time=df['t'] - df['t'].iloc[0])

# Normalize time to be on a similar scale as other features
time_scaler = MinMaxScaler()
df['time_normalized'] = time_scaler.fit_transform(df[['time']]).ravel()

# Standardize the data. Only numeric columns can be scaled ('symbol' is a
# string), and the raw time columns 't' and 'time' are dropped so that time
# enters the embedding once, via 'time_normalized'.
feature_df = df.select_dtypes(include='number').drop(columns=['t', 'time'])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_df)

# Apply UMAP
umap_model = UMAP(n_components=3, random_state=42)
umap_result = umap_model.fit_transform(scaled_data)

# Create a DataFrame with UMAP results
umap_df = pd.DataFrame(umap_result, index=df.index, columns=['UMAP1', 'UMAP2', 'UMAP3'])

# Plot original data (first 5 features for visibility) and UMAP components
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Original data (first 5 features)
feature_df.iloc[:, :5].plot(ax=ax1)
ax1.set_title('Original Time Series Data (First 5 Features)')
ax1.set_xlabel('')

# UMAP components
umap_df.plot(ax=ax2)
ax2.set_title('UMAP Components (with Time Encoding)')

plt.tight_layout()
plt.show()

# 3D scatter plot of UMAP components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(umap_df['UMAP1'], umap_df['UMAP2'], umap_df['UMAP3'], 
                     c=df['time_normalized'], cmap='viridis')
ax.set_xlabel('UMAP1')
ax.set_ylabel('UMAP2')
ax.set_zlabel('UMAP3')
ax.set_title('3D UMAP Projection of Time Series Data (with Time Encoding)')
plt.colorbar(scatter, label='Normalized Time')
plt.show()

print(f"Original data shape: {df.shape}")
print(f"UMAP data shape: {umap_df.shape}")

# Analyze the importance of time in UMAP components
correlation_with_time = umap_df.apply(lambda x: x.corr(df['time_normalized']))
print("\nCorrelation of UMAP components with time:")
print(correlation_with_time)

# Find the component most correlated with time
most_time_correlated = correlation_with_time.abs().idxmax()
print(f"\nUMAP component most correlated with time: {most_time_correlated}")
print(f"Correlation value: {correlation_with_time[most_time_correlated]:.4f}")