In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pytz
from ib_insync import *
from utils.market_session import MarketSessionClassifier
import json
import time

import nest_asyncio
import asyncio

# Patch asyncio so the IB event loop can run inside Jupyter
nest_asyncio.apply()

# TODO: verify the following code by checking its results against the QC data

In [5]:

class IBKRReturnsCalculator:
    """Compute price and volume returns around a timestamp from IBKR 1-minute bars.

    ``time_horizons`` maps a label (e.g. ``'5m'``) to an offset in minutes.
    A parallel ``volume_time_horizons`` mapping is derived from it by prefixing
    every label with ``'V'``.  The close and volume series from the most recent
    fetch are kept on ``price_data`` and ``volume_data``.
    """

    def __init__(self, time_horizons, add_minutes_to_open=5):
        """Store the horizon configuration and create the IB client.

        Args:
            time_horizons: dict mapping horizon label -> offset in minutes.
            add_minutes_to_open: stored on the instance; not read by any
                method of this class.
        """
        self.time_horizons = time_horizons
        self.volume_time_horizons = {'V' + k: v for k, v in self.time_horizons.items()}
        self.add_minutes_to_open = add_minutes_to_open
        self.ib = IB()
        self.eastern = pytz.timezone('US/Eastern')
        self.market_classifier = MarketSessionClassifier()
        self.price_data = None
        self.volume_data = None

    @staticmethod
    def _to_eastern(value):
        """Return *value* as a tz-aware ``pd.Timestamp`` in US/Eastern.

        Naive inputs are interpreted as US/Eastern wall-clock time; aware
        inputs are converted.
        """
        ts = pd.Timestamp(value)
        if ts.tzinfo is None:
            return ts.tz_localize('US/Eastern')
        return ts.tz_convert('US/Eastern')

    def connect(self):
        """Establish a connection to IBKR on 127.0.0.1:7496.

        Any existing connection is dropped first.

        Raises:
            Exception: whatever the underlying connect call raises; it is
                printed and then re-raised so the caller can handle it.
        """
        try:
            if self.ib.isConnected():
                self.ib.disconnect()
            # Use a random clientId to avoid conflicts
            import random
            client_id = random.randint(1000, 9999)
            self.ib.connect('127.0.0.1', 7496, clientId=client_id)
            print("Connected to IBKR")
        except Exception as e:
            print(f"Error connecting to IBKR: {str(e)}")
            raise  # Re-raise the exception to handle it in the calling code

    def fetch_minute_data(self, symbol, start_date, end_date):
        """Fetch 1-minute TRADES bars (including outside regular hours).

        Args:
            symbol: ticker, requested as a USD stock on SMART routing.
            start_date: start of the wanted range; naive values are treated
                as US/Eastern.
            end_date: end of the wanted range; naive values are treated as
                US/Eastern.

        Returns:
            ``(close, volume)`` as two Series indexed by US/Eastern bar time,
            or ``(None, None)`` when there is no connection, no data, or any
            error occurs (the error is printed, not raised).
        """
        try:
            if not self.ib.isConnected():
                raise ConnectionError("Not connected to IBKR")

            contract = Stock(symbol, 'SMART', 'USD')

            # Convert dates to Eastern time for IBKR
            fetch_start = self._to_eastern(start_date)
            fetch_end = self._to_eastern(end_date)

            bars = self.ib.reqHistoricalData(
                contract,
                endDateTime=fetch_end,
                # Whole days spanned by the range, plus one to cover the partial day
                durationStr=f'{(fetch_end - fetch_start).days + 1} D',
                barSizeSetting='1 min',
                whatToShow='TRADES',
                useRTH=False
            )

            if not bars:
                print(f"No data returned for {symbol}")
                return None, None

            df = pd.DataFrame({
                'time': [b.date for b in bars],
                'close': [b.close for b in bars],
                'volume': [b.volume for b in bars]
            })

            # Ensure timezone handling
            df['time'] = pd.to_datetime(df['time'])
            if df['time'].dt.tz is None:
                # NOTE(review): naive bar times are assumed to be UTC here —
                # confirm against the gateway's configured time zone.
                df['time'] = df['time'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
            else:
                df['time'] = df['time'].dt.tz_convert('US/Eastern')

            df.set_index('time', inplace=True)

            return df['close'], df['volume']

        except Exception as e:
            print(f"Error fetching data for {symbol}: {str(e)}")
            return None, None

    def calculate_returns(self, symbol, timestamp):
        """Calculate session and horizon returns for one ticker and timestamp.

        Args:
            symbol: ticker passed to ``fetch_minute_data``.
            timestamp: ``pd.Timestamp`` or ``datetime``; naive values are
                treated as US/Eastern.

        Returns:
            dict with keys ``'session'`` (see ``calculate_session_returns``),
            ``'horizons'`` (label -> float rounded to 2 decimals, or None) and
            ``'market_session'``; ``None`` when session times or price data
            are unavailable or any error occurs (the error is printed).
        """
        try:
            # Ensure timestamp is in Eastern time
            timestamp = self._to_eastern(timestamp)

            # Get session info directly from MarketSessionClassifier
            session_info = self.market_classifier.get_session_times(timestamp)
            if not session_info:
                print("Could not determine session times")
                return None

            # Determine start and end times based on market session
            market_session = session_info['market_session']
            if market_session == 'market_closed':
                if timestamp.hour >= 16:  # After market close
                    start_time = session_info['current']['session_close']
                    end_time = session_info['next']['session_open']
                else:  # Before market open
                    start_time = session_info['previous']['session_close']
                    end_time = session_info['current']['session_open']
            else:
                start_time = timestamp
                if market_session == 'in_market':
                    end_time = session_info['current']['session_close']
                elif market_session == 'pre_market':
                    end_time = session_info['current']['session_open']
                else:  # post_market
                    end_time = session_info['next']['session_open']

            # Calculate data range: the session window widened by the largest horizon
            widest = timedelta(minutes=max(self.time_horizons.values()))
            data_start = min(start_time, timestamp - widest)
            data_end = max(end_time, timestamp + widest)

            # Fetch required data
            self.price_data, self.volume_data = self.fetch_minute_data(symbol, data_start, data_end)

            if self.price_data is None:
                return None

            # Calculate returns
            session_returns = self.calculate_session_returns(start_time, end_time)
            horizon_returns = self.calculate_horizon_returns(timestamp)

            formatted_horizons = {
                k: float(round(v, 2)) if v is not None else None
                for k, v in horizon_returns.items()
            }

            return {
                'session': session_returns,
                'horizons': formatted_horizons,
                'market_session': market_session
            }

        except Exception as e:
            print(f"Error calculating returns: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

    def calculate_session_returns(self, start_time, end_time):
        """Calculate price and volume change between two session boundaries.

        Values are taken with ``Series.asof``, i.e. the last bar at or before
        each boundary.

        Returns:
            dict with ``price_return`` (percent), ``volume_return`` (ratio,
            0 when the start volume is 0), and the start/end prices and
            volumes; ``None`` when data is not loaded, a boundary has no bar
            at or before it, or any error occurs (the error is printed).
        """
        try:
            if self.price_data is None or self.volume_data is None:
                return None

            # Find closest available data points
            start_price = self.price_data.asof(start_time)
            end_price = self.price_data.asof(end_time)
            start_volume = self.volume_data.asof(start_time)
            end_volume = self.volume_data.asof(end_time)

            # asof() signals "no bar at or before this time" with NaN, not None
            if any(x is None or pd.isna(x)
                   for x in [start_price, end_price, start_volume, end_volume]):
                print("Missing data points for session calculation")
                return None

            # Calculate returns
            price_return = ((end_price / start_price) - 1) * 100
            volume_return = end_volume / start_volume if start_volume != 0 else 0

            return {
                'price_return': round(float(price_return), 2),
                'volume_return': round(float(volume_return), 2),
                'start_price': float(start_price),
                'end_price': float(end_price),
                'start_volume': int(start_volume),
                'end_volume': int(end_volume)
            }
        except Exception as e:
            print(f"Error calculating session returns: {str(e)}")
            return None

    def calculate_horizon_returns(self, timestamp):
        """Calculate a return for every price and volume horizon.

        Price horizons yield a percent change; volume horizons (labels that
        start with ``'V'``) yield a plain ratio.  Both are rounded to two
        decimals.

        Returns:
            dict mapping horizon label -> value, or ``None`` for a horizon
            when the series is not loaded, either bar is missing from the
            index, either value is NaN, or the reference value is 0.
        """
        returns = {}

        for horizon, minutes in {**self.time_horizons, **self.volume_time_horizons}.items():
            is_volume = horizon.startswith('V')
            data = self.volume_data if is_volume else self.price_data
            if data is None:
                returns[horizon] = None
                continue

            if minutes < 0:
                # NOTE(review): for a negative horizon the reference bar is
                # taken abs(minutes) AFTER the timestamp, not before it.
                # Confirm this is what the QC implementation does.
                current_time = timestamp
                ref_time = timestamp + timedelta(minutes=abs(minutes))
            else:
                current_time = timestamp + timedelta(minutes=minutes)
                ref_time = timestamp

            try:
                current_value = data.loc[current_time]
                ref_value = data.loc[ref_time]
            except KeyError:
                returns[horizon] = None
                continue

            # A zero or missing reference would produce inf/nan rather than
            # raising, so report the horizon as unavailable instead.
            if pd.isna(current_value) or pd.isna(ref_value) or ref_value == 0:
                returns[horizon] = None
                continue

            if is_volume:
                return_value = current_value / ref_value
            else:
                return_value = ((current_value / ref_value) - 1) * 100

            returns[horizon] = round(return_value, 2)

        return returns

    def disconnect(self):
        """Safely disconnect from IBKR (no-op when not connected)."""
        if self.ib.isConnected():
            self.ib.disconnect()
            print("Disconnected from IBKR")


### Trial Run to check above code 

In [7]:
# Define time horizons for returns calculation.
# Keys are the labels used in the results; values are offsets in minutes
# relative to the event timestamp (negative = before, positive = after).
time_horizons = {
    '-5m': -5,    # 5 minutes ago
    '-2m': -2,    # 2 minutes ago
    '2m': 2,      # 2 minutes ahead
    '5m': 5,      # 5 minutes ahead
    '15m': 15,    # 15 minutes ahead
    '30m': 30,    # 30 minutes ahead
    '1h': 60,     # 1 hour ahead
    '2h': 120,    # 2 hours ahead
    '4h': 240     # 4 hours ahead
}

# Test with error handling
try:
    # Initialize calculator
    calculator = IBKRReturnsCalculator(time_horizons)
    
    # Connect with retry logic: up to max_retries attempts, 2 seconds apart;
    # the last failure is re-raised so the finally block below still runs.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            calculator.connect()
            break
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                raise
            print(f"Connection attempt {attempt + 1} failed, retrying...")
            time.sleep(2)
    
    # Fixed sample timestamp used for both the classifier and calculator checks
    timestamp = pd.Timestamp('2024-03-20 09:35:00', tz='US/Eastern')
    
    print("\nTesting MarketSessionClassifier...")
    classifier = MarketSessionClassifier()
    
    # Format the output to show times in ET
    def format_session_info(info):
        """Return a copy of *info* with every pd.Timestamp value rendered as an ET string.

        Non-timestamp values are passed through unchanged so the result can be
        handed to json.dumps.
        """
        formatted = {}
        for key, value in info.items():
            if isinstance(value, pd.Timestamp):
                formatted[key] = value.tz_convert('US/Eastern').strftime('%Y-%m-%d %H:%M:%S %Z')
            else:
                formatted[key] = value
        return formatted
    
    market_session = classifier.get_market_session(timestamp)
    session_schedule = classifier.get_session_schedule(timestamp)
    session_times = classifier.get_session_times(timestamp)
    
    print(f"Market Session: {market_session}")
    print("\nSession Schedule:")
    print(json.dumps(format_session_info(session_schedule), indent=2))
    
    print("\nSession Times:")
    # session_times nests one schedule dict per day under 'current',
    # 'previous' and 'next'; each is formatted separately.
    formatted_times = {
        'market_session': session_times['market_session'],
        'current': format_session_info(session_times['current']),
        'previous': format_session_info(session_times['previous']),
        'next': format_session_info(session_times['next'])
    }
    print(json.dumps(formatted_times, indent=2))
    
    print("\nTesting IBKRReturnsCalculator...")
    returns = calculator.calculate_returns('AAPL', timestamp)
    print(json.dumps(returns, indent=2))
    
finally:
    # Always release the IBKR connection, but only if the calculator was
    # actually created and is still connected.
    if 'calculator' in locals() and calculator.ib.isConnected():
        calculator.disconnect()

Connected to IBKR

Testing MarketSessionClassifier...
Market Session: in_market

Session Schedule:
{
  "session_open": "2024-03-20 09:30:00 EDT",
  "session_close": "2024-03-20 16:00:00 EDT",
  "pre_market_start": "2024-03-20 04:00:00 EDT",
  "post_market_end": "2024-03-20 20:00:00 EDT",
  "is_early_close": false
}

Session Times:
{
  "market_session": "in_market",
  "current": {
    "session_open": "2024-03-20 09:30:00 EDT",
    "session_close": "2024-03-20 16:00:00 EDT",
    "pre_market_start": "2024-03-20 04:00:00 EDT",
    "post_market_end": "2024-03-20 20:00:00 EDT",
    "is_early_close": false
  },
  "previous": {
    "session_open": "2024-03-19 09:30:00 EDT",
    "session_close": "2024-03-19 16:00:00 EDT",
    "pre_market_start": "2024-03-19 04:00:00 EDT",
    "post_market_end": "2024-03-19 20:00:00 EDT",
    "is_early_close": false
  },
  "next": {
    "session_open": "2024-03-21 09:30:00 EDT",
    "session_close": "2024-03-21 16:00:00 EDT",
    "pre_market_start": "2024-03-21 

### Compare with QC data

In [None]:
import pandas as pd
# Load the QC news export: first column is the index, ',' is treated as a
# thousands separator, and malformed lines produce a warning.
df = pd.read_csv('../News/NewsQC.csv', low_memory=False, on_bad_lines='warn', thousands=',', index_col=0)
# Force an integer index; labels that cannot be parsed as numbers become -1.
df.index = pd.to_numeric(df.index, errors='coerce').fillna(-1).astype(int)
df.head(2)

In [9]:
def compare_returns_with_qc(calculator, df_row):
    """Fetch IBKR returns for one QC row and compare them column by column.

    Args:
        calculator: object exposing ``calculate_returns(symbol, timestamp)``
            that returns a dict with ``'horizons'`` and ``'session'`` entries
            (or ``None``).
        df_row: one QC record providing ``'timestamp'``, ``'symbol'`` and,
            where available, the horizon/session columns compared below.

    Returns:
        A one-row DataFrame with ``<col>_IBKR``, ``<col>_QC`` and
        ``<col>_diff`` for every compared column, or ``None`` when IBKR has
        no returns for the row or an error occurs (the error is printed).
        A column absent from ``df_row`` gets a NaN QC value and a ``None``
        diff instead of aborting the whole comparison.
    """
    try:
        # Get timestamp from QC data
        timestamp = df_row['timestamp']

        # Clean the symbol for IBKR (strip everything after the space)
        full_symbol = df_row['symbol']
        ibkr_symbol = full_symbol.split()[0]  # Take only the first part before space

        print(f"\nComparing returns for {ibkr_symbol} (QC: {full_symbol}) at {timestamp}")

        # Get IBKR returns
        ibkr_returns = calculator.calculate_returns(ibkr_symbol, timestamp)
        if ibkr_returns is None:
            print(f"No IBKR returns available for {ibkr_symbol} at {timestamp}")
            return None

        # Columns to compare
        price_cols = ['-2m', '-5m', '15m', '1h', '2m', '30m', '5m']
        volume_cols = ['V-2m', 'V-5m', 'V15m', 'V1h', 'V2m', 'V30m', 'V5m']
        session_cols = ['price_return', 'volume_return', 'start_price', 'end_price',
                        'start_volume', 'end_volume']

        comparison = {
            'symbol': ibkr_symbol,
            'timestamp': timestamp,
            'qc_symbol': full_symbol
        }

        # Price and volume horizons both live in the 'horizons' dict; session
        # figures are compared only when a session result is present.
        column_groups = [(ibkr_returns['horizons'], price_cols + volume_cols)]
        if ibkr_returns['session']:
            column_groups.append((ibkr_returns['session'], session_cols))

        for ibkr_values, cols in column_groups:
            for col in cols:
                ibkr_val = ibkr_values.get(col)
                if col in df_row:
                    qc_val = pd.to_numeric(df_row[col], errors='coerce')  # Convert to numeric
                else:
                    # QC export lacks this column: record it as missing
                    qc_val = float('nan')
                comparison[f'{col}_IBKR'] = ibkr_val
                comparison[f'{col}_QC'] = qc_val
                comparison[f'{col}_diff'] = (
                    ibkr_val - qc_val
                    if (ibkr_val is not None and pd.notna(qc_val))
                    else None
                )

        return pd.DataFrame([comparison])

    except Exception as e:
        print(f"Error comparing returns: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
import numpy as np

def is_valid_timestamp(x):
    """Check if a value can be converted to a real (non-missing) timestamp.

    Returns False both when ``pd.Timestamp`` rejects the value and when the
    conversion succeeds but yields ``NaT`` (as it does for ``None``/NaN).
    """
    try:
        return not pd.isna(pd.Timestamp(x))
    except (ValueError, TypeError, OverflowError):
        return False

# First, let's look at the data
print("Original data shape:", df.shape)
print("\nSample of original timestamps:")
print(df['originalTime'].head())

# Filter for valid timestamps (rows whose originalTime passes is_valid_timestamp)
valid_df = df[df['originalTime'].apply(is_valid_timestamp)].copy()
print(f"\nRows with valid timestamps: {len(valid_df)}")

try:
    # Convert timestamps with proper timezone handling: parse as UTC, then
    # express in US/Eastern
    valid_df['timestamp'] = pd.to_datetime(valid_df['originalTime'], utc=True).dt.tz_convert('US/Eastern')
    
    # Remove any rows where conversion failed
    valid_df = valid_df.dropna(subset=['timestamp'])
    print(f"Rows after removing invalid dates: {len(valid_df)}")
    
    # Filter for date range (using timezone-aware timestamps)
    start_date = pd.Timestamp('2020-01-01', tz='US/Eastern')
    end_date = pd.Timestamp('2024-03-21', tz='US/Eastern')
    
    # Both bounds are inclusive
    recent_df = valid_df[
        (valid_df['timestamp'] >= start_date) &
        (valid_df['timestamp'] <= end_date)
    ].copy()
    
    print(f"\nFinal rows in filtered dataset: {len(recent_df)}")
    
    print("\nSample of filtered data:")
    print(recent_df[['timestamp', 'symbol']].head())
    
    # Define columns for comparison
    price_cols = ['-2m', '-5m', '15m', '1h', '2m', '30m', '5m']
    volume_cols = ['V-2m', 'V-5m', 'V15m', 'V1h', 'V2m', 'V30m', 'V5m']
    session_cols = ['price_return', 'volume_return', 'start_price', 'end_price', 
                   'start_volume', 'end_volume']
    
    # Show data types and sample values
    print("\nQC data types:")
    print(recent_df.dtypes)
    print("\nSample of numeric columns:")
    # Columns missing from the QC export are skipped silently
    for col in price_cols + volume_cols + session_cols:
        if col in recent_df.columns:
            print(f"\n{col} sample values:")
            print(recent_df[col].head())
    
    # Test with the filtered data
    calculator = IBKRReturnsCalculator(time_horizons)
    
    try:
        calculator.connect()
        
        # Take a sample of rows to compare
        sample_rows = recent_df.head(3)
        
        # Each successful comparison is a one-row DataFrame
        all_comparisons = []
        for _, row in sample_rows.iterrows():
            comparison = compare_returns_with_qc(calculator, row)
            if comparison is not None:
                all_comparisons.append(comparison)
        
        # Process all comparisons after the loop
        # NOTE(review): all_comparisons is collected but never used in this
        # cell — the processing step appears to be unfinished.
            
    finally:
        # Release the IBKR connection even if a comparison fails
        calculator.disconnect()
        
except Exception as e:
    print(f"Error processing data: {str(e)}")
    import traceback
    traceback.print_exc()

In [None]:
import numpy as np
from datetime import datetime, timedelta

try:
    # First clean the timestamp data
    def is_valid_timestamp_format(x):
        """Return True only for strings longer than 10 chars that pandas can parse.

        Only parse failures are treated as "invalid"; anything else (e.g. a
        KeyboardInterrupt) is allowed to propagate.
        """
        try:
            # Check if it matches our expected format
            if isinstance(x, str) and len(x) > 10:  # Basic length check
                pd.to_datetime(x)
                return True
            return False
        except (ValueError, TypeError, OverflowError):
            return False

    # Filter out invalid timestamps first
    valid_df = df[df['originalTime'].apply(is_valid_timestamp_format)].copy()
    print(f"Original rows: {len(df)}")
    print(f"Valid timestamp rows: {len(valid_df)}")
    
    # Convert timestamps for valid data: parse as UTC, then express in US/Eastern
    valid_df['timestamp'] = pd.to_datetime(valid_df['originalTime'], utc=True).dt.tz_convert('US/Eastern')
    
    # Filter for more recent date range (last 6 months)
    end_date = pd.Timestamp('2024-03-21', tz='US/Eastern')
    start_date = end_date - pd.Timedelta(days=180)  # Last 6 months
    
    print(f"\nTesting data from {start_date} to {end_date}")
    
    # Both bounds are inclusive
    recent_df = valid_df[
        (valid_df['timestamp'] >= start_date) &
        (valid_df['timestamp'] <= end_date)
    ].copy()
    
    print(f"\nFiltered dataset shape: {recent_df.shape}")
    print("\nSample data:")
    print(recent_df[['timestamp', 'symbol']].head())
    
    if len(recent_df) == 0:
        print("\nNo data found in the specified date range!")
    else:
        # Now let's compare with IBKR
        calculator = IBKRReturnsCalculator(time_horizons)
        
        try:
            calculator.connect()
            print("\nConnected to IBKR")
            
            # Take first 3 rows as a test
            test_rows = recent_df.head(3)
            for _, row in test_rows.iterrows():
                symbol = row['symbol'].split()[0]  # Get clean symbol
                timestamp = row['timestamp']
                
                print(f"\n{'='*50}")
                print(f"Testing symbol: {symbol}")
                print(f"Original symbol: {row['symbol']}")
                print(f"Timestamp: {timestamp}")
                
                # Get IBKR data
                print("\nRequesting IBKR data...")
                ibkr_returns = calculator.calculate_returns(symbol, timestamp)
                
                if ibkr_returns:
                    print("\nIBKR data received:")
                    if 'horizons' in ibkr_returns:
                        print("\nHorizon returns:")
                        for k, v in ibkr_returns['horizons'].items():
                            print(f"{k}: {v}")
                else:
                    print("No IBKR data available")
                    
        finally:
            # Release the IBKR connection even if a request fails
            calculator.disconnect()
            print("\nDisconnected from IBKR")
            
except Exception as e:
    print(f"Error: {str(e)}")
    import traceback
    traceback.print_exc()