In [362]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
import scipy
import math
import pandas_ta as ta
from pathlib import Path
import yfinance as yf
from datetime import datetime, timedelta, date
from warnings import filterwarnings

In [363]:
nas_path = Path('./nasdaq_2019_to_present.csv')
nas_df = pd.read_csv(nas_path)
nas_df['date'] = pd.to_datetime(nas_df['date'])
nas_df = nas_df.set_index('date')
nas_df.columns = ['close', 'open', 'high', 'low']
nas_df

Unnamed: 0_level_0,close,open,high,low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-21,17462.75,17462.75,17462.75,17462.75
2024-01-20,17462.75,17462.75,17462.75,17462.75
2024-01-19,17438.50,17116.00,17471.25,17107.25
2024-01-18,17110.00,16854.50,17125.75,16834.25
2024-01-17,16869.75,16978.00,16982.00,16689.25
...,...,...,...,...
2020-07-07,10532.25,10600.00,10694.50,10505.75
2020-07-06,10598.50,10347.25,10614.50,10341.50
2020-07-03,10328.88,10348.00,10389.25,10310.20
2020-07-02,10355.75,10258.50,10422.25,10246.50


In [366]:
def find_levels(
        price: np.array,       # Array of log closing prices.
        atr: float,            # Average True Range (ATR), a measure of market volatility.
        first_w: float = 0.1,  # Initial weight for the weighted kernel density estimation.
        atr_mult: float = 3.0,  # Multiplier for ATR to adjust bandwidth in kernel density estimation.
        prom_thresh: float = 0.1  # Threshold for prominence in peak finding, relative to the maximum of the probability density function.
):

    # Setup for weights in the kernel density estimation.
    last_w = 1.0  # Final weight.
    w_step = (last_w - first_w) / len(price)  # Step size for weights based on the number of price points.
    weights = first_w + np.arange(len(price)) * w_step  # Linearly increasing weights array.
    weights[weights < 0] = 0.0  # Ensuring no negative weights.

    # Gaussian Kernel Density Estimation on the price data with weights and adjusted bandwidth.
    # Kernel Density Estimation is a non-parametric way to estimate the probability density function (PDF) of a random variable. 
    # In simple terms, it's a technique used to visualize the distribution of data points. 
    # The Gaussian KDE uses the Gaussian (normal) distribution as the kernel. 
    # This means it approximates the distribution of your data points by summing up Gaussian distributions (bell curves) centered at each data point.
    kernel = scipy.stats.gaussian_kde(price, bw_method=atr*atr_mult, weights=weights)

    # Construct market profile by evaluating the kernel density across a range of prices.
    min_v = np.min(price)  # Minimum price in the range.
    max_v = np.max(price)  # Maximum price in the range.
    step = (max_v - min_v) / 200  # Step size for the price range.
    price_range = np.arange(min_v, max_v, step)  # Price range for evaluation.
    pdf = kernel(price_range)  # Market profile as a probability density function.

    # Find significant peaks in the market profile.
    pdf_max = np.max(pdf)  # Maximum value in the probability density function.
    prom_min = pdf_max * prom_thresh  # Minimum prominence for a peak to be considered significant.

    # Finding peaks in the market profile with the defined minimum prominence.
    peaks, props = scipy.signal.find_peaks(pdf, prominence=prom_min)

    # Converting the log price back to the actual price at each peak to identify levels.
    levels = [np.exp(price_range[peak]) for peak in peaks]

    return levels, peaks, props, price_range, pdf, weights  # Return identified levels and related data.


def support_resistance_levels(
        data,                 # DataFrame containing the financial data.
        lookback: int,        # The number of periods to look back for calculating ATR and levels.
        first_w: float = 0.03,  # Initial weight for kernel density estimation in find_levels.
        atr_mult: float = 3.5,  # Multiplier for ATR to adjust bandwidth in kernel density estimation.
        prom_thresh: float = 0.20  # Threshold for prominence in peak finding.
):
    # Check if necessary columns are present in the DataFrame.
    if 'high' not in data.columns or 'low' not in data.columns or 'close' not in data.columns:
        raise KeyError("Columns 'high', 'low', and 'close' must be present in the DataFrame.")

    # Calculate the logarithmic Average True Range (ATR) for the given lookback period.
    atr = ta.atr(np.log(data['high']), np.log(data['low']), np.log(data['close']), lookback)

    # Initialize a list to store the levels for each data point.
    all_levels = [None] * len(data)

    # Iterate over the data, starting from the lookback period to the end of the data.
    for i in range(lookback, len(data)):
        i_start = i - lookback  # Determine the starting index for the lookback period.
        
        # Extract logarithmic closing prices for the lookback period.
        vals = np.log(data.iloc[i_start + 1: i + 1]['close'].to_numpy())

        # Calculate the support and resistance levels using the find_levels function.
        levels, peaks, props, price_range, pdf, weights = find_levels(
            vals, atr.iloc[i], first_w, atr_mult, prom_thresh
        )

        # Store the calculated levels for the current index.
        all_levels[i] = levels

    return all_levels  

def sr_penetration_signal(data, levels: list):
    # Initialize a signal array with zeros, same length as the data.
    signal = np.zeros(len(data))

    # Variable to store the current signal state.
    curr_sig = 0.0

    # Convert the 'close' column of the data to a numpy array for efficiency.
    close_arr = data['close'].to_numpy()

    # Iterate through each data point, starting from the second one (index 1).
    for i in range(1, len(data)):
        # If there are no levels identified for the current data point, skip to the next.
        if levels[i] is None:
            continue
        # Store the previous and current closing prices.
        last_c = close_arr[i - 1]
        curr_c = close_arr[i]
        # Check for penetration of each level identified for the current data point.
        for level in levels[i]:
            # If the current close is above the level and the last close was at or below it,
            # it indicates a bullish signal (cross above the support/resistance level).
            if curr_c > level and last_c <= level:
                curr_sig = 1.0
            # If the current close is below the level and the last close was at or above it,
            # it indicates a bearish signal (cross below the support/resistance level).
            elif curr_c < level and last_c >= level:
                curr_sig = -1.0

        # Update the signal array for the current index with the current signal state.
        signal[i] = curr_sig
    # Return the array containing the signals.
    return signal




def get_trades_from_signal(data, signal: np.array):
    # Initialize lists to store long and short trades.
    long_trades = []
    short_trades = []

    # Convert the 'close' column of the data to a numpy array for efficient processing.
    close_arr = data['close'].to_numpy()

    # Variable to store the last signal processed.
    last_sig = 0.0

    # Variable to keep track of the current open trade.
    open_trade = None

    # Get the index of the DataFrame, which represents time.
    idx = data.index

    # Iterate through each data point.
    for i in range(len(data)):
        # Check for a long entry signal and that it's not a continuation of the previous signal.
        if signal[i] == 1.0 and last_sig != 1.0:  # Long entry
            # Close any open short trade.
            if open_trade is not None:
                open_trade[2] = idx[i]  # Exit time.
                open_trade[3] = close_arr[i]  # Exit price.
                short_trades.append(open_trade)

            # Open a new long trade.
            open_trade = [idx[i], close_arr[i], -1, np.nan]  # Initialize with entry time and price.

        # Check for a short entry signal and that it's not a continuation of the previous signal.
        if signal[i] == -1.0 and last_sig != -1.0:  # Short entry
            # Close any open long trade.
            if open_trade is not None:
                open_trade[2] = idx[i]  # Exit time.
                open_trade[3] = close_arr[i]  # Exit price.
                long_trades.append(open_trade)

            # Open a new short trade.
            open_trade = [idx[i], close_arr[i], -1, np.nan]  # Initialize with entry time and price.

        # Update the last signal variable.
        last_sig = signal[i]

    # Convert the lists of trades to DataFrames for easier manipulation and analysis.
    long_trades = pd.DataFrame(long_trades, columns=['exit_time', 'entry_price', 'entry_time', 'exit_price'])
    short_trades = pd.DataFrame(short_trades, columns=['exit_time', 'entry_price', 'entry_time', 'exit_price'])

    # Calculate the percentage gain/loss for each trade.
    long_trades['percent'] = (long_trades['exit_price'] - long_trades['entry_price']) / long_trades['entry_price']
    short_trades['percent'] = -1 * (short_trades['exit_price'] - short_trades['entry_price']) / short_trades['entry_price']

    # Set the entry time as the index for both DataFrames.
    long_trades = long_trades.set_index('entry_time')
    short_trades = short_trades.set_index('entry_time')

    return long_trades, short_trades

In [367]:
if __name__ == '__main__': # This condition checks if the script is being run as the main program and not imported as a module.

    # Assigning 'nas_df' to 'data'. 'nas_df' should be a DataFrame containing financial data.
    data = nas_df

    # Set the plotting style to 'dark_background' for better visualization.
    plt.style.use('dark_background')

    # Calculate support and resistance levels using the 'support_resistance_levels' function.
    # It uses a 365-day lookback period, with specific parameters for weighted kernel density estimation.
    levels = support_resistance_levels(data, 365, first_w=1.0, atr_mult=3.0)

    # Generate trading signals based on the penetration of support and resistance levels.
    data['sr_signal'] = sr_penetration_signal(data, levels)

    # Calculate the logarithmic return of the 'close' price, shifting the result by -1 to align with the next period.
    data['log_ret'] = np.log(data['close']).diff().shift(-1)

    # Calculate the return from the strategy by multiplying the signal by the logarithmic return.
    data['sr_return'] = data['sr_signal'] * data['log_ret']
    
    # Generate lists of long and short trades based on the trading signals.
    long_trades, short_trades = get_trades_from_signal(data, data['sr_signal'].to_numpy())    

In [369]:
# Calculate the total points gained or lost in each long trade: (exit price - entry price).
long_trades['total_points'] = long_trades['exit_price'] - long_trades['entry_price']
# Calculate the profit and loss (PnL). Here, '20' is the point value per contract for the futures.
# Therefore, each point movement in the price results in a $20 gain or loss per contract.
long_trades['pnl_1_contract'] = long_trades['total_points'] * 20
long_trades['position'] = 'long'
long_trades

Unnamed: 0_level_0,exit_time,entry_price,exit_price,percent,total_points,pnl_1_contract,position
entry_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-09,2022-09-21,11710.0,13031.5,0.112852,1321.5,26430.0,long
2022-08-02,2022-08-05,13228.75,12924.5,-0.022999,-304.25,-6085.0,long
2022-07-06,2022-07-08,12152.0,11880.25,-0.022363,-271.75,-5435.0,long
2022-06-22,2022-06-29,11691.0,11565.75,-0.010713,-125.25,-2505.0,long
2022-03-28,2022-06-10,11840.0,14985.25,0.265646,3145.25,62905.0,long
2021-11-03,2021-11-08,16327.75,16129.75,-0.012127,-198.0,-3960.0,long
2021-09-30,2021-10-01,14761.75,14682.5,-0.005369,-79.25,-1585.0,long
2021-08-31,2021-09-29,14739.75,15582.5,0.057175,842.75,16855.0,long
2021-08-28,2021-08-30,15597.5,15428.25,-0.010851,-169.25,-3385.0,long
2021-07-28,2021-07-29,15037.75,15011.5,-0.001746,-26.25,-525.0,long


In [370]:
#calculate net profit of all long positions
long_pnl = long_trades['pnl_1_contract'].sum()
long_pnl

60590.0

In [371]:
# Calculate the total points gained or lost in each long trade.
short_trades['total_points'] = short_trades['entry_price'] - short_trades['exit_price']
# Calculate the profit and loss (PnL). Here, '20' is the point value per contract for the futures.
# Therefore, each point movement in the price results in a $20 gain or loss per contract.
short_trades['pnl_1_contract'] = short_trades['total_points'] * 20
short_trades['position'] = 'short'
short_trades

Unnamed: 0_level_0,exit_time,entry_price,exit_price,percent,total_points,pnl_1_contract,position
entry_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-05,2022-08-09,13031.5,13228.75,-0.015136,-197.25,-3945.0,short
2022-07-08,2022-08-02,12924.5,12152.0,0.05977,772.5,15450.0,short
2022-06-29,2022-07-06,11880.25,11691.0,0.01593,189.25,3785.0,short
2022-06-10,2022-06-22,11565.75,11840.0,-0.023712,-274.25,-5485.0,short
2021-11-08,2022-03-28,14985.25,16327.75,-0.089588,-1342.5,-26850.0,short
2021-10-01,2021-11-03,16129.75,14761.75,0.084812,1368.0,27360.0,short
2021-09-29,2021-09-30,14682.5,14739.75,-0.003899,-57.25,-1145.0,short
2021-08-30,2021-08-31,15582.5,15597.5,-0.000963,-15.0,-300.0,short
2021-07-29,2021-08-28,15428.25,15037.75,0.025311,390.5,7810.0,short
2021-07-26,2021-07-28,15011.5,15117.75,-0.007078,-106.25,-2125.0,short


In [372]:
#calculate net profit of all short positions
short_pnl = short_trades['pnl_1_contract'].sum()
short_pnl

72955.0

In [395]:
# Concat the longs and shorts together
total_trades = pd.concat([long_trades, short_trades])
# order them by the first trade
total_trades.sort_values(by='entry_time', inplace=True)

total_trades.reset_index(inplace=True)
total_trades.rename(columns={'entry_time': 'entry'}, inplace=True)
# add a column for cumulative returns 

total_trades['entry'] = pd.to_datetime(total_trades['entry'])
total_trades['date'] = total_trades['entry']
total_trades = total_trades.set_index('date')

position_mapping = {'long': 1, 'short': 2}
total_trades['position'] = total_trades['position'].map(position_mapping)

total_trades

Unnamed: 0_level_0,entry,exit_time,entry_price,exit_price,percent,total_points,pnl_1_contract,position
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-08-05,2020-08-05,2020-08-06,11261.25,11091.75,-0.015052,-169.5,-3390.0,1
2020-08-06,2020-08-06,2020-08-25,11726.25,11261.25,0.039655,465.0,9300.0,2
2020-08-25,2020-08-25,2020-08-26,11991.25,11726.25,-0.022099,-265.0,-5300.0,1
2020-08-26,2020-08-26,2021-04-12,13808.75,11991.25,0.131619,1817.5,36350.0,2
2021-04-12,2021-04-12,2021-04-13,13975.75,13808.75,-0.011949,-167.0,-3340.0,1
2021-04-13,2021-04-13,2021-04-14,13798.75,13975.75,-0.012827,-177.0,-3540.0,2
2021-04-14,2021-04-14,2021-04-16,14029.5,13798.75,-0.016447,-230.75,-4615.0,1
2021-04-16,2021-04-16,2021-04-23,13927.0,14029.5,-0.00736,-102.5,-2050.0,2
2021-04-23,2021-04-23,2021-04-26,14011.5,13927.0,-0.006031,-84.5,-1690.0,1
2021-04-26,2021-04-26,2021-07-22,14928.5,14011.5,0.061426,917.0,18340.0,2


In [374]:
nas_and_trades = pd.concat([total_trades, nas_df], axis=1)
nas_and_trades = nas_and_trades.drop(nas_and_trades.index[:23])
n = 23
nas_and_trades.drop(nas_and_trades.tail(n).index, inplace=True)
columns_to_drop = ['sr_signal', 'log_ret', 'sr_return']
nas_and_trades = nas_and_trades.drop(columns=columns_to_drop)
nas_and_trades['cumulative_pnl'] = nas_and_trades['pnl_1_contract'].fillna(0).cumsum() + 100000
nas_and_trades

Unnamed: 0_level_0,entry,exit_time,entry_price,exit_price,percent,total_points,pnl_1_contract,position,close,open,high,low,cumulative_pnl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-08-03,NaT,NaT,,,,,,,11044.25,10933.25,11073.00,10876.25,100000.0
2020-08-04,NaT,NaT,,,,,,,11086.00,11054.00,11094.75,10983.00,100000.0
2020-08-05,2020-08-05,2020-08-06,11261.25,11091.75,-0.015052,-169.5,-3390.0,1.0,11091.75,11080.50,11144.00,11055.75,96610.0
2020-08-06,2020-08-06,2020-08-25,11726.25,11261.25,0.039655,465.0,9300.0,2.0,11261.25,11094.00,11271.75,11051.25,105910.0
2020-08-07,NaT,NaT,,,,,,,11122.75,11260.50,11283.25,11035.25,105910.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-24,NaT,NaT,,,,,,,16962.50,16962.50,16962.50,16962.00,233545.0
2023-12-25,NaT,NaT,,,,,,,17004.00,16991.75,17011.00,16985.00,233545.0
2023-12-26,NaT,NaT,,,,,,,17083.50,16964.50,17111.25,16964.50,233545.0
2023-12-27,NaT,NaT,,,,,,,17113.25,17090.25,17133.00,17055.50,233545.0


In [375]:
news_file = Path('C:\\Users\\Elliot\\Documents\\coding\\Berkeley\\project2\\calendar-event-list.csv')
news_df = pd.read_csv(news_file)
news_df.columns = ['id', 'date', 'name', 'impact', 'currency']
news_df['date'] = pd.to_datetime(news_df['date'])

news_df = news_df.set_index('date')
news_df.drop(news_df[news_df['impact'] == 'LOW'].index, inplace=True)
news_df.drop(news_df[news_df['impact'] == 'NONE'].index, inplace=True)
news_df = news_df.drop('id', axis=1)
news_df

Unnamed: 0_level_0,name,impact,currency
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-03 01:45:00,Caixin Manufacturing PMI,MEDIUM,CNY
2020-08-03 07:55:00,HCOB Manufacturing PMI,MEDIUM,EUR
2020-08-03 08:30:00,S&P Global/CIPS Manufacturing PMI,MEDIUM,GBP
2020-08-03 13:45:00,S&P Global Manufacturing PMI,MEDIUM,USD
2020-08-03 23:30:00,"Tokyo CPI ex Food, Energy (YoY)",MEDIUM,JPY
...,...,...,...
2024-01-24 14:45:00,BoC Interest Rate Decision,HIGH,CAD
2024-01-24 14:45:00,S&P Global Composite PMI,MEDIUM,USD
2024-01-24 14:45:00,S&P Global Manufacturing PMI,HIGH,USD
2024-01-24 14:45:00,S&P Global Services PMI,HIGH,USD


In [376]:
news_df.index = pd.to_datetime(news_df.index.date)

# Group by the modified 'date' index and aggregate the other columns
grouped_news = news_df.groupby(news_df.index).agg({
    'name': lambda x: list(x),
    'impact': lambda x: list(x),
    'currency': lambda x: list(x)
})

grouped_news = grouped_news.rename_axis('date')

# Display the resulting DataFrame
grouped_news

Unnamed: 0_level_0,name,impact,currency
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-03,"[Caixin Manufacturing PMI, HCOB Manufacturing ...","[MEDIUM, MEDIUM, MEDIUM, MEDIUM, MEDIUM, MEDIU...","[CNY, EUR, GBP, USD, JPY, JPY, JPY]"
2020-08-04,"[BoE MPC Vote Rate Unchanged, BoE MPC Vote Rat...","[HIGH, HIGH, MEDIUM, MEDIUM]","[GBP, GBP, CAD, NZD]"
2020-08-05,"[Caixin Services PMI, BoJ's Governor Kuroda sp...","[MEDIUM, HIGH, HIGH, MEDIUM, MEDIUM, MEDIUM, M...","[CNY, JPY, USD, USD, USD, USD, USD, USD, USD]"
2020-08-06,"[RBNZ Inflation Expectations (QoQ), BoE's Gove...","[MEDIUM, HIGH, HIGH, MEDIUM, MEDIUM, MEDIUM]","[NZD, GBP, GBP, USD, USD, USD]"
2020-08-10,"[Consumer Price Index (MoM), Consumer Price In...","[MEDIUM, HIGH, MEDIUM, MEDIUM]","[CNY, CNY, CNY, GBP]"
...,...,...,...
2024-01-18,"[Fed's Bostic speech, Fed's Bostic speech, Bus...","[MEDIUM, MEDIUM, MEDIUM, MEDIUM, MEDIUM, MEDIUM]","[USD, USD, NZD, JPY, JPY, JPY]"
2024-01-19,"[Producer Price Index (MoM), Producer Price In...","[MEDIUM, MEDIUM, MEDIUM, HIGH, MEDIUM, MEDIUM,...","[EUR, EUR, GBP, GBP, GBP, GBP, USD]"
2024-01-22,"[PBoC Interest Rate Decision, Business NZ PSI]","[MEDIUM, MEDIUM]","[CNY, NZD]"
2024-01-23,"[BoJ Outlook Report, BoJ Monetary Policy State...","[MEDIUM, HIGH, HIGH, HIGH, HIGH, HIGH]","[JPY, JPY, JPY, JPY, NZD, NZD]"


In [377]:
impact_mapping = {'HIGH': 1, 'MEDIUM': 2}
grouped_news['impact'] = grouped_news['impact'].apply(lambda x: [impact_mapping[impact] for impact in x] if isinstance(x, list) else x)
grouped_news['impact'] = grouped_news['impact'].apply(lambda x: sum(x) if isinstance(x, list) else x)

nas_and_news = pd.concat([nas_df, grouped_news], axis=1)
columns_to_drop = ['log_ret', 'sr_return']
nas_and_news = nas_and_news.drop(columns=columns_to_drop)
nas_and_news = nas_and_news.dropna()
nas_and_news

Unnamed: 0_level_0,close,open,high,low,sr_signal,name,impact,currency
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-08-03,11044.25,10933.25,11073.00,10876.25,-1.0,"[Caixin Manufacturing PMI, HCOB Manufacturing ...",14.0,"[CNY, EUR, GBP, USD, JPY, JPY, JPY]"
2020-08-04,11086.00,11054.00,11094.75,10983.00,-1.0,"[BoE MPC Vote Rate Unchanged, BoE MPC Vote Rat...",6.0,"[GBP, GBP, CAD, NZD]"
2020-08-05,11091.75,11080.50,11144.00,11055.75,-1.0,"[Caixin Services PMI, BoJ's Governor Kuroda sp...",15.0,"[CNY, JPY, USD, USD, USD, USD, USD, USD, USD]"
2020-08-06,11261.25,11094.00,11271.75,11051.25,1.0,"[RBNZ Inflation Expectations (QoQ), BoE's Gove...",10.0,"[NZD, GBP, GBP, USD, USD, USD]"
2020-08-10,11072.00,11140.25,11156.50,10928.25,-1.0,"[Consumer Price Index (MoM), Consumer Price In...",7.0,"[CNY, CNY, CNY, GBP]"
...,...,...,...,...,...,...,...,...
2024-01-15,16936.25,16946.75,17022.75,16933.50,0.0,"[Bank of Canada Business Outlook Survey, NZIER...",4.0,"[CAD, NZD]"
2024-01-16,16966.50,16961.75,17033.75,16811.75,0.0,"[ZEW Survey – Economic Sentiment, ZEW Survey –...",8.0,"[EUR, EUR, EUR, USD]"
2024-01-17,16869.75,16978.00,16982.00,16689.25,0.0,"[Fed's Barr speech, Fed's Bowman speech, Germa...",10.0,"[USD, USD, EUR, USD, USD]"
2024-01-18,17110.00,16854.50,17125.75,16834.25,0.0,"[Fed's Bostic speech, Fed's Bostic speech, Bus...",12.0,"[USD, USD, NZD, JPY, JPY, JPY]"


In [378]:
news_nas_and_trades = pd.concat([nas_and_trades, grouped_news], axis=1)
c = 19
news_nas_and_trades.drop(news_nas_and_trades.tail(c).index, inplace=True)
impact_mapping = {'HIGH': 1, 'MEDIUM': 2}
news_nas_and_trades['impact'] = news_nas_and_trades['impact'].apply(lambda x: [impact_mapping[impact] for impact in x] if isinstance(x, list) else x)
news_nas_and_trades['impact'] = news_nas_and_trades['impact'].apply(lambda x: sum(x) if isinstance(x, list) else x)
news_nas_and_trades.head(20)

Unnamed: 0_level_0,entry,exit_time,entry_price,exit_price,percent,total_points,pnl_1_contract,position,close,open,high,low,cumulative_pnl,name,impact,currency
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-08-03,NaT,NaT,,,,,,,11044.25,10933.25,11073.0,10876.25,100000.0,"[Caixin Manufacturing PMI, HCOB Manufacturing ...",14.0,"[CNY, EUR, GBP, USD, JPY, JPY, JPY]"
2020-08-04,NaT,NaT,,,,,,,11086.0,11054.0,11094.75,10983.0,100000.0,"[BoE MPC Vote Rate Unchanged, BoE MPC Vote Rat...",6.0,"[GBP, GBP, CAD, NZD]"
2020-08-05,2020-08-05,2020-08-06,11261.25,11091.75,-0.015052,-169.5,-3390.0,1.0,11091.75,11080.5,11144.0,11055.75,96610.0,"[Caixin Services PMI, BoJ's Governor Kuroda sp...",15.0,"[CNY, JPY, USD, USD, USD, USD, USD, USD, USD]"
2020-08-06,2020-08-06,2020-08-25,11726.25,11261.25,0.039655,465.0,9300.0,2.0,11261.25,11094.0,11271.75,11051.25,105910.0,"[RBNZ Inflation Expectations (QoQ), BoE's Gove...",10.0,"[NZD, GBP, GBP, USD, USD, USD]"
2020-08-07,NaT,NaT,,,,,,,11122.75,11260.5,11283.25,11035.25,105910.0,,,
2020-08-10,NaT,NaT,,,,,,,11072.0,11140.25,11156.5,10928.25,105910.0,"[Consumer Price Index (MoM), Consumer Price In...",7.0,"[CNY, CNY, CNY, GBP]"
2020-08-11,NaT,NaT,,,,,,,10878.5,11072.25,11157.75,10845.5,105910.0,"[Average Earnings Excluding Bonus (3Mo/Yr), Em...",13.0,"[GBP, GBP, GBP, GBP, GBP, GBP, EUR, EUR]"
2020-08-12,NaT,NaT,,,,,,,11126.0,10928.5,11185.75,10878.5,105910.0,"[RBNZ Press Conference, 10-y Bond Auction, Mon...",5.0,"[NZD, EUR, USD]"
2020-08-13,NaT,NaT,,,,,,,11175.25,11115.5,11266.0,11094.25,105910.0,"[REINZ House Price Index (MoM), Business NZ PMI]",4.0,"[NZD, NZD]"
2020-08-14,NaT,NaT,,,,,,,11133.75,11180.5,11237.75,11096.0,105910.0,,,


In [379]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import os
# Set environment variable for KMeans
os.environ['OMP_NUM_THREADS'] = '1'
features = nas_and_news[['open', 'close', 'impact', 'sr_signal']]
# Handling missing values - Option 1: Drop rows with NaN values
features_cleaned = features.dropna()
# Standardize the features (important for KMeans)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_cleaned)
# Number of clusters - adjust this based on your data and requirements
n_clusters = 4
# Apply KMeans clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=1).fit(features_scaled)
# Add the cluster labels to your DataFrame
features_cleaned['cluster'] = kmeans.labels_

In [380]:
prediction = kmeans.predict(features_scaled)
print(prediction)

[2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2
 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2
 2 2 2 3 2 2 2 2 3 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2
 2 2 2 2 2 3 3 2 3 3 2 3 2 2 3 3 2 2 3 2 2 2 2 2 2 3 2 2 2 3 2 3 2 3 2 2 2
 2 2 2 2 2 2 2 2 2 2 3 3 2 3 2 2 2 2 2 2 2 2 2 2 1 3 1 1 2 3 2 3 1 3 2 3 3
 2 2 3 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 3 2 3 2 2 2 2 2 2 2 2 2 2 2 2 3 3
 0 3 0 0 3 0 0 0 0 3 0 0 3 3 0 0 0 0 3 0 0 0 0 0 0 3 0 3 0 0 0 0 0 0 0 0 0
 0 3 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 3 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 3 0 0 0 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0
 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 3 0 2 2
 0 0 0 0 3 0 0 0 0 0 0 3 0 0 2 2 3 2 2 2 2 3 3 3 2 2 2 2 2 2 2 2 3 2 0 0 0
 3 3 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 3 2 2 2 2 1 1
 2 2 2 2 2 2 2 2 3 2 2 2 

In [381]:
features_scaled_copy = features_cleaned.copy()
features_scaled_copy['prediction'] = prediction
features_scaled_copy

Unnamed: 0_level_0,open,close,impact,sr_signal,cluster,prediction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-08-03,10933.25,11044.25,14.0,-1.0,2,2
2020-08-04,11054.00,11086.00,6.0,-1.0,2,2
2020-08-05,11080.50,11091.75,15.0,-1.0,2,2
2020-08-06,11094.00,11261.25,10.0,1.0,1,1
2020-08-10,11140.25,11072.00,7.0,-1.0,2,2
...,...,...,...,...,...,...
2024-01-15,16946.75,16936.25,4.0,0.0,0,0
2024-01-16,16961.75,16966.50,8.0,0.0,0,0
2024-01-17,16978.00,16869.75,10.0,0.0,0,0
2024-01-18,16854.50,17110.00,12.0,0.0,0,0


In [399]:
import hvplot.pandas 

features_scaled_copy.hvplot.scatter(
    x="date",
    y='impact',
    by="prediction"
)

In [397]:
features_scaled_copy.hvplot.scatter(
    x='date',
    y='open',
    by="prediction"
)