In [None]:
import importlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import utils_datasets as utils
importlib.reload(utils)

# Construction of the datasets

We first construct our datasets for the stocks listed in `tickers`. For each ticker, over the period going from START_TIME to today, we build the following dataset: for each rolling window of N_DAYS over that period, we construct an image containing:
- the 20-day moving average,
- the volume traded each day,
- opening and closing prices,
- highest and lowest prices


## Construction of second dataset

In [None]:
import pandas as pd
importlib.reload(utils)
# Tickers for which a dataset is generated (company and line of business
# noted next to each symbol).
tickers = [
    "GE",   # General Electric: conglomerate
    "CAT",  # Caterpillar: construction machinery & equipment
    "DE",   # Deere & Co.: agricultural & heavy equipment
    "HON",  # Honeywell: industrial automation & aerospace
    "MMM",  # 3M: industrial products & diversified tech
    "BA",   # Boeing: aerospace & defense
    "LMT",  # Lockheed Martin: defense contractor
    "RTX",  # Raytheon Technologies: aerospace & defense
    "NOC",  # Northrop Grumman: military tech & aerospace
    "EMR",  # Emerson Electric: industrial automation
    "ETN",  # Eaton: power management & electrical systems
    "ITW",  # Illinois Tool Works: industrial manufacturing
    "PH",   # Parker-Hannifin: motion & control technologies
    "UPS",  # UPS: global logistics & transportation
    "FDX",  # FedEx: courier & freight logistics
]

# Keyword arguments shared by every call below. The spelling `tresh` is the
# keyword the helper is called with, so it must stay as is.
dataset_options = dict(tresh=0.007, target_size=None, ma=True)

# One helper call per ticker. `result` is overwritten on every iteration, so
# after the loop it only holds the return value of the last call.
for ticker in tickers:
    result = utils.create_data_set_bis(ticker, **dataset_options)

# Manipulation and visualisation of the datasets

In [None]:
importlib.reload(utils)
import pandas as pd
import yfinance as yf
import numpy as np

# Tickers whose daily price history is downloaded and stacked into `res`.
tickers = [
    "GE",    # General Electric – Conglomerate
    "CAT",   # Caterpillar – Construction machinery & equipment
    "DE",    # Deere & Co. – Agricultural & heavy equipment
    "HON",   # Honeywell – Industrial automation & aerospace
    "MMM",   # 3M – Industrial products & diversified tech
    "BA",    # Boeing – Aerospace & defense
    "LMT",   # Lockheed Martin – Defense contractor
    "RTX",   # Raytheon Technologies – Aerospace & defense
    "NOC",   # Northrop Grumman – Military tech & aerospace
    "EMR",   # Emerson Electric – Industrial automation
    "ETN",   # Eaton – Power management & electrical systems
    "ITW",   # Illinois Tool Works – Industrial manufacturing
    "PH",    # Parker-Hannifin – Motion & control technologies
    "UPS",   # UPS – Global logistics & transportation
    "FDX"    # FedEx – Courier & freight logistics
]

# Collect one frame per ticker and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every
# iteration and starts from an empty frame, which recent pandas warns about.
ticker_frames = []
for tick in tickers:
    TICKER_DATA_DATE = yf.download(tick, start="2000-01-01", auto_adjust=True, interval='1d')
    # Keep only the first level of the column index, so the columns can be
    # selected by plain field name below.
    TICKER_DATA_DATE.columns = TICKER_DATA_DATE.columns.get_level_values(0)
    TICKER_DATA_DATE = TICKER_DATA_DATE[['Open', 'Close', 'High', 'Low', 'Volume']].copy()
    # Tag every row with its ticker so the stacked frame can be split again.
    TICKER_DATA_DATE['Name'] = tick
    TICKER_DATA_DATE.columns.name = None
    ticker_frames.append(TICKER_DATA_DATE)

# ignore_index=True replaces the per-ticker index with a fresh integer index;
# rows containing any missing value are then dropped (the index is not reset).
res = pd.concat(ticker_frames, axis=0, ignore_index=True).dropna()

# Log return from the open to the close of the same row.
res['Return'] = np.log(res['Close'] / res['Open'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import numpy as np

# Drop rows without a return, then clip returns to the [-0.1, 0.1] interval.
# Note that this rebinds and modifies `res` for every later cell.
res = res.dropna(subset=['Return']).copy()
res['Return'] = res['Return'].clip(lower=-0.1, upper=0.1)

tickers = res['Name'].unique()
common_sample_size = 1000  # rows drawn per curve, same for every ticker

plt.style.use('dark_background')
plt.figure(figsize=(15, 8))

# One thin, unfilled density curve per ticker, each drawn from a fixed-seed
# sample of `common_sample_size` returns.
per_ticker_style = dict(fill=False, linewidth=1, bw_adjust=1.2)
for ticker in tickers:
    ticker_returns = res.loc[res['Name'] == ticker, 'Return']
    sub = ticker_returns.sample(n=common_sample_size, random_state=42)
    sns.kdeplot(sub, label=ticker, **per_ticker_style)

# Thick, filled density curve for a same-sized sample across all tickers.
sampled = res['Return'].sample(n=common_sample_size, random_state=42)
sns.kdeplot(
    sampled,
    label='Overall',
    color='white',
    fill=True,
    alpha=0.4,
    linewidth=4,
    bw_adjust=1.2,
)

ax = plt.gca()

# Restrict the view to +/-5% and show tick labels as percentages.
ax.set_xlim(-0.05, 0.05)
ax.xaxis.set_major_formatter(mtick.PercentFormatter(1.0))

# Dashed red marker at a return of 0.7%.
ax.axvline(0.007, color='red', linestyle='--', linewidth=1.5, alpha=0.7, label='Threshold 0.7%')

# Titles, axis labels, zero marker, grid and legend.
ax.set_title("Distribution of Daily Returns by Ticker", fontsize=18, weight='bold')
ax.set_xlabel("Daily Return", fontsize=14)
ax.set_ylabel("Density", fontsize=14)
ax.axvline(0, color='white', linestyle='--', alpha=0.3)
ax.grid(alpha=0.2)
ax.legend(
    title='Ticker',
    title_fontsize=14,
    fontsize=12,
    ncol=3,
    loc='upper right',
    frameon=True,
)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the UPS rows and take the natural log of their closing prices.
# NOTE(review): this rebinds `res` itself, so after this cell `res` no longer
# contains the other tickers; re-running earlier cells that read `res` will
# then only see UPS.
res = res[res['Name'] == 'UPS']
close_prices = res['Close'].dropna()
log_prices = np.log(close_prices)

# Histogram with a KDE overlay.
sns.histplot(data=log_prices, kde=True)
plt.title("Log of Close Prices")
plt.show()

# Q-Q plot against a normal distribution.
stats.probplot(log_prices, dist="norm", plot=plt)
plt.title("Q-Q Plot of Log Prices")
plt.show()

# Shapiro-Wilk normality test; only the p-value is reported.
shapiro_result = stats.shapiro(log_prices)
stat, p = shapiro_result
print("Shapiro-Wilk p-value:", p)

## Drop the indexes that will be used to compute the hedging (these indexes were determined later; see the result notebook)
- Explanation: It is better to keep two versions of the dataset: one for training, which does not contain the files dropped below, and another one, used in the result notebook, to determine on which stock and over which time window the hedging will be performed.

In [None]:
import os

# Sample ids in the half-open range [begin, end) are removed from the UPS
# dataset: their rows are dropped from labels.csv and their images deleted.
begin = 2206
end = 2245
path_images = os.path.join(os.getcwd(), 'v-b', 'UPS', 'images')
path_labels = os.path.join(os.getcwd(), 'v-b', 'UPS', 'labels', 'labels.csv')

# Keep only the label rows whose id lies outside [begin, end).
old_labels = pd.read_csv(path_labels)
new_labels = old_labels[(old_labels['id'] < begin) | (old_labels['id'] >= end)]
# index=False keeps the file's columns exactly as they were read. Writing the
# DataFrame index would add an extra unnamed column each time this cell runs.
new_labels.to_csv(path_labels, index=False)

# Counters must exist before the loop increments them; without this
# initialisation the cell raises NameError on a fresh kernel.
deleted = 0
missing = 0

for i in range(begin, end):
    filename = f"{i}.png"                     # adapt extension
    img_path = os.path.join(path_images, filename)

    if os.path.isfile(img_path):
        os.remove(img_path)
        deleted += 1
    else:
        # Already removed (e.g. the cell was run before) or never generated.
        missing += 1

print(f"Removed {deleted} images; {missing} not found.")
