#### Statistical Analysis and Visualization <br>
We empirically analyse the historical performance and summary statistics of the portfolios, focusing on SMALL LoBM and BIG HiBM.

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [9]:
# Global plot appearance: apply a matplotlib style sheet, then a seaborn plotting context.
style_sheet = 'seaborn-v0_8-dark-palette'
plotting_context = "talk"

plt.style.use(style_sheet)
sns.set_context(plotting_context)


In [12]:
# Folder holding the input data. The default is the original local folder; set the
# FTS_DATA_DIR environment variable to run the notebook on another machine without
# editing this cell.
DATA_DIR = os.environ.get(
    "FTS_DATA_DIR",
    r"C:\Users\GIORDANO\Desktop\financial-time-series-forecasting\data",
)
file_path = os.path.join(DATA_DIR, "Developed_6_Portfolios_ME_BE-ME_cleaned_decimals.csv")

# Read the CSV, convert the 'date' column to datetimes and use it as the index.
# Written as a single chain (no inplace=True) so `df` is always rebuilt from the
# file and never left half-transformed.
df = (
    pd.read_csv(file_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)


In [13]:
# Summary table with one row per portfolio: describe() output (transposed)
# extended with skewness and kurtosis columns.
# NOTE: pandas documents kurtosis() as excess (Fisher) kurtosis, i.e. 0 for a normal distribution.
desc_stats = df.describe().T.assign(
    skew=df.skew(),
    kurtosis=df.kurtosis(),
)
print("\nDescriptive Statistics:", desc_stats, sep="\n")


Descriptive Statistics:
            count      mean       std     min       25%      50%       75%  \
SMALL LoBM  414.0  0.004280  0.054821 -0.2476 -0.025250  0.00965  0.036825   
ME1 BM2     414.0  0.006821  0.047400 -0.2257 -0.019350  0.01090  0.034975   
SMALL HiBM  414.0  0.009030  0.044552 -0.2006 -0.013200  0.01120  0.036275   
BIG LoBM    414.0  0.007613  0.045751 -0.1750 -0.016575  0.01095  0.037025   
ME2 BM2     414.0  0.007591  0.042813 -0.1874 -0.017200  0.01090  0.033675   
BIG HiBM    414.0  0.007336  0.048936 -0.2241 -0.020250  0.01135  0.037625   

               max      skew  kurtosis  
SMALL LoBM  0.1749 -0.586607  1.727346  
ME1 BM2     0.1492 -0.683758  2.155353  
SMALL HiBM  0.1531 -0.701097  2.379846  
BIG LoBM    0.1304 -0.510043  0.844719  
ME2 BM2     0.1226 -0.638844  1.522604  
BIG HiBM    0.1879 -0.651964  2.625677  


In [18]:
# Plot every portfolio's return series in its own panel and save the figure to
# plots/returns_plot.png. (`os` is already imported in the notebook's import cell.)
n_cols = 3
num_portfolios = df.shape[1]
# Ceiling division: the grid always has enough panels, whatever the number of
# columns in `df` (6 columns -> the original 2x3 layout and 18x10 figure).
n_rows = max(1, -(-num_portfolios // n_cols))

fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(18, 5 * n_rows),
    sharex=True,
    squeeze=False,  # always a 2-D array, so flatten() works for any grid shape
)
axes = axes.flatten()

for ax, col in zip(axes, df.columns):
    ax.plot(df.index, df[col], label=col, lw=1.5)
    ax.set_title(col)
    ax.set_ylabel("Return (decimal)")
    ax.legend()

# Hide panels left over when the column count is not a multiple of n_cols.
for ax in axes[num_portfolios:]:
    ax.set_visible(False)

fig.tight_layout()
# Create the output folder if needed; exist_ok avoids the check-then-create race.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/returns_plot.png')
plt.close(fig)


In [19]:
# Histogram of each portfolio's returns, one panel per column, saved to
# plots/histograms.png.
n_cols = 3
n_panels = len(df.columns)
# Ceiling division: enough panels for any number of columns
# (6 columns -> the original 2x3 layout and 18x10 figure).
n_rows = max(1, -(-n_panels // n_cols))

fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(18, 5 * n_rows),
    squeeze=False,  # always a 2-D array, so flatten() works for any grid shape
)
axes = axes.flatten()

for ax, col in zip(axes, df.columns):
    ax.hist(df[col].dropna(), bins=30, edgecolor='k', alpha=0.75)
    ax.set_title(f"Histogram: {col}")
    ax.set_xlabel("Return (decimal)")
    ax.set_ylabel("Frequency")

# Hide panels left over when the column count is not a multiple of n_cols.
for ax in axes[n_panels:]:
    ax.set_visible(False)

fig.tight_layout()
# Do not rely on an earlier cell having created the output folder.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/histograms.png')
plt.close(fig)

Correlation Analysis: we examine how the portfolios' returns are related to one another.

In [20]:
# Pairwise correlations between the portfolio return columns, drawn as an
# annotated heatmap on a fixed [-1, 1] colour scale and saved to disk.
corr = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()
fig.savefig('plots/correlation_matrix.png')
plt.close(fig)

In [23]:
# Keep only the two portfolios to be forecast and drop rows with a missing value in either.
selected_portfolios = ['SMALL LoBM', 'BIG HiBM']
df_selected = df.loc[:, selected_portfolios].dropna()

print("\nSelected Portfolios (first 5 rows):", df_selected.head(5), sep="\n")


Selected Portfolios (first 5 rows):
            SMALL LoBM  BIG HiBM
date                            
1990-07-01      0.0243    0.0139
1990-08-01     -0.1222   -0.1021
1990-09-01     -0.1082   -0.1160
1990-10-01      0.0520    0.0874
1990-11-01     -0.0312   -0.0345


In [35]:
# Save the selected portfolios as 'selected_portfolios.csv' in the same folder as
# the input file. The folder is derived from `file_path` (set in the loading cell)
# so the machine-specific absolute path is not repeated here.
selected_path = os.path.join(os.path.dirname(file_path), "selected_portfolios.csv")
df_selected.to_csv(selected_path)


_We select SMALL LoBM and BIG HiBM as the portfolios to forecast. <br>
We perform the Augmented Dickey-Fuller (ADF) test on each of the two series to assess their stationarity. <br>
A non-stationary series may require differencing prior to ARIMA modeling._

In [26]:
def adf_test(series, title=''):
    """Run the Augmented Dickey-Fuller test on ``series`` and print the results.

    The lag length is chosen by ``adfuller`` with ``autolag='AIC'``.

    Parameters
    ----------
    series : array-like
        Observations to test; passed unchanged to ``adfuller``.
    title : str, optional
        Label shown in the report header.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    print(f"Results of ADF Test for {title}:")
    result = adfuller(series, autolag='AIC')
    # The first four entries of the result line up with these labels.
    labels = ['ADF Statistic', 'p-value', '# Lags Used', 'Number of Observations']
    for label, value in zip(labels, result[0:4]):
        print(f"{label} : {value}")
    # The fifth entry maps each significance level to its critical value.
    for key, value in result[4].items():
        print(f"Critical Value ({key}) : {value}")
    print("\n")


# Run the test on each selected portfolio. This loop has to sit at top level:
# nested inside adf_test it never executed, and any call to adf_test would have
# recursed without end.
for col in df_selected.columns:
    adf_test(df_selected[col], title=col)

In [28]:
# For each selected portfolio, draw the ACF and PACF side by side
# and save one figure per portfolio.
n_lags = 30
for col in df_selected.columns:
    series = df_selected[col]
    fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(15, 4))

    plot_acf(series, ax=acf_ax, lags=n_lags)
    acf_ax.set_title(f"ACF of {col}")

    plot_pacf(series, ax=pacf_ax, lags=n_lags)
    pacf_ax.set_title(f"PACF of {col}")

    fig.tight_layout()
    fig.savefig(f'plots/{col}_acf_pacf.png')
    plt.close(fig)  # release the figure before the next iteration


Volatility Analysis: Rolling Volatility 

_To capture volatility clustering—a common feature of financial time series—we compute the rolling standard deviation over a 12-month window and plot it._

In [30]:
# Rolling volatility: standard deviation of returns over a trailing window of
# `rolling_window` observations, drawn as one line per selected portfolio.
rolling_window = 12
df_rolling_vol = df_selected.rolling(window=rolling_window).std()

fig, ax = plt.subplots(figsize=(12, 6))
for col in df_selected.columns:
    legend_label = f"{col} Rolling Volatility (window={rolling_window})"
    ax.plot(df_rolling_vol.index, df_rolling_vol[col], label=legend_label)

ax.set_xlabel("Date")
ax.set_ylabel("Rolling Volatility (Std. Dev.)")
ax.set_title("Rolling Volatility of Selected Portfolios")
ax.legend()
fig.tight_layout()
fig.savefig('plots/rolling_volatility.png')
plt.close(fig)


In [31]:
# Summary table for the two selected portfolios: describe() output (transposed,
# one row per portfolio) extended with skewness and kurtosis columns.
desc_stats = df_selected.describe().T.assign(
    skew=df_selected.skew(),
    kurtosis=df_selected.kurtosis(),
)
print("Descriptive Statistics for Selected Portfolios:", desc_stats, sep="\n")


Descriptive Statistics for Selected Portfolios:
            count      mean       std     min      25%      50%       75%  \
SMALL LoBM  414.0  0.004280  0.054821 -0.2476 -0.02525  0.00965  0.036825   
BIG HiBM    414.0  0.007336  0.048936 -0.2241 -0.02025  0.01135  0.037625   

               max      skew  kurtosis  
SMALL LoBM  0.1749 -0.586607  1.727346  
BIG HiBM    0.1879 -0.651964  2.625677  


In [33]:
# Overlay the return series of the selected portfolios on a single set of axes
# and save the figure.
fig, ax = plt.subplots(figsize=(12, 6))
for col in df_selected.columns:
    ax.plot(df_selected.index, df_selected[col], label=col)

ax.set_xlabel("Date")
ax.set_ylabel("Return (decimal)")
ax.set_title("Time Series of Selected Portfolios")
ax.legend()
fig.tight_layout()
fig.savefig('plots/selected_portfolios_time_series.png')
plt.close(fig)