In [36]:
## Libraries

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from numpy.linalg import inv, solve
from scipy.stats import f

In [24]:
## Data loading 

def load_data(path='../data/Problem_Set4.xlsx'):
    """Read the four problem-set sheets from a single Excel workbook.

    Args:
        path: Location of the workbook. Defaults to the path that used to be
            hard-coded, so ``load_data()`` keeps working unchanged.

    Returns:
        Tuple ``(vw_industries, pr_portfolios, beme_portfolios, factors)`` of
        raw DataFrames, one per sheet, before any cleaning.
    """
    # Open the workbook once and parse every sheet from the same handle
    # instead of re-opening the file for each sheet.
    with pd.ExcelFile(path) as workbook:
        vw_industries = pd.read_excel(workbook, sheet_name='Industry portfolios (VW)', header=2)
        pr_portfolios = pd.read_excel(workbook, sheet_name='Past return portfolios', header=8)
        beme_portfolios = pd.read_excel(workbook, sheet_name='25 size and BEME portfolios', header=[1, 2])
        factors = pd.read_excel(workbook, sheet_name='Market, Rf', header=1)

    # Keep only the columns up to and including 'Other'.
    vw_industries = vw_industries.loc[:, :'Other']

    # The size/BEME sheet is read with a two-row header; flatten it to
    # "<top>_<bottom>" names. Columns whose top label is an auto-generated
    # 'Unnamed...' placeholder are renamed to 'Date'.
    beme_portfolios.columns = [
        'Date' if 'Unnamed' in str(col[0]) else f"{col[0]}_{col[1]}"
        for col in beme_portfolios.columns
    ]

    return vw_industries, pr_portfolios, beme_portfolios, factors

# Load the four raw sheets into the module-level frames used by the rest of the notebook.
vw_industries, pr_portfolios, beme_portfolios ,factors = load_data()

# Clean the data: 
def clean_data(df, replacement):
    """Replace sentinel values and index the frame by month.

    Args:
        df: Raw frame whose first column holds the period as ``YYYYMM``.
        replacement: Value substituted for the sentinels ``-99.99`` and
            ``-999`` (presumably missing-data codes in the source sheets).

    Returns:
        A new DataFrame without the first column, indexed by a
        ``DatetimeIndex`` named ``'date'``. The input frame is not modified.
    """
    cleaned = df.replace([-99.99, -999], replacement)
    dates = pd.to_datetime(cleaned.iloc[:, 0], format='%Y%m')

    # Drop the period column by position and attach the parsed dates directly
    # as the index. Going through a temporary 'date' column would overwrite an
    # existing column of that name and then drop one data column too many.
    cleaned = cleaned.iloc[:, 1:].copy()
    cleaned.index = pd.DatetimeIndex(dates, name='date')
    return cleaned

# Run the same cleaning step over every raw frame, using NaN as the replacement value.
vw_industries, pr_portfolios, beme_portfolios, factors = [
    clean_data(raw_frame, np.nan)
    for raw_frame in (vw_industries, pr_portfolios, beme_portfolios, factors)
]

def excess_returns(df, factors):
    """Return ``df`` with the ``'RF'`` column of ``factors`` subtracted.

    The subtraction is applied to every column of ``df`` and is matched
    row-wise on the index of the two frames.
    """
    risk_free = factors['RF']
    return df.sub(risk_free, axis='index')

# Convert the three portfolio frames to excess returns (RF is taken from `factors`).
vw_industries, pr_portfolios, beme_portfolios = [
    excess_returns(portfolio_frame, factors)
    for portfolio_frame in (vw_industries, pr_portfolios, beme_portfolios)
]

print(vw_industries.head(5), pr_portfolios.head(5), beme_portfolios.head(5), sep='\n\n')

            Food   Beer  Smoke  Games  Books  Hshld  Clths  Hlth  Chems  \
date                                                                      
1926-07-01  0.34  -5.41   1.07   2.71  10.75  -0.70   7.86  1.55   7.92   
1926-08-01  2.34  26.78   6.25   0.30   9.76  -3.83  -2.76  4.00   5.25   
1926-09-01  0.93   3.79   1.03   6.35  -1.22   0.50  -0.74  0.46   5.10   
1926-10-01 -3.38  -3.63   0.74  -5.08   9.15  -5.00  -0.20 -0.89  -5.08   
1926-11-01  6.04   6.98   4.24   1.35  -6.11  -0.85   1.56  5.11   4.89   

            Txtls  ...  Telcm  Servs  BusEq  Paper  Trans  Whlsl  Rtail  \
date               ...                                                    
1926-07-01   0.17  ...   0.61   9.00   1.84   7.48   1.71 -24.01  -0.15   
1926-08-01   7.89  ...   1.92   1.77   4.14  -2.63   4.63   5.14  -1.00   
1926-09-01   2.08  ...   2.18   2.02  -0.04  -5.77  -0.18  -8.10   0.02   
1926-10-01   0.68  ...  -0.43  -2.32  -1.41  -5.40  -2.96 -15.70  -2.52   
1926-11-01   2.80  ...  

In [None]:
'''
Notes on TA session: 
- Problem B, just follow the formulas and calculate the GRS statistic => GRS tests whether a given portfolio is mean-variance efficient, which means 
that there is no other portfolio with a higher expected return for the same level of risk.
- Problem D => economic significance vs. statistical significance ("alpha of 5% but the p-value is very large, e.g. 60%, so we fail to reject the null hypothesis at the 5% level, even though 
the magnitude is still very high"). Remember: if alpha is positive then there is some return that is not explained by the risk factors. 
- Part B, we just changed the dataset 
- Part C => "How is it different to last week's assignment?"
    - 
'''

In [None]:
### Part 1
## Question A 
''' Let's calculate the sample mean, sample standard deviation and Sharpe ratio for each of the 30 portfolios.'''
def summary_stats(df):
    """Per-portfolio mean, standard deviation and Sharpe ratio.

    Args:
        df: Frame with one column per portfolio.

    Returns:
        DataFrame indexed by ``'Portfolio'`` with columns ``'Mean'``,
        ``'Std'`` and ``'Sharpe Ratio'``, plus a final ``'Overall'`` row
        holding the average mean, the average standard deviation and their
        ratio.
    """
    # The accumulator is local on purpose: a module-level list would keep the
    # rows from earlier calls and duplicate every portfolio on the next call.
    rows = []
    for column in df.columns:
        mean = df[column].mean()
        std = df[column].std()
        # Guard against division by zero for a constant series.
        sharpe_ratio = mean / std if std != 0 else 0
        rows.append((column, mean, std, sharpe_ratio))

    df_results = pd.DataFrame(rows, columns=['Portfolio', 'Mean', 'Std', 'Sharpe Ratio'])
    df_results = df_results.set_index('Portfolio')

    # 'Overall' is the cross-sectional average of the per-portfolio moments,
    # not the moments of a pooled or aggregate portfolio.
    overall_mean = df_results['Mean'].mean()
    overall_std = df_results['Std'].mean()
    overall_sharpe = overall_mean / overall_std if overall_std != 0 else 0
    df_results.loc['Overall'] = [overall_mean, overall_std, overall_sharpe]
    return df_results

# NOTE(review): this rebinds the name `summary_stats` from the function to the
# DataFrame it returns. After this line summary_stats(...) is no longer callable
# unless the `def` is run again; consider a separate name for the result.
summary_stats = summary_stats(vw_industries)


Unnamed: 0_level_0,Mean,Std,Sharpe Ratio
Portfolio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Food,0.699733,4.759748,0.14701
Beer,0.930664,7.251468,0.128341
Smoke,0.891705,5.789458,0.154022
Games,0.845336,8.983094,0.094103
Books,0.644654,7.198844,0.08955
Hshld,0.645502,5.859066,0.110172
Clths,0.6467,6.117961,0.105705
Hlth,0.801889,5.597811,0.143251
Chems,0.782083,6.347498,0.123211
Txtls,0.724507,7.68825,0.094236


In [43]:
## Question B 

def estimate_regressions(x, y):
    """Regress every column of ``y`` on ``x`` plus an intercept, one OLS fit per column.

    Args:
        x: DataFrame of regressors (factors), one column per factor.
        y: DataFrame of dependent variables, one column per portfolio.

    Returns:
        DataFrame indexed by ``'Portfolio'`` with columns ``'Alpha'`` (the
        intercept), ``'Beta'`` (the coefficient on the FIRST column of ``x``
        only) and ``'Residuals'`` (the fitted model's residual Series).
    """
    # Put both frames on their common dates explicitly rather than relying on
    # the two inputs happening to share an identical index.
    x_aligned, y_aligned = x.align(y, join='inner', axis=0)

    # The design matrix does not depend on the portfolio, so build it once.
    design = sm.add_constant(x_aligned)
    first_factor = x.columns[0]

    rows = []
    for portfolio in y_aligned.columns:
        # missing='drop' discards rows with NaNs, so residual series can be
        # shorter than the input and can differ in length across portfolios.
        fit = sm.OLS(y_aligned[portfolio], design, missing='drop').fit()
        rows.append((portfolio, fit.params['const'], fit.params[first_factor], fit.resid))

    return pd.DataFrame(rows, columns=['Portfolio', 'Alpha', 'Beta', 'Residuals']).set_index('Portfolio')

# Market-model regressions: each industry excess return (y) on the market
# excess return 'RM-RF' (x). vw_industries already has RF subtracted.
# Note the printed frame includes the 'Residuals' column, which holds a whole
# Series per portfolio.
regression_results = estimate_regressions(factors[['RM-RF']], vw_industries)
print(regression_results)

def residuals_covariance_matrix(residuals):
    """Covariance matrix of the regression residuals across portfolios.

    Args:
        residuals: Iterable (e.g. the ``'Residuals'`` column) of pandas
            Series, one residual series per portfolio.

    Returns:
        The covariance of the residual series as returned by ``np.cov``:
        an ``N x N`` array for ``N`` portfolios (a 0-d array when ``N == 1``).
        ``np.cov`` is called with its default normalisation, i.e. it divides
        by ``T - 1``, so this is neither the maximum-likelihood (divide by
        ``T``) estimate nor the regression-adjusted one.
        NOTE(review): confirm which normalisation the GRS formula used
        downstream expects.
    """
    # Join the series on their index so every row is one date. Stacking the
    # raw values positionally would fail for series of different length and
    # would silently mix dates if different rows had been dropped.
    aligned = pd.concat(list(residuals), axis=1, join='inner').dropna()

    # Rows are periods and columns are portfolios, hence rowvar=False.
    return np.cov(aligned.to_numpy(), rowvar=False)

# Residual covariance across the test portfolios; passed to grs_statistic as sigma_hat below.
sigma_hat = residuals_covariance_matrix(regression_results['Residuals'])

# Now lets compute the sample means and sample covariance matrix 
def sample_means_and_covariance(returns):
    """Column means and covariance of a periods-by-series frame.

    Args:
        returns: DataFrame with one row per period and one column per series.

    Returns:
        Tuple ``(means, covariance)``: the per-column means as a 1-D array and
        the ``np.cov`` result computed with columns as variables (a 0-d array
        when there is a single column).
    """
    observations = returns.values
    column_means = np.mean(observations, axis=0)
    # rowvar=False: each column is a variable, each row an observation.
    column_covariance = np.cov(observations, rowvar=False)
    return column_means, column_covariance

# Factor moments for the single market factor 'RM-RF': its sample mean and
# sample (co)variance, used as m_hat and omega_hat in the GRS statistic.
sample_means, sample_covariance_matrix = sample_means_and_covariance(factors[['RM-RF']])
print(sample_means, sample_covariance_matrix)

def grs_statistic(T, N, K, alpha_vector, sigma_hat, m_hat, omega_hat):
    """Compute the GRS test statistic and its p-value.

    The raw statistic is ``W = alpha' Sigma^-1 alpha / (1 + m' Omega^-1 m)``;
    it is scaled by ``(T - N - K) / N`` and compared with an
    ``F(N, T - N - K)`` distribution.

    Args:
        T (int): Number of time periods.
        N (int): Number of test assets/portfolios.
        K (int): Number of factors.
        alpha_vector (array-like): ``N`` estimated intercepts (any shape with
            ``N`` elements).
        sigma_hat (array-like): ``N x N`` covariance matrix of residuals.
        m_hat (array-like): ``K`` factor means (any shape with ``K`` elements).
        omega_hat (array-like): ``K x K`` covariance matrix of factors; a
            scalar is accepted when ``K == 1``.

    Returns:
        tuple: ``(W, F, p_value)`` with the unscaled statistic, the scaled
        F-statistic and the upper-tail probability of ``F``.

    Raises:
        ValueError: If ``T - N - K <= 0`` (the F distribution would have
            non-positive denominator degrees of freedom) or if an input does
            not have the number of elements implied by ``N`` / ``K``.

    NOTE(review): the exact finite-sample F result is usually stated for
    maximum-likelihood (divide-by-T) estimates of Sigma and Omega; confirm
    that the inputs use the intended normalisation.
    """
    df1 = N
    df2 = T - N - K
    if df2 <= 0:
        raise ValueError(
            f"GRS test requires T > N + K, got T={T}, N={N}, K={K}"
        )

    # Coerce to float arrays so lists, pandas objects and plain scalars all work.
    alpha_vector = np.asarray(alpha_vector, dtype=float).reshape(-1, 1)
    m_hat = np.asarray(m_hat, dtype=float).reshape(-1, 1)
    if alpha_vector.shape[0] != N:
        raise ValueError(f"alpha_vector has {alpha_vector.shape[0]} elements, expected N={N}")
    if m_hat.shape[0] != K:
        raise ValueError(f"m_hat has {m_hat.shape[0]} elements, expected K={K}")
    # reshape raises ValueError when the element count does not match.
    sigma_hat = np.asarray(sigma_hat, dtype=float).reshape(N, N)
    omega_hat = np.asarray(omega_hat, dtype=float).reshape(K, K)

    # Quadratic forms via linear solves, which avoids forming explicit inverses.
    numerator_quadratic = (alpha_vector.T @ solve(sigma_hat, alpha_vector))[0, 0]
    denominator = 1 + (m_hat.T @ solve(omega_hat, m_hat))[0, 0]

    # Unscaled GRS statistic (W).
    grs_stat_w = numerator_quadratic / denominator

    # Scale to the F(N, T-N-K) statistic.
    scaling_factor_f = df2 / N
    grs_stat_normalised = grs_stat_w * scaling_factor_f

    # Upper-tail probability under the null that all alphas are zero.
    p_value = f.sf(grs_stat_normalised, df1, df2)

    return grs_stat_w, grs_stat_normalised, p_value

# Number of factors: one column ('RM-RF') in the factor frame used for the regressions.
K = factors[['RM-RF']].shape[1] 

# Give the factor moments explicit (K, 1) and (K, K) shapes; with a single
# factor np.cov returns a 0-d array, which reshape turns into a 1 x 1 matrix.
m_hat_reshaped = sample_means.reshape(K, 1)
omega_hat_reshaped = sample_covariance_matrix.reshape(K, K)

# T and N are the row and column counts of vw_industries.
# NOTE(review): the regressions are fitted with missing='drop', so if any rows
# were dropped, T here is larger than the sample actually used - confirm.
grs_stat_w, grs_stat_f, p_value = grs_statistic(
    T=vw_industries.shape[0], 
    N=vw_industries.shape[1], 
    K=K, 
    alpha_vector=regression_results['Alpha'].values, 
    sigma_hat=sigma_hat, # residual covariance matrix computed above
    m_hat=m_hat_reshaped, 
    omega_hat=omega_hat_reshaped
)

print(f"GRS Statistic (W): {grs_stat_w}, Scaled GRS F-Statistic: {grs_stat_f}, p-value: {p_value}")

''' basically the hypothesis test in a GRS tests to see if the alphas are jointly equal to zero. 
Jointly equal to zero means that there is no portfolio that can beat the market portfolio.'''

              Alpha      Beta  \
Portfolio                       
Food       0.218167  0.739370   
Beer       0.317082  0.942062   
Smoke      0.482244  0.628665   
Games     -0.059115  1.388648   
Books     -0.076998  1.107988   
Hshld      0.057837  0.902271   
Clths      0.117064  0.813177   
Hlth       0.255263  0.839263   
Chems      0.103032  1.042579   
Txtls     -0.017285  1.138909   
Cnstr     -0.102418  1.172375   
Steel     -0.243096  1.357006   
FabPr     -0.035984  1.241626   
ElcEq      0.057924  1.284064   
Autos     -0.012242  1.252084   
Carry      0.073151  1.188701   
Mines      0.042680  0.908179   
Coal      -0.050477  1.299820   
Oil        0.185034  0.868141   
Util       0.093951  0.778292   
Telcm      0.153157  0.661480   
Servs      0.397597  0.812840   
BusEq      0.135660  1.077502   
Paper      0.132112  0.954503   
Trans     -0.091217  1.137771   
Whlsl     -0.158732  1.089085   
Rtail      0.120500  0.965016   
Meals      0.163777  0.945508   
Fin       

' basically the hypothesis test in a GRS tests to see if the alphas are jointly equal to zero. \nJointly equal to zero means that there is no portfolio that can beat the market portfolio.'