# Mini Project 2

**2025 Introduction to Quantiative Methods in Finance**

**The Erdös Institute**


###  Hypothesis Testing of Standard Assumptions Theoretical Financial Mathematics

In the theory of mathematical finance, it is common to assume the log returns of a stock/index are normally distributed.


Investigate if the log returns of stocks or indexes of your choosing are normally distributed. Some suggestions for exploration include:

    1) Test if there are period of times when the log-returns of a stock/index have evidence of normal distribution.
    
    2) Test if removing extremal return data creates a distribution with evidence of being normal.
    
    3) Create a personalized portfolio of stocks with historical log return data that is normally distributed.
    
    4) Test if the portfolio you created in the first mini-project has significant periods of time with evidence of normally distributed log returns.
    
    5) Gather x-number of historical stock data and just perform a normality test on their log return data to see if any of the stocks exhibit evidence of log returns that are normally distributed.

In [14]:
import pandas as pd
from curl_cffi import requests
import yfinance as yf
session = requests.Session(impersonate="chrome")
ticker = yf.Ticker("AAPL", session=session)
hist = ticker.history(period="1mo")
print(hist.tail())



                                 Open        High         Low       Close  \
Date                                                                        
2025-06-23 00:00:00-04:00  201.630005  202.300003  198.960007  201.500000   
2025-06-24 00:00:00-04:00  202.589996  203.440002  200.199997  200.300003   
2025-06-25 00:00:00-04:00  201.449997  203.669998  200.619995  201.559998   
2025-06-26 00:00:00-04:00  201.429993  202.639999  199.460007  201.000000   
2025-06-27 00:00:00-04:00  201.895004  203.220001  200.220001  201.080002   

                             Volume  Dividends  Stock Splits  
Date                                                          
2025-06-23 00:00:00-04:00  55814300        0.0           0.0  
2025-06-24 00:00:00-04:00  54064000        0.0           0.0  
2025-06-25 00:00:00-04:00  39525700        0.0           0.0  
2025-06-26 00:00:00-04:00  50799100        0.0           0.0  
2025-06-27 00:00:00-04:00  70534466        0.0           0.0  


In [3]:
hist.to_csv("AAPL_1y")
log_rets = hist

In [4]:

import numpy as np
log_rets['Log_Returns'] = np.log(log_rets['Close'] / log_rets['Close'].shift(1))

log_rets = log_rets["Log_Returns"]
log_rets.dropna(axis = 0, inplace=True)

In [5]:
#1. normality test: Shapiro-Wilk
import scipy
stat, p = scipy.stats.shapiro(log_rets)
if p>0.05:
    print(p,  " This is normally distributed. ")


0.8206809198692798  This is normally distributed. 


In [7]:
#2 removing extreme data: 
std_dev = log_rets.std()
mu = log_rets.mean()


log_rets_filtered = log_rets[(log_rets > mu-3*std_dev) & (log_rets < mu+3*std_dev)]
stat1, p1 = scipy.stats.shapiro(log_rets_filtered)
print(p1)
#0.8206809198692798 this is still normally distributed.

0.8206809198692798


In [8]:
low_percentile = np.percentile(log_rets.dropna(), 1)
high_percentile = np.percentile(log_rets.dropna(), 99)
filtered_returns = log_rets[(log_rets >= low_percentile) & (log_rets <= high_percentile)]

stat, p = scipy.stats.shapiro(filtered_returns)
print(f'Shapiro-Wilk Test Statistic (filtered): {stat}, p-value: {p}')
#filter out super high and low values

Shapiro-Wilk Test Statistic (filtered): 0.9527903231827752, p-value: 0.4402120924489167


In [9]:
#3Create a personalized portfolio of stocks with historical log return data that is normally distributed.
stocks_to_buy = [
    "MSFT","NVDA","AAPL","GOOG","AMZN","META",
    "AVGO","BRK-A","WMT","JPM","V","MA","XOM"
]
data = yf.download(stocks_to_buy, start="2023-01-01", end="2025-01-01")['Close']


  data = yf.download(stocks_to_buy, start="2023-01-01", end="2025-01-01")['Close']
[*********************100%***********************]  13 of 13 completed


In [10]:
data.to_csv("Aggregate_Data.csv")
log_returns = np.log(data / data.shift(1))
log_returns.to_csv("Log_Agg_Data.csv")

In [11]:
portfolio_daily_avg_returns = log_returns.mean(axis =0)
portfolio_daily_avg_returns

Ticker
AAPL     0.001407
AMZN     0.001873
AVGO     0.002931
BRK-A    0.000742
GOOG     0.001510
JPM      0.001250
MA       0.000857
META     0.003094
MSFT     0.001160
NVDA     0.004470
V        0.000872
WMT      0.001322
XOM      0.000154
dtype: float64

In [12]:
weights1 = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05])  
weights2 = np.array([0.2, 0.2, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05])  
weights3 = np.array([0.9,0,0,0.1,0,0,0,0,0,0,0,0,0])

In [15]:
def calculate_portfolio_returns(log_returns, weights):
    portfolio_returns = log_returns.dot(weights)  
    return portfolio_returns

portfolio_returns1 = calculate_portfolio_returns(log_returns, weights1)
portfolio_returns2 = calculate_portfolio_returns(log_returns, weights2)
portfolio_returns3 = calculate_portfolio_returns(log_returns, weights3)


portfolio_df = pd.DataFrame({
    'Portfolio 1': portfolio_returns1,
    'Portfolio 2': portfolio_returns2,
    'Portfolio 3': portfolio_returns3
})
portfolio_returns1.dropna(inplace= True, axis = 0)
portfolio_returns2.dropna(inplace= True, axis = 0)
portfolio_returns3.dropna(inplace= True, axis = 0)


In [17]:
def test_normality(data, portfolio_name):
    stat, p = scipy.stats.shapiro(data)
    print(f'{portfolio_name} - Shapiro-Wilk Test Statistic: {stat}, p-value: {p}')
    if p > 0.05:
        print(f'{portfolio_name} log returns are likely normally distributed.\n', p)
    else:
        print(f'{portfolio_name} log returns are not normally distributed.\n', p)
test_normality(portfolio_returns1, 'Portfolio 1')
test_normality(portfolio_returns2, 'Portfolio 2')
test_normality(portfolio_returns3,'Portfolio 3')

Portfolio 1 - Shapiro-Wilk Test Statistic: 0.9863480771000245, p-value: 0.00012171269497245295
Portfolio 1 log returns are not normally distributed.
 0.00012171269497245295
Portfolio 2 - Shapiro-Wilk Test Statistic: 0.9896538780536737, p-value: 0.0013388183655660474
Portfolio 2 log returns are not normally distributed.
 0.0013388183655660474
Portfolio 3 - Shapiro-Wilk Test Statistic: 0.9742563317242638, p-value: 1.0291492731505707e-07
Portfolio 3 log returns are not normally distributed.
 1.0291492731505707e-07


Portfolio 1 - Shapiro-Wilk Test Statistic: 0.9863480771000245, p-value: 0.00012171269497245295
Portfolio 1 log returns are not normally distributed.
 0.00012171269497245295
Portfolio 2 - Shapiro-Wilk Test Statistic: 0.9896538780536737, p-value: 0.0013388183655660474
Portfolio 2 log returns are not normally distributed.
 0.0013388183655660474
Portfolio 3 - Shapiro-Wilk Test Statistic: 0.9742563317242638, p-value: 1.0291492731505707e-07
Portfolio 3 log returns are not normally distributed.
 1.0291492731505707e-07

In [None]:
portfolio_weights =  [0.16343286, 0.1268183,  0.07835773, 0.0982189,  0.00558896, 0.05585723,
 0.01619433, 0.04599168, 0.04329709, 0.1329311,  0.02189666, 0.20684973,
 0.00456543]
returns = calculate_portfolio_returns(log_returns, portfolio_weights)
returns.dropna(inplace =True, axis = 0)
test_normality(returns, "Current Portfolio")
#Current Portfolio - Shapiro-Wilk Test Statistic: 0.9941318528080283, p-value: 0.05058236159862364
#Current Portfolio log returns are likely normally distributed.
 #0.05058236159862364

Current Portfolio - Shapiro-Wilk Test Statistic: 0.9941318528080283, p-value: 0.05058236159862364
Current Portfolio log returns are likely normally distributed.
 0.05058236159862364


In [None]:


def brute_force_find_normal_portfolio(log_returns, num_trials=1000):
    num_stocks = log_returns.shape[1]
    for _ in range(num_trials):

        weights = np.random.rand(num_stocks)
        weights /= np.sum(weights) 
        portfolio_returns = calculate_portfolio_returns(log_returns, weights)

        portfolio_returns.dropna(inplace=True, axis=0)
        if test_normality(portfolio_returns, f"Random {weights}"):
            return weights  
    print("No portfolio found in 10000 trials.")
    return None  
weights = brute_force_find_normal_portfolio(log_returns, num_trials=10000)
if weights is not None:
    print("Found portfolio with normally distributed returns:", weights)
else:
    print("No normally distributed portfolio found.")


Random Portfolio [0.06104797 0.11115009 0.15928597 0.12792243 0.04799347 0.10358002
 0.04185219 0.08421229 0.01289186 0.00770048 0.0790189  0.09783108
 0.06551326] - Shapiro-Wilk Test Statistic: 0.9850802757367764, p-value: 5.167936505675497e-05
Random Portfolio [0.15546248 0.0086351  0.10572463 0.08519284 0.10880034 0.04665317
 0.04865301 0.07328923 0.01662453 0.13809754 0.06095734 0.03971887
 0.11219092] - Shapiro-Wilk Test Statistic: 0.9889340984828021, p-value: 0.0007775535774279054
Random Portfolio [0.0362548  0.02168212 0.05286529 0.15080185 0.01431281 0.07681376
 0.03230201 0.12440518 0.10125792 0.16250554 0.01211809 0.11307262
 0.101608  ] - Shapiro-Wilk Test Statistic: 0.9872546414837546, p-value: 0.00022925831943590658
Random Portfolio [0.16029484 0.01376087 0.14502925 0.04555122 0.06473246 0.14894746
 0.04585583 0.00283928 0.0530662  0.12772592 0.04709756 0.01232157
 0.13277754] - Shapiro-Wilk Test Statistic: 0.9865274484325077, p-value: 0.00013776433796035698
Random Portfol

In [19]:
    
   # 4) Test if the portfolio you created in the first mini-project has significant periods of
   #  time with evidence of normally distributed log returns.
log_returns = pd.read_csv("Log_Agg_Data.csv")
log_returns.head()
log_returns1 = log_returns.drop("Date", axis= 1)

In [20]:

MinVarWeights=  [6.94375823e-17, 4.71351734e-01, 0.00000000e+00, 2.69149531e-01,
 0.00000000e+00, 1.64223805e-17, 0.00000000e+00, 0.00000000e+00,
 4.03689355e-17, 9.98897644e-17,8.09704852e-18 ,2.59498735e-01,
 0.00000000e+00]

MinVarReturns = log_returns1.dot(MinVarWeights)

In [None]:
window_size = 10
total_windows = len(MinVarReturns) - window_size + 1

normal_windows = 0
for start in range(total_windows):
    window = MinVarReturns.iloc[start : start + window_size]
    stat, p_value = scipy.stats.shapiro(window)
    if p_value > 0.05:
        normal_windows += 1
print(normal_windows)
#457; this has 457 periods where it looks likes its normally distributed. 

457


In [None]:
#5    5) Gather x-number of historical stock data and just perform a 
# normality test on their log return data to see if any of the stocks exhibit evidence of log returns that are normally distributed.
log_returns.head()


Unnamed: 0,Date,AAPL,AMZN,AVGO,BRK-A,GOOG,JPM,MA,META,MSFT,NVDA,V,WMT,XOM
0,2023-01-03,,,,,,,,,,,,,
1,2023-01-04,0.010262,-0.007955,0.01214,0.014242,-0.011098,0.009282,0.023792,0.020865,-0.044729,0.029867,0.024859,0.001113,0.002906
2,2023-01-05,-0.010661,-0.024012,-0.009361,-0.005474,-0.022112,-0.000222,-0.009563,-0.003382,-0.030086,-0.033366,-0.00708,-0.003414,0.022128
3,2023-01-06,0.036133,0.034992,0.058454,0.017361,0.015892,0.018955,0.04583,0.023974,0.011716,0.040797,0.030968,0.024204,0.012014
4,2023-01-09,0.00408,0.01476,-0.019806,-0.007435,0.007233,-0.004141,0.008935,-0.004239,0.009689,0.050459,0.003896,-0.012546,-0.018813


In [27]:
log_returns1 = log_returns.drop("Date", axis= 1)
columns = log_returns1.columns

In [33]:
for col in log_returns1.columns:
    series = log_returns1[col]
    total_windows = len(series) - window_size + 1
    normal_count = 0

    for start in range(total_windows):
        window = series.iloc[start : start + window_size]
        stat, p_value = scipy.stats.shapiro(window)
        if p_value > 0.05:
            normal_count += 1

    pct = normal_count / total_windows * 100
    print(normal_count, pct)

439 89.04665314401623
453 91.88640973630832
442 89.65517241379311
456 92.49492900608519
423 85.80121703853956
414 83.97565922920892
461 93.50912778904666
426 86.40973630831643
443 89.8580121703854
441 89.4523326572008
430 87.2210953346856
440 89.24949290060852
447 90.66937119675457


439 89.04665314401623
453 91.88640973630832
442 89.65517241379311
456 92.49492900608519
423 85.80121703853956
414 83.97565922920892
461 93.50912778904666
426 86.40973630831643
443 89.8580121703854
441 89.4523326572008
430 87.2210953346856
440 89.24949290060852
447 90.66937119675457