In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from feature_engineering import load_mock_data, calculate_linguistic_features


In [2]:
def get_stock_data(tickers, start_date, end_date):
    """Fetch historical stock data from Yahoo Finance."""
    data = yf.download(tickers, start=start_date, end=end_date, group_by='ticker')
    return data

def calculate_volatility(stock_data, window=90):
    """Calculate rolling volatility for each stock."""
    # Use log returns for volatility calculation
    log_returns = np.log(stock_data['Close'] / stock_data['Close'].shift(1))
    volatility = log_returns.rolling(window=window).std() * np.sqrt(window)
    return volatility.rename('volatility')


In [3]:
# 1. Load linguistic features
mock_data = load_mock_data()
features_df = calculate_linguistic_features(mock_data)
features_df['date'] = pd.to_datetime(features_df['date'])
features_df.head()


Unnamed: 0,ticker,date,speaker,complexity_score,sentiment_score,generalizing_score,self_reference_score,future_tense_ratio,past_tense_ratio
0,ENRN,1999-04-12,CEO,9.242647,0.9601,0.025,0.175,0.15,0.15
1,ENRN,1999-04-12,CFO,10.694706,0.9136,0.0,0.153846,0.128205,0.128205
2,ENRN,2000-10-27,CEO,17.305606,0.8225,0.025,0.05,0.2,0.2
3,ENRN,2000-10-27,CFO,18.166667,0.91,0.04,0.02,0.12,0.12
4,AAPL,2023-01-25,CEO,10.243636,0.9359,0.0,0.122449,0.183673,0.183673


In [4]:
# 2. Get stock data
tickers = features_df['ticker'].unique().tolist()
start_date = features_df['date'].min() - pd.Timedelta(days=1)
# We need data for 90 days after the last earnings call to calculate future volatility
end_date = features_df['date'].max() + pd.Timedelta(days=91) 

all_stock_data = []
for ticker in tickers:
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    if not stock_data.empty:
        stock_data['ticker'] = ticker
        all_stock_data.append(stock_data)

if all_stock_data:
    stock_data = pd.concat(all_stock_data)
    stock_data.head()
else:
    print("Could not download any stock data.")

  stock_data = yf.download(ticker, start=start_date, end=end_date)
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: ENRN"}}}
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ENRN']: YFTzMissingError('possibly delisted; no timezone found')
  stock_data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [5]:
# 3. Calculate volatility
if not stock_data.empty:
    stock_data['returns'] = stock_data.groupby('ticker')['Close'].pct_change()
    stock_data['volatility'] = stock_data.groupby('ticker')['returns'].rolling(window=30).std().reset_index(0,drop=True) * np.sqrt(252)
    stock_data = stock_data.reset_index()
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])

    # 4. Merge features with future volatility
    # For each earnings call, we want to see the volatility in the *next* 90 days.
    # We can approximate this by taking the average volatility in the 90 days following the call.
    
    merged_data = []
    for index, row in features_df.iterrows():
        call_date = row['date']
        future_volatility_period = stock_data[
            (stock_data['ticker'] == row['ticker']) &
            (stock_data['Date'] > call_date) &
            (stock_data['Date'] <= call_date + pd.Timedelta(days=90))
        ]
        
        if not future_volatility_period.empty:
            avg_future_volatility = future_volatility_period['volatility'].mean()
            
            new_row = row.to_dict()
            new_row['avg_future_volatility'] = avg_future_volatility
            merged_data.append(new_row)

    final_df = pd.DataFrame(merged_data)
    if not final_df.empty:
        final_df.head()
    else:
        print("Could not merge features with volatility data.")


In [6]:
# 5. Correlation Analysis
if not final_df.empty:
    correlation = final_df[['complexity_score', 'sentiment_score', 'generalizing_score', 'self_reference_score', 'future_tense_ratio', 'past_tense_ratio', 'avg_future_volatility']].corr()
    
    print("Correlation Matrix:")
    display(correlation)
    
    print("\nCorrelation with future volatility:")
    display(correlation['avg_future_volatility'].sort_values(ascending=False))


Correlation Matrix:


Unnamed: 0,complexity_score,sentiment_score,generalizing_score,self_reference_score,future_tense_ratio,past_tense_ratio,avg_future_volatility
complexity_score,1.0,1.0,,1.0,1.0,1.0,
sentiment_score,1.0,1.0,,1.0,1.0,1.0,
generalizing_score,,,,,,,
self_reference_score,1.0,1.0,,1.0,1.0,1.0,
future_tense_ratio,1.0,1.0,,1.0,1.0,1.0,
past_tense_ratio,1.0,1.0,,1.0,1.0,1.0,
avg_future_volatility,,,,,,,



Correlation with future volatility:


complexity_score        NaN
sentiment_score         NaN
generalizing_score      NaN
self_reference_score    NaN
future_tense_ratio      NaN
past_tense_ratio        NaN
avg_future_volatility   NaN
Name: avg_future_volatility, dtype: float64