In [33]:
! pip install plotly --quiet
! pip install pyarrow --quiet

import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from statsmodels.tsa.stattools import grangercausalitytests

In [34]:
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')
print(con_gdelt.execute("SELECT * FROM gdelt LIMIT 1").fetch_df().columns)
con_gdelt.close()

con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')
print(con_yfinance.execute("SELECT * FROM yfinance LIMIT 1").fetch_df().columns)
con_yfinance.close()

Index(['GlobalEventID', 'Day', 'MonthYear', 'Year', 'FractionDate',
       'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
       'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
       'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode',
       'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources',
       'NumArticles', 'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_CountryCode',
       'Actor2Geo_Type', 'Actor2Geo_CountryCode', 'ActionGeo_Type',
       'ActionGeo_CountryCode', 'SOURCEURL'],
      dtype='object')
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Sector',
       'Stock', 'Date'],
      dtype='object')


---

# Question 3
What is the average delay in an event happening and the consequences/effect on the stock market (if any)?

---

# 2019

## Monthly

In [35]:
# Connect to DuckDB (create a single connection)
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')

# Load and aggregate GDELT data by month for 2019
gdelt_monthly_df = con_gdelt.execute("""
    SELECT 
        STRFTIME(STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d'), '%Y-%m') AS Month,
        COUNT(GlobalEventID) AS EventCount,
        AVG(AvgTone) AS AvgTone
    FROM gdelt
    WHERE STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Month
""").fetch_df()

# Close the GDELT connection
con_gdelt.close()

In [36]:
# Connect to Yahoo Finance database
con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')

# Load and aggregate Yahoo Finance data by month for 2019
yfinance_monthly_df = con_yfinance.execute("""
    SELECT 
        STRFTIME(Date, '%Y-%m') AS Month,
        Stock,
        Sector,
        AVG(Close) AS AvgClose
    FROM yfinance 
    WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Month, Stock, Sector
""").fetch_df()


# Close the Yahoo Finance connection
con_yfinance.close()

In [37]:
merged_monthly_df = pd.merge(gdelt_monthly_df, yfinance_monthly_df, on='Month', how='inner')
merged_monthly_df['Month'] = pd.to_datetime(merged_monthly_df['Month'])

### Time-Series Plot of Events and Stock Prices Monthly

In [38]:
# Group by sector then find the average close price for each month
sector_avg_close_df = merged_monthly_df.groupby(['Month', 'Sector'], as_index=False).agg(
    {'AvgClose': 'mean', 'EventCount': 'sum'}
)
sector_avg_close_df

Unnamed: 0,Month,Sector,AvgClose,EventCount
0,2019-01-01,agriculture,100.262619,9487062
1,2019-01-01,energy,72.409762,18974124
2,2019-01-01,finance,145.137937,14230593
3,2019-01-01,food,83.045079,14230593
4,2019-01-01,health,84.916431,14230593
...,...,...,...,...
91,2019-12-01,food,105.325079,12083499
92,2019-12-01,health,89.172952,12083499
93,2019-12-01,materials,35.779524,12083499
94,2019-12-01,tech,76.309083,12083499


In [39]:
# Create subplots with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Define unique sectors for color separation
sectors = sector_avg_close_df['Sector'].unique()

# Create a color palette to assign different colors to each sector
colors = {
    sector: color
    for sector, color in zip(sectors, px.colors.qualitative.Set1)
}

# Add traces for each sector for AvgClose (primary y-axis)
for sector in sectors:
    sector_data = sector_avg_close_df[sector_avg_close_df['Sector'] == sector]
    
    # Plot AvgClose for the sector
    fig.add_trace(
        go.Scatter(
            x=sector_data['Month'], 
            y=sector_data['AvgClose'], 
            mode='lines+markers',
            name=f'{sector} AvgClose',
            line=dict(color=colors[sector]),
        ),
        secondary_y=False  # Primary y-axis
    )

    # Plot EventCount for the sector (secondary y-axis)
    fig.add_trace(
        go.Scatter(
            x=sector_data['Month'], 
            y=sector_data['EventCount'], 
            mode='lines+markers',
            name=f'{sector} EventCount',
            line=dict(dash='dot', color=colors[sector]),  # Dashed line for EventCount
        ),
        secondary_y=True  # Secondary y-axis
    )

# Update y-axis titles
fig.update_yaxes(title_text="AvgClose (Stock Price)", secondary_y=False)
fig.update_yaxes(title_text="EventCount", secondary_y=True)

# Update x-axis title and overall title
fig.update_layout(
    autosize=False,
    width=1200,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    title_text="Average Close Price and Event Count by Sector",
    xaxis_title="Month",
    legend_title="Sector",
)

# Show the plot
fig.show()

### Testing the Hypotheses

#### Granger Causality Test

Then apply a Granger causality test to assess whether past values of EventCount can predict AvgClose changes.

In [40]:
data_lag = merged_monthly_df[['EventCount', 'AvgClose']].dropna()
granger_test_result = grangercausalitytests(data_lag, maxlag=5, verbose=True)
p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
top_5_results = p_values_df.sort_values(by='p-value')
top_5_results


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.2836  , p=0.2582  , df_denom=290, df_num=1
ssr based chi2 test:   chi2=1.2969  , p=0.2548  , df=1
likelihood ratio test: chi2=1.2940  , p=0.2553  , df=1
parameter F test:         F=1.2836  , p=0.2582  , df_denom=290, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.2741  , p=0.2813  , df_denom=287, df_num=2
ssr based chi2 test:   chi2=2.5926  , p=0.2735  , df=2
likelihood ratio test: chi2=2.5811  , p=0.2751  , df=2
parameter F test:         F=1.2741  , p=0.2813  , df_denom=287, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.1258  , p=0.3389  , df_denom=284, df_num=3
ssr based chi2 test:   chi2=3.4605  , p=0.3259  , df=3
likelihood ratio test: chi2=3.4401  , p=0.3286  , df=3
parameter F test:         F=1.1258  , p=0.3389  , df_denom=284, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.9699  , p=0.4243  


verbose is deprecated since functions should not print results



Unnamed: 0,Lag,p-value
0,1,0.258166
1,2,0.281264
2,3,0.338938
3,4,0.424346
4,5,0.558967


In [41]:
data_lag = merged_monthly_df[['AvgClose', 'EventCount']].dropna()
granger_test_result = grangercausalitytests(data_lag, maxlag=5, verbose=True)
p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
top_5_results = p_values_df.sort_values(by='p-value')
top_5_results


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5788  , p=0.4474  , df_denom=290, df_num=1
ssr based chi2 test:   chi2=0.5848  , p=0.4445  , df=1
likelihood ratio test: chi2=0.5842  , p=0.4447  , df=1
parameter F test:         F=0.5788  , p=0.4474  , df_denom=290, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.6376  , p=0.5293  , df_denom=287, df_num=2
ssr based chi2 test:   chi2=1.2973  , p=0.5227  , df=2
likelihood ratio test: chi2=1.2944  , p=0.5235  , df=2
parameter F test:         F=0.6376  , p=0.5293  , df_denom=287, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.5418  , p=0.2039  , df_denom=284, df_num=3
ssr based chi2 test:   chi2=4.7393  , p=0.1919  , df=3
likelihood ratio test: chi2=4.7011  , p=0.1950  , df=3
parameter F test:         F=1.5418  , p=0.2039  , df_denom=284, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=2.6314  , p=0.0346  


verbose is deprecated since functions should not print results



Unnamed: 0,Lag,p-value
3,4,0.034637
4,5,0.042453
2,3,0.203877
0,1,0.447414
1,2,0.529332


## Daily

In [42]:
# Connect to DuckDB (create a single connection)
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')

# Load and aggregate GDELT data by day for 2019
gdelt_daily_df = con_gdelt.execute("""
    SELECT 
        STRFTIME(STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d'), '%Y-%m-%d') AS Day,
        COUNT(GlobalEventID) AS EventCount,
        AVG(AvgTone) AS AvgTone
    FROM gdelt
    WHERE STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Day
""").fetch_df()

# Close the GDELT connection
con_gdelt.close()

In [43]:
# Connect to Yahoo Finance database
con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')

# Load and aggregate Yahoo Finance data by day for 2019
yfinance_daily_df = con_yfinance.execute("""
    SELECT 
        STRFTIME(Date, '%Y-%m-%d') AS Day,
        Stock,
        Sector,
        AVG(Close) AS AvgClose
    FROM yfinance 
    WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Day, Stock, Sector
""").fetch_df()

# Close the Yahoo Finance connection
con_yfinance.close()

In [44]:
merged_daily_df = pd.merge(gdelt_daily_df, yfinance_daily_df, on='Day', how='inner')
merged_daily_df['Day'] = pd.to_datetime(merged_daily_df['Day'])

### Time-Series Plot of Events and Stock Prices Daily

In [45]:
# Group by sector then find the average close price for each day
sector_avg_close_daily_df = merged_daily_df.groupby(['Day', 'Sector'], as_index=False).agg(
    {'AvgClose': 'mean', 'EventCount': 'sum'}
)

In [46]:
# Create subplots with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Define unique sectors for color separation
sectors = sector_avg_close_daily_df['Sector'].unique()

# Create a color palette to assign different colors to each sector
colors = {
    sector: color
    for sector, color in zip(sectors, px.colors.qualitative.Set1)
}

# Add traces for each sector for AvgClose (primary y-axis)
for sector in sectors:
    sector_data = sector_avg_close_daily_df[sector_avg_close_daily_df['Sector'] == sector]
    
    # Plot AvgClose for the sector
    fig.add_trace(
        go.Scatter(
            x=sector_data['Day'], 
            y=sector_data['AvgClose'], 
            mode='lines+markers',
            name=f'{sector} AvgClose',
            line=dict(color=colors[sector]),
        ),
        secondary_y=False  # Primary y-axis
    )

    # Plot EventCount for the sector (secondary y-axis)
    fig.add_trace(
        go.Scatter(
            x=sector_data['Day'], 
            y=sector_data['EventCount'], 
            mode='lines+markers',
            name=f'{sector} EventCount',
            line=dict(dash='dot', color=colors[sector]),  # Dashed line for EventCount
        ),
        secondary_y=True  # Secondary y-axis
    )

# Update y-axis titles
fig.update_yaxes(title_text="AvgClose (Stock Price)", secondary_y=False)
fig.update_yaxes(title_text="EventCount", secondary_y=True)

# Update x-axis title and overall title
fig.update_layout(
    autosize=False,
    width=1400,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    title_text="Average Close Price and Event Count by Sector",
    xaxis_title="Month",
    legend_title="Sector",
)

# Show the plot
fig.show()

### Testing the Hypothesis

#### Granger Causality Test

In [47]:
data_lag = merged_daily_df[['EventCount', 'AvgClose']].dropna()
granger_test_result = grangercausalitytests(data_lag, maxlag=30, verbose=True)
# print(granger_test_result)

p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
top_5_results = p_values_df.sort_values(by='p-value').head(5)
print(top_5_results)

# p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
# p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
# significant_lags = p_values_df[p_values_df['p-value'] < 0.05].sort_values(by='p-value')
# if not significant_lags.empty:
#     first_significant_lag = significant_lags.iloc[0]
#     print(f"The first lag with p-value < 0.05 is Lag {first_significant_lag['Lag']} with p-value {first_significant_lag['p-value']:.4f}.")
# else:
#     print("No significant lags found with p-value < 0.05.")


verbose is deprecated since functions should not print results




Granger Causality
number of lags (no zero) 1
ssr based F test:         F=3.2447  , p=0.0717  , df_denom=6144, df_num=1
ssr based chi2 test:   chi2=3.2463  , p=0.0716  , df=1
likelihood ratio test: chi2=3.2454  , p=0.0716  , df=1
parameter F test:         F=3.2447  , p=0.0717  , df_denom=6144, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.6310  , p=0.1958  , df_denom=6141, df_num=2
ssr based chi2 test:   chi2=3.2646  , p=0.1955  , df=2
likelihood ratio test: chi2=3.2638  , p=0.1956  , df=2
parameter F test:         F=1.6310  , p=0.1958  , df_denom=6141, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.6957  , p=0.0443  , df_denom=6138, df_num=3
ssr based chi2 test:   chi2=8.0962  , p=0.0441  , df=3
likelihood ratio test: chi2=8.0909  , p=0.0442  , df=3
parameter F test:         F=2.6957  , p=0.0443  , df_denom=6138, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=2.9704  , p=0.

In [48]:
data_lag = merged_daily_df[['AvgClose', 'EventCount']].dropna()
granger_test_result = grangercausalitytests(data_lag, maxlag=30, verbose=True)
# print(granger_test_result)

p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
top_5_results = p_values_df.sort_values(by='p-value').head(5)
print(top_5_results)

# p_values = {lag: test[0]['ssr_ftest'][1] for lag, test in granger_test_result.items()}
# p_values_df = pd.DataFrame(p_values.items(), columns=['Lag', 'p-value'])
# significant_lags = p_values_df[p_values_df['p-value'] < 0.05].sort_values(by='p-value')
# if not significant_lags.empty:
#     first_significant_lag = significant_lags.iloc[0]
#     print(f"The first lag with p-value < 0.05 is Lag {first_significant_lag['Lag']} with p-value {first_significant_lag['p-value']:.4f}.")
# else:
#     print("No significant lags found with p-value < 0.05.")


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.7101  , p=0.0998  , df_denom=6144, df_num=1
ssr based chi2 test:   chi2=2.7114  , p=0.0996  , df=1
likelihood ratio test: chi2=2.7108  , p=0.0997  , df=1
parameter F test:         F=2.7101  , p=0.0998  , df_denom=6144, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.7042  , p=0.1820  , df_denom=6141, df_num=2
ssr based chi2 test:   chi2=3.4112  , p=0.1817  , df=2
likelihood ratio test: chi2=3.4102  , p=0.1818  , df=2
parameter F test:         F=1.7042  , p=0.1820  , df_denom=6141, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.3053  , p=0.2708  , df_denom=6138, df_num=3
ssr based chi2 test:   chi2=3.9203  , p=0.2702  , df=3
likelihood ratio test: chi2=3.9191  , p=0.2703  , df=3
parameter F test:         F=1.3053  , p=0.2708  , df_denom=6138, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.7100  , p=0.


verbose is deprecated since functions should not print results



likelihood ratio test: chi2=14.1584 , p=0.0279  , df=6
parameter F test:         F=2.3575  , p=0.0282  , df_denom=6129, df_num=6

Granger Causality
number of lags (no zero) 7
ssr based F test:         F=1.9982  , p=0.0516  , df_denom=6126, df_num=7
ssr based chi2 test:   chi2=14.0218 , p=0.0508  , df=7
likelihood ratio test: chi2=14.0058 , p=0.0511  , df=7
parameter F test:         F=1.9982  , p=0.0516  , df_denom=6126, df_num=7

Granger Causality
number of lags (no zero) 8
ssr based F test:         F=1.8977  , p=0.0559  , df_denom=6123, df_num=8
ssr based chi2 test:   chi2=15.2236 , p=0.0549  , df=8
likelihood ratio test: chi2=15.2048 , p=0.0553  , df=8
parameter F test:         F=1.8977  , p=0.0559  , df_denom=6123, df_num=8

Granger Causality
number of lags (no zero) 9
ssr based F test:         F=1.8207  , p=0.0595  , df_denom=6120, df_num=9
ssr based chi2 test:   chi2=16.4373 , p=0.0583  , df=9
likelihood ratio test: chi2=16.4153 , p=0.0587  , df=9
parameter F test:         F=1.820

## Conclusion

**Summary of Key Findings**
1. No Causal Relationship in Monthly Data
    - p-values > 0.05 across all lags (1, 2, 3)
    - This means event counts do not significantly predict stock price changes (or vice versa) on a monthly scale
2. Significant Causality Found at Daily Level (Lag 22 - 30)
    - In the daily Granger causality test with 30 lags for serveral times, Lag 22 - 30 was found to have a significant p-value < 0.05. This suggests events have a delayed effect on stock prices after 27 days.