In [17]:
! pip install plotly --quiet
! pip install pyarrow --quiet

import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.stattools import grangercausalitytests

In [18]:
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')
print(con_gdelt.execute("SELECT * FROM gdelt LIMIT 1").fetch_df().columns)
con_gdelt.close()

con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')
print(con_yfinance.execute("SELECT * FROM yfinance LIMIT 1").fetch_df().columns)
con_yfinance.close()

Index(['GlobalEventID', 'Day', 'MonthYear', 'Year', 'FractionDate',
       'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
       'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
       'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode',
       'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources',
       'NumArticles', 'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_CountryCode',
       'Actor2Geo_Type', 'Actor2Geo_CountryCode', 'ActionGeo_Type',
       'ActionGeo_CountryCode', 'SOURCEURL'],
      dtype='object')
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Sector',
       'Stock', 'Date'],
      dtype='object')


---

# 2019

## Monthly

In [19]:
# Connect to DuckDB (create a single connection)
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')

# Load and aggregate GDELT data by month for 2019
gdelt_monthly_df = con_gdelt.execute("""
    SELECT 
        STRFTIME(STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d'), '%Y-%m') AS Month,
        COUNT(GlobalEventID) AS EventCount,
        AVG(AvgTone) AS AvgTone
    FROM gdelt
    WHERE STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Month
""").fetch_df()

# Close the GDELT connection
con_gdelt.close()


In [20]:
# Connect to Yahoo Finance database
con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')

# Load and aggregate Yahoo Finance data by month for 2019
yfinance_monthly_df = con_yfinance.execute("""
    SELECT 
        STRFTIME(Date, '%Y-%m') AS Month,
        Stock,
        Sector,
        AVG(Close) AS AvgClose
    FROM yfinance 
    WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Month, Stock, Sector
""").fetch_df()


# Close the Yahoo Finance connection
con_yfinance.close()


In [21]:
# Merge the datasets on Month
merged_monthly_df = pd.merge(gdelt_monthly_df, yfinance_monthly_df, on='Month', how='inner')

In [22]:
# Plot the relationship between EventCount and AvgClose by Stock
fig = px.scatter(merged_monthly_df, x='EventCount', y='AvgClose', color='Stock', title='EventCount vs AvgClose by Stock')
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [23]:
# Plot the relationship between EventCount and AvgClose by Stock and Sector
fig = px.scatter(merged_monthly_df, 
                 x='EventCount', 
                 y='AvgClose', 
                 color='Stock', 
                 facet_col='Sector', 
                 title='EventCount vs AvgClose by Stock and Sector (2019)',)

fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)

fig.show()

In [24]:
merged_monthly_df['PctChangeClose'] = merged_monthly_df.groupby('Stock')['AvgClose'].pct_change()


In [25]:
from statsmodels.tsa.stattools import grangercausalitytests

# Prepare the data for Granger causality test
test_data = merged_monthly_df[['EventCount', 'PctChangeClose']].dropna()

# Perform Granger causality test with 1 to 3 lags
granger_result = grangercausalitytests(test_data, maxlag=3, verbose=True)



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2431  , p=0.6224  , df_denom=265, df_num=1
ssr based chi2 test:   chi2=0.2458  , p=0.6200  , df=1
likelihood ratio test: chi2=0.2457  , p=0.6201  , df=1
parameter F test:         F=0.2431  , p=0.6224  , df_denom=265, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5860  , p=0.5573  , df_denom=262, df_num=2
ssr based chi2 test:   chi2=1.1943  , p=0.5504  , df=2
likelihood ratio test: chi2=1.1916  , p=0.5511  , df=2
parameter F test:         F=0.5860  , p=0.5573  , df_denom=262, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3967  , p=0.7555  , df_denom=259, df_num=3
ssr based chi2 test:   chi2=1.2224  , p=0.7476  , df=3
likelihood ratio test: chi2=1.2196  , p=0.7483  , df=3
parameter F test:         F=0.3967  , p=0.7555  , df_denom=259, df_num=3



verbose is deprecated since functions should not print results



In [26]:
# Prepare data for Granger test using AvgTone
test_data_sentiment = merged_monthly_df[['AvgTone', 'PctChangeClose']].dropna()

# Perform Granger causality test
granger_result_sentiment = grangercausalitytests(test_data_sentiment, maxlag=3, verbose=True)



Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2736  , p=0.6014  , df_denom=265, df_num=1
ssr based chi2 test:   chi2=0.2767  , p=0.5989  , df=1
likelihood ratio test: chi2=0.2766  , p=0.5990  , df=1
parameter F test:         F=0.2736  , p=0.6014  , df_denom=265, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1785  , p=0.8366  , df_denom=262, df_num=2
ssr based chi2 test:   chi2=0.3638  , p=0.8337  , df=2
likelihood ratio test: chi2=0.3635  , p=0.8338  , df=2
parameter F test:         F=0.1785  , p=0.8366  , df_denom=262, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.1234  , p=0.9462  , df_denom=259, df_num=3
ssr based chi2 test:   chi2=0.3802  , p=0.9443  , df=3
likelihood ratio test: chi2=0.3800  , p=0.9443  , df=3
parameter F test:         F=0.1234  , p=0.9462  , df_denom=259, df_num=3



verbose is deprecated since functions should not print results



## Daily

In [27]:
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')
con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')

# Create daily aggregates for both GDELT and Yahoo Finance datasets
gdelt_daily_df = con_gdelt.execute("""
    SELECT 
        STRFTIME(STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d'), '%Y-%m-%d') AS Date,
        COUNT(GlobalEventID) AS EventCount,
        AVG(AvgTone) AS AvgTone
    FROM gdelt
    WHERE STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Date
""").fetch_df()

# Aggregate Yahoo Finance data by day
yfinance_daily_df = con_yfinance.execute("""
    SELECT 
        STRFTIME(Date, '%Y-%m-%d') AS Date,
        Stock,
        Sector,
        AVG(Close) AS AvgClose
    FROM yfinance 
    WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
    GROUP BY Date, Stock, Sector
""").fetch_df()


# Merge datasets on Date
merged_daily_df = pd.merge(gdelt_daily_df, yfinance_daily_df, on='Date', how='inner')

# Create time-lagged columns to examine stock price changes after events
for lag in [1, 3, 7]:
    merged_daily_df[f'AvgClose_Lag{lag}'] = merged_daily_df.groupby('Stock')['AvgClose'].shift(-lag)


In [28]:
merged_daily_df = merged_daily_df.dropna()
merged_daily_df

Unnamed: 0,Date,EventCount,AvgTone,Stock,Sector,AvgClose,AvgClose_Lag1,AvgClose_Lag3,AvgClose_Lag7
0,2019-01-29,193281,-2.219368,AAPL,tech,38.669998,43.742500,43.767502,49.875000
1,2019-01-29,193281,-2.219368,CVX,energy,111.830002,122.029999,113.849998,125.540001
2,2019-01-29,193281,-2.219368,XOM,energy,71.510002,80.000000,70.769997,81.930000
3,2019-01-29,193281,-2.219368,V,finance,135.000000,149.470001,161.330002,157.490005
4,2019-01-29,193281,-2.219368,FCX,materials,10.450000,12.870000,9.710000,13.380000
...,...,...,...,...,...,...,...,...,...
5971,2019-09-27,181217,-2.085142,CVX,energy,118.599998,119.849998,121.190002,124.680000
5972,2019-09-27,181217,-2.085142,PG,food,124.570000,124.470001,104.239998,115.440002
5973,2019-09-27,181217,-2.085142,FCX,materials,9.650000,13.010000,11.280000,11.150000
5974,2019-09-27,181217,-2.085142,MSFT,tech,137.729996,157.589996,125.500000,136.419998


In [29]:
fig = px.scatter(merged_daily_df, x='EventCount', y=f'AvgClose', color='Stock', 
                     title=f'EventCount vs AvgClose by Stock (2019)')
    
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)

fig.show()

In [30]:
# for lag in [1, 3, 7]:
#     fig = px.scatter(merged_daily_df, x='EventCount', y=f'AvgClose_Lag{lag}', color='Stock',
#                      title=f'EventCount vs AvgClose with {lag}-Day Lag')
    
#     fig.update_layout(
#         autosize=False,
#         width=1000,
#         height=800,
#         margin=dict(
#             l=50,
#             r=50,
#             b=100,
#             t=100,
#             pad=4
#         ),
#         paper_bgcolor="LightSteelBlue",
#     )
    
#     fig.show()


In [31]:
fig = px.scatter(merged_daily_df, x='EventCount', y='AvgClose', color='Stock',
                 facet_col='Sector', title='Sector-Wise Impact of Events on Stock Prices')

fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)

fig.show()
