In [2]:
! pip install plotly --quiet
! pip install pyarrow --quiet

import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [39]:
# Open the GDELT database and print the column names of its `gdelt` table;
# a single row is enough to expose the schema.
con_gdelt = duckdb.connect('eda-ddb/eda-gdelt.ddb')
gdelt_preview = con_gdelt.execute("SELECT * FROM gdelt LIMIT 1").fetch_df()
print(gdelt_preview.columns)

# Same schema check for the `yfinance` table in the Yahoo Finance database.
con_yfinance = duckdb.connect('eda-ddb/eda-yfinance.ddb')
yfinance_preview = con_yfinance.execute("SELECT * FROM yfinance LIMIT 1").fetch_df()
print(yfinance_preview.columns)

Index(['GlobalEventID', 'Day', 'MonthYear', 'Year', 'FractionDate',
       'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
       'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
       'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode',
       'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources',
       'NumArticles', 'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_CountryCode',
       'Actor2Geo_Type', 'Actor2Geo_CountryCode', 'ActionGeo_Type',
       'ActionGeo_CountryCode', 'SOURCEURL'],
      dtype='object')
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Sector',
       'Stock', 'Date'],
      dtype='object')


In [55]:
# (Re)create `gdelt_clean` inside the GDELT database: only events whose `Year`
# is 2019, reduced to the id, tone/scale/mention metrics, and `EventDate`
# parsed from the `Day` column (cast to text, format YYYYMMDD).
create_gdelt_clean_sql = """
CREATE OR REPLACE TABLE gdelt_clean AS
SELECT GlobalEventID, 
       STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') AS EventDate, 
       AvgTone, GoldsteinScale, NumMentions
FROM gdelt
WHERE Year = 2019
"""
con_gdelt.execute(create_gdelt_clean_sql)

# Pull the whole cleaned table into pandas; the bare name on the last line
# renders it as the cell output.
select_gdelt_clean_sql = "SELECT * FROM gdelt_clean"
gdelt_df = con_gdelt.execute(select_gdelt_clean_sql).fetch_df()
gdelt_df

Unnamed: 0,GlobalEventID,EventDate,AvgTone,GoldsteinScale,NumMentions
0,813415804,2019-01-01,0.607903,1.9,8.0
1,813415805,2019-01-01,2.501833,4.0,48.0
2,813415806,2019-01-01,3.339186,1.9,192.0
3,813415807,2019-01-01,-1.510989,3.5,20.0
4,813415808,2019-01-01,-3.867403,-2.0,4.0
...,...,...,...,...,...
56081412,962072061,2019-12-31,-12.032086,-5.0,6.0
56081413,962072062,2019-12-31,-10.223642,0.0,10.0
56081414,962072063,2019-12-31,-12.032086,-5.0,2.0
56081415,962072064,2019-12-31,-12.032086,-5.0,4.0


In [44]:
# Daily closing prices (with ticker and sector) for calendar year 2019.
query = """
SELECT Date, Close, Stock, Sector
FROM yfinance
WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
"""

# Fetch the rows and convert `Date` to pandas datetimes in one chained
# expression, so the frame is built in a single step.
yfinance_df = (
    con_yfinance.execute(query)
    .fetch_df()
    .assign(Date=lambda prices: pd.to_datetime(prices['Date']))
)
yfinance_df

Unnamed: 0,Date,Close,Stock,Sector
0,2019-01-02,39.480000,AAPL,tech
1,2019-01-03,35.547501,AAPL,tech
2,2019-01-04,37.064999,AAPL,tech
3,2019-01-07,36.982498,AAPL,tech
4,2019-01-08,37.687500,AAPL,tech
...,...,...,...,...
6395,2019-12-24,61.279999,VZ,telecom
6396,2019-12-26,61.290001,VZ,telecom
6397,2019-12-27,61.529999,VZ,telecom
6398,2019-12-30,61.209999,VZ,telecom


---

In [26]:
# Aggregate the 2019 GDELT events into one row per calendar month:
#   Month      - 'YYYY-MM' string derived from the `Day` column (YYYYMMDD)
#   EventCount - number of events with a non-null GlobalEventID in that month
#   AvgTone    - mean of AvgTone over that month's events
#
# A dedicated connection to the GDELT database is opened in a `with` block so
# that it is closed even if the query raises. As before, `con` is already
# closed once this cell has finished.
with duckdb.connect('eda-ddb/eda-gdelt.ddb') as con:
    # ORDER BY makes the row order deterministic; GROUP BY alone gives no
    # ordering guarantee, so repeated runs could return months shuffled.
    gdelt_monthly_df = con.execute("""
        SELECT 
            STRFTIME(STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d'), '%Y-%m') AS Month,
            COUNT(GlobalEventID) AS EventCount,
            AVG(AvgTone) AS AvgTone
        FROM gdelt
        WHERE STRPTIME(CAST(Day AS VARCHAR), '%Y%m%d') BETWEEN '2019-01-01' AND '2019-12-31'
        GROUP BY Month
        ORDER BY Month
    """).fetch_df()


In [32]:
# Aggregate 2019 Yahoo Finance prices into one row per (Month, Stock, Sector):
#   Month    - 'YYYY-MM' string derived from `Date`
#   AvgClose - mean closing price of that stock within the month
#
# The connection is opened in a `with` block so that it is closed even if the
# query raises. NOTE: this rebinds `con_yfinance`, and the connection is closed
# when the block exits (as it was with the explicit close()), so any cell that
# queries `con_yfinance` must reconnect before running after this one.
with duckdb.connect('eda-ddb/eda-yfinance.ddb') as con_yfinance:
    # ORDER BY makes the row order deterministic; GROUP BY alone gives no
    # ordering guarantee, which would otherwise leak into downstream frames.
    yfinance_monthly_df = con_yfinance.execute("""
        SELECT 
            STRFTIME(Date, '%Y-%m') AS Month,
            Stock,
            Sector,
            AVG(Close) AS AvgClose
        FROM yfinance 
        WHERE Date BETWEEN '2019-01-01' AND '2019-12-31'
        GROUP BY Month, Stock, Sector
        ORDER BY Month, Stock, Sector
    """).fetch_df()


In [33]:
# Join the month-level GDELT aggregates onto the monthly per-stock prices.
# The inner join on `Month` keeps only months that appear in both frames.
merged_monthly_df = pd.merge(
    left=gdelt_monthly_df,
    right=yfinance_monthly_df,
    how='inner',
    on='Month',
)

In [63]:
# Scatter of monthly GDELT event count against mean closing price,
# with one colour per stock.
fig = px.scatter(
    merged_monthly_df,
    x='EventCount',
    y='AvgClose',
    color='Stock',
    title='EventCount vs AvgClose by Stock',
)

# Fixed-size canvas (autosize off) with explicit margins and a tinted background.
figure_margin = {'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 4}
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=figure_margin,
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [65]:
# Same EventCount-vs-AvgClose scatter, coloured by stock, but split into one
# facet column per sector.
scatter_options = {
    'x': 'EventCount',
    'y': 'AvgClose',
    'color': 'Stock',
    'facet_col': 'Sector',
    'title': 'EventCount vs AvgClose by Stock and Sector (2019)',
}
fig = px.scatter(merged_monthly_df, **scatter_options)

# Fixed-size canvas (autosize off) with explicit margins and a tinted background.
figure_margin = {'l': 50, 'r': 50, 'b': 100, 't': 100, 'pad': 4}
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    margin=figure_margin,
    paper_bgcolor="LightSteelBlue",
)

fig.show()