# Data Engineering EDA Project

By Aommy, Austin, Ling, Tath (yippee)

-----------------------

## Installing Dependencies

### Minio Object Store dependencies

1. Install minio cli if not already installed.
2. Create client
3. Make bucket (if needed)

In [3]:
! pip install minio --quiet

In [1]:
from minio import Minio

# Create client with access and secret key.
# client = Minio("s3.amazonaws.com", "ACCESS-KEY", "SECRET-KEY")

# Create client with access key and secret key with specific region.
client = Minio(
    "localhost:9000",
    access_key="ROOTNAME",
    secret_key="CHANGEME123",
    secure=False
)

if client.bucket_exists("eda"):
    print("Bucket exists.")
else:
    client.make_bucket("eda")
    print("Bucket created.")

Bucket exists.


#### DuckDB dependencies

In [2]:
!pip uninstall --quiet --yes malloy
!pip install --quiet --upgrade duckdb
!pip install --quiet jupysql==0.10.12
!pip install --quiet duckdb-engine

[0m

## Importing libraries

In [3]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

import io
from urllib.request import urlopen
import zipfile
import os

In [7]:
!pip install --quiet yfinance --upgrade --no-cache-dir

---------------------

## Data Collection

### Data from Global Events

Installing data from http://data.gdeltproject.org/events/index.html for global event data!

In [5]:
for y in range(19, 25):
    for m in range(1, 13):
        if m < 10:
            m = "0" + str(m)
        for d in range(1, 32):
            if d < 10:
                d = "0" + str(d)
            try:
                response = client.get_object("eda", "gdelt-parquet/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d))
                # Read data from response.
            except:
                print("File doesn't exist!")
                try:
                    data = urlopen(
                        "http://data.gdeltproject.org/events/20{0}{1}{2}.export.CSV.zip".format(y,m,d),
                    )
                    print("Uploading file!")
                    result = client.put_object(
                        "eda", "gdelt-parquet/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d), data, length=-1, part_size=10*1024*1024,
                    )
                    print(
                        "created {0} object; etag: {1}, version-id: {2}".format(
                            result.object_name, result.etag, result.version_id,
                        ),
                    )
                except:
                    print("No file to download!")
            finally:
                response.close()
                response.release_conn()

File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
Uploading file!
created gdelt/2020/04/09.export.CSV.zip object; etag: 5cf7553a7e25189741f5f55711b3a5f2, version-id: None
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download

In [101]:
gdelt_headers = {
    "GlobalEventID": int,
    "Day": int,
    "MonthYear": int,
    "Year": int,
    "FractionDate": float,
    "Actor1Code": str,
    "Actor1Name": str,
    "Actor1CountryCode": str,
    "Actor1KnownGroupCode": str,
    "Actor1EthnicCode": str,
    "Actor1Religion1Code": str,
    "Actor1Religion2Code": str,
    "Actor1Type1Code": str,
    "Actor1Type2Code": str,
    "Actor1Type3Code": str,
    "Actor2Code": str,
    "Actor2Name": str,
    "Actor2CountryCode": str,
    "Actor2KnownGroupCode": str,
    "Actor2EthnicCode": str,
    "Actor2Religion1Code": str,
    "Actor2Religion2Code": str,
    "Actor2Type1Code": str,
    "Actor2Type2Code": str,
    "Actor2Type3Code": str,
    "IsRootEvent": bool,
    "EventCode": str,
    "EventBaseCode": str,
    "EventRootCode": str,
    "QuadClass": float,
    "GoldsteinScale": float,
    "NumMentions": float,
    "NumSources": float,
    "NumArticles": float,
    "AvgTone": float,
    "Actor1Geo_Type": float,
    "Actor1Geo_Fullname": str,
    "Actor1Geo_CountryCode": str,
    "Actor1Geo_ADM1Code": str,
    "Actor1Geo_Lat": float,
    "Actor1Geo_Long": object,
    "Actor1Geo_FeatureID": object,
    "Actor2Geo_Type": float,
    "Actor2Geo_Fullname": str,
    "Actor2Geo_CountryCode": str,
    "Actor2Geo_ADM1Code": str,
    "Actor2Geo_Lat": float,
    "Actor2Geo_Long": object,
    "Actor2Geo_FeatureID": object,
    "ActionGeo_Type": float,
    "ActionGeo_Fullname": str,
    "ActionGeo_CountryCode": str,
    "ActionGeo_ADM1Code": str,
    "ActionGeo_Lat": float,
    "ActionGeo_Long": object,
    "ActionGeo_FeatureID": object,
    "DateAdded": float,
    "SOURCEURL": str
}

In [30]:
len(gdelt_headers)

58

In [31]:
gdelt_headers.keys()

dict_keys(['GlobalEventID', 'Day', 'MonthYear', 'Year', 'FractionDate', 'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode', 'Actor1EthnicCode', 'Actor1Religion1Code', 'Actor1Religion2Code', 'Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code', 'Actor2Code', 'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode', 'Actor2EthnicCode', 'Actor2Religion1Code', 'Actor2Religion2Code', 'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code', 'IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode', 'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources', 'NumArticles', 'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_Fullname', 'Actor1Geo_CountryCode', 'Actor1Geo_ADM1Code', 'Actor1Geo_Lat', 'Actor1Geo_Long', 'Actor1Geo_FeatureID', 'Actor2Geo_Type', 'Actor2Geo_Fullname', 'Actor2Geo_CountryCode', 'Actor2Geo_ADM1Code', 'Actor2Geo_Lat', 'Actor2Geo_Long', 'Actor2Geo_FeatureID', 'ActionGeo_Type', 'ActionGeo_Fullname', 'ActionGeo_CountryCode', 'ActionGeo_ADM1Code', 'ActionGeo_

### Data from YahooFinance

In [30]:
import yfinance as yf

companies = {
    "tech": ['AAPL', 'MSFT', 'NVDA'],
    "energy": ['XOM', 'CVX', 'BP', 'COP'],
    "food": ['KO', 'PEP', 'PG'],
    "health": ['JNJ', 'PFE', 'ABBV'],
    "finance": ['JPM', 'V', 'MA'],
    "agriculture": ['DE', 'CTVA', 'ADM'],
    "telecom": ['T', 'VZ', 'TMUS'],
    "materials": ['DOW', 'NEM', 'FCX']
}

In [37]:
uploaded = False
for sector in companies:
    for stock in companies[sector]:
        try:
            response = client.get_object("eda", "yfinance/{0}/{1}.snappy.parquet".format(sector,stock))
            # Read data from response.
        except:
            print("File doesn't exist!")
            data = yf.download(stock, start="2019-01-01", end="2024-10-15", progress=False)
            result = client.put_object(
                "eda", "yfinance/{0}/{1}.snappy.parquet".format(sector,stock), io.BytesIO(data.to_parquet(compression='snappy')), length=-1, part_size=10*1024*1024,
            )
            print(
                "created {0} object; etag: {1}, version-id: {2}".format(
                    result.object_name, result.etag, result.version_id,
                ),
            )
            uploaded = True
        finally:
            response.close()
            response.release_conn()
if (not uploaded):
    print("No changes to make.")

No changes to make.


### Data from Google Trend

In [14]:
! pip install pytrends --quiet

In [15]:
import os
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from minio import Minio
from minio.error import S3Error

companies = {
    # Technology
    'apple': ['Apple Inc.', 'Apple', 'AAPL', 'Apple Corporation'],
    'microsoft': ['Microsoft', 'MSFT', 'Microsoft Corporation'],
    'nvda': ['Nvidia', 'NVDA', 'Nvidia Corporation'],
    # Energy/Oil
    'exxon': ['Exxon Mobil', 'Exxon', 'Exxon Mobil Corporation'],
    'chevron': ['Chevron', 'Chevron Corporation'],
    'bp': ['BP', 'BP plc'],
    'cop': ['ConocoPhillips', 'ConocoPhillips Company'],
    # Consumer Goods/Food & Beverage
    'coca_cola': ['Coca-Cola', 'Coca-Cola Company', 'KO'],
    'pepsi': ['Pepsi', 'PepsiCo', 'PepsiCo Inc.'],
    'pg': ['Procter & Gamble', 'Procter & Gamble Company', 'PG'],
    # Healthcare
    'jnj': ['Johnson & Johnson', 'Johnson & Johnson Company', 'JNJ'],
    'pfe': ['Pfizer', 'Pfizer Inc.', 'PFE'],
    'abbv': ['AbbVie', 'AbbVie Inc.', 'ABBV'],
    # Financial Services
    'jpm': ['JPMorgan Chase', 'JPMorgan Chase & Co.', 'JPM'],
    'v': ['Visa', 'Visa Inc.', 'V'],
    'ma': ['Mastercard', 'Mastercard Incorporated', 'MA'],
    # Agriculture
    'de': ['Deere & Company', 'Deere', 'DE'],
    'ctva': ['Corteva', 'Corteva Inc.', 'CTVA'],
    'adm': ['ADM', 'ADM Company', 'Archer Daniels Midland Company'],
    # Telecommunications
    't': ['AT&T', 'AT&T Inc.', 'T'],
    'vz': ['Verizon', 'Verizon Communications', 'VZ'],
    'tmus': ['T-Mobile', 'T-Mobile US', 'TMUS'],
    # Materials
    'dow': ['Dow Inc.', 'Dow', 'DOW'],
    'nem': ['Newmont', 'Newmont Corporation', 'NEM'],
    'fcx': ['Freeport-McMoRan', 'Freeport-McMoRan Inc.', 'FCX'],
}

platforms = {
    'web': '',
    'youtube': 'youtube',
    'news': 'news',
    'images': 'images',
}

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
image_dir = 'image'
os.makedirs(image_dir, exist_ok=True)

# Loop through each company and platform to get trends data
for company, keywords in companies.items():
    print(f"Fetching data for: {company}")
    
    # Create a directory for the company
    company_dir = os.path.join(output_dir, company)
    os.makedirs(company_dir, exist_ok=True)

    trends_data = {}  # Store trends for all platforms

    # Fetch trends data for each platform
    for platform, gprop in platforms.items():
        pytrends.build_payload(kw_list=keywords, timeframe='2019-01-01 2024-10-15', geo='', gprop=gprop)
        data = pytrends.interest_over_time()

        # print(f"{platform} data:\n", data[keywords])

        data.fillna(0, inplace=True)

        # Sum all keyword trends for each platform
        trends_data[platform] = data[keywords].sum(axis=1)

        # Save platform data to CSV
        csv_path = os.path.join(company_dir, f'{company}_{platform}.csv')
        data.to_csv(csv_path)
        print(f"Saved {csv_path}")

    # Plot trends for each platform
    plt.figure(figsize=(12, 8))
    for platform, trend in trends_data.items():
        plt.plot(trend.index, trend, label=platform)

    plt.title(f'Search Trends for {company} (2019-2024)')
    plt.xlabel('Date')
    plt.ylabel('Search Interest')
    plt.legend(loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid()

    # Save the plot as an image
    image_path = os.path.join(image_dir, f'{company}_search_trends.png')
    plt.savefig(image_path)
    plt.close()
    print(f"Saved {image_path}")

Fetching data for: apple


  df = df.fillna(False)


Saved data\apple\apple_web.csv


  df = df.fillna(False)


Saved data\apple\apple_youtube.csv


  df = df.fillna(False)


Saved data\apple\apple_news.csv


  df = df.fillna(False)


Saved data\apple\apple_images.csv
Saved image\apple_search_trends.png
Fetching data for: microsoft


  df = df.fillna(False)


Saved data\microsoft\microsoft_web.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_youtube.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_news.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_images.csv
Saved image\microsoft_search_trends.png
Fetching data for: nvda


  df = df.fillna(False)


Saved data\nvda\nvda_web.csv


  df = df.fillna(False)


Saved data\nvda\nvda_youtube.csv


  df = df.fillna(False)


Saved data\nvda\nvda_news.csv


  df = df.fillna(False)


Saved data\nvda\nvda_images.csv
Saved image\nvda_search_trends.png
Fetching data for: exxon


  df = df.fillna(False)


Saved data\exxon\exxon_web.csv


  df = df.fillna(False)


Saved data\exxon\exxon_youtube.csv


  df = df.fillna(False)


Saved data\exxon\exxon_news.csv


  df = df.fillna(False)


Saved data\exxon\exxon_images.csv
Saved image\exxon_search_trends.png
Fetching data for: chevron


  df = df.fillna(False)


Saved data\chevron\chevron_web.csv


  df = df.fillna(False)


Saved data\chevron\chevron_youtube.csv


  df = df.fillna(False)


Saved data\chevron\chevron_news.csv


  df = df.fillna(False)


Saved data\chevron\chevron_images.csv
Saved image\chevron_search_trends.png
Fetching data for: bp


  df = df.fillna(False)


Saved data\bp\bp_web.csv


  df = df.fillna(False)


Saved data\bp\bp_youtube.csv


  df = df.fillna(False)


Saved data\bp\bp_news.csv


  df = df.fillna(False)


Saved data\bp\bp_images.csv
Saved image\bp_search_trends.png
Fetching data for: cop


  df = df.fillna(False)


Saved data\cop\cop_web.csv


  df = df.fillna(False)


Saved data\cop\cop_youtube.csv


  df = df.fillna(False)


Saved data\cop\cop_news.csv


  df = df.fillna(False)


Saved data\cop\cop_images.csv
Saved image\cop_search_trends.png
Fetching data for: coca_cola


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_web.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_youtube.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_news.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_images.csv
Saved image\coca_cola_search_trends.png
Fetching data for: pepsi


  df = df.fillna(False)


Saved data\pepsi\pepsi_web.csv


  df = df.fillna(False)


Saved data\pepsi\pepsi_youtube.csv


  df = df.fillna(False)


Saved data\pepsi\pepsi_news.csv


  df = df.fillna(False)


Saved data\pepsi\pepsi_images.csv
Saved image\pepsi_search_trends.png
Fetching data for: pg


  df = df.fillna(False)


Saved data\pg\pg_web.csv


  df = df.fillna(False)


Saved data\pg\pg_youtube.csv


  df = df.fillna(False)


Saved data\pg\pg_news.csv


  df = df.fillna(False)


Saved data\pg\pg_images.csv
Saved image\pg_search_trends.png
Fetching data for: jnj


  df = df.fillna(False)


Saved data\jnj\jnj_web.csv


  df = df.fillna(False)


Saved data\jnj\jnj_youtube.csv


  df = df.fillna(False)


Saved data\jnj\jnj_news.csv


  df = df.fillna(False)


Saved data\jnj\jnj_images.csv
Saved image\jnj_search_trends.png
Fetching data for: pfe


  df = df.fillna(False)


Saved data\pfe\pfe_web.csv


  df = df.fillna(False)


Saved data\pfe\pfe_youtube.csv


  df = df.fillna(False)


Saved data\pfe\pfe_news.csv


  df = df.fillna(False)


Saved data\pfe\pfe_images.csv
Saved image\pfe_search_trends.png
Fetching data for: abbv


  df = df.fillna(False)


Saved data\abbv\abbv_web.csv


  df = df.fillna(False)


Saved data\abbv\abbv_youtube.csv


  df = df.fillna(False)


Saved data\abbv\abbv_news.csv


  df = df.fillna(False)


Saved data\abbv\abbv_images.csv
Saved image\abbv_search_trends.png
Fetching data for: jpm


  df = df.fillna(False)


Saved data\jpm\jpm_web.csv


  df = df.fillna(False)


Saved data\jpm\jpm_youtube.csv


  df = df.fillna(False)


Saved data\jpm\jpm_news.csv


  df = df.fillna(False)


Saved data\jpm\jpm_images.csv
Saved image\jpm_search_trends.png
Fetching data for: v


  df = df.fillna(False)


Saved data\v\v_web.csv


  df = df.fillna(False)


Saved data\v\v_youtube.csv


  df = df.fillna(False)


Saved data\v\v_news.csv


  df = df.fillna(False)


Saved data\v\v_images.csv
Saved image\v_search_trends.png
Fetching data for: ma


  df = df.fillna(False)


Saved data\ma\ma_web.csv


  df = df.fillna(False)


Saved data\ma\ma_youtube.csv


  df = df.fillna(False)


Saved data\ma\ma_news.csv


  df = df.fillna(False)


Saved data\ma\ma_images.csv
Saved image\ma_search_trends.png
Fetching data for: de


  df = df.fillna(False)


Saved data\de\de_web.csv


  df = df.fillna(False)


Saved data\de\de_youtube.csv


  df = df.fillna(False)


Saved data\de\de_news.csv


  df = df.fillna(False)


Saved data\de\de_images.csv
Saved image\de_search_trends.png
Fetching data for: ctva


  df = df.fillna(False)


Saved data\ctva\ctva_web.csv


  df = df.fillna(False)


Saved data\ctva\ctva_youtube.csv


  df = df.fillna(False)


Saved data\ctva\ctva_news.csv


  df = df.fillna(False)


Saved data\ctva\ctva_images.csv
Saved image\ctva_search_trends.png
Fetching data for: adm


  df = df.fillna(False)


Saved data\adm\adm_web.csv


  df = df.fillna(False)


Saved data\adm\adm_youtube.csv


  df = df.fillna(False)


Saved data\adm\adm_news.csv


  df = df.fillna(False)


Saved data\adm\adm_images.csv
Saved image\adm_search_trends.png
Fetching data for: t


  df = df.fillna(False)


Saved data\t\t_web.csv


  df = df.fillna(False)


Saved data\t\t_youtube.csv


  df = df.fillna(False)


Saved data\t\t_news.csv


  df = df.fillna(False)


Saved data\t\t_images.csv
Saved image\t_search_trends.png
Fetching data for: vz


  df = df.fillna(False)


Saved data\vz\vz_web.csv


  df = df.fillna(False)


Saved data\vz\vz_youtube.csv


  df = df.fillna(False)


Saved data\vz\vz_news.csv


  df = df.fillna(False)


Saved data\vz\vz_images.csv
Saved image\vz_search_trends.png
Fetching data for: tmus


  df = df.fillna(False)


Saved data\tmus\tmus_web.csv


  df = df.fillna(False)


Saved data\tmus\tmus_youtube.csv


  df = df.fillna(False)


Saved data\tmus\tmus_news.csv


  df = df.fillna(False)


Saved data\tmus\tmus_images.csv
Saved image\tmus_search_trends.png
Fetching data for: dow


  df = df.fillna(False)


Saved data\dow\dow_web.csv


  df = df.fillna(False)


Saved data\dow\dow_youtube.csv


  df = df.fillna(False)


Saved data\dow\dow_news.csv


  df = df.fillna(False)


Saved data\dow\dow_images.csv
Saved image\dow_search_trends.png
Fetching data for: nem


  df = df.fillna(False)


Saved data\nem\nem_web.csv


  df = df.fillna(False)


Saved data\nem\nem_youtube.csv


  df = df.fillna(False)


Saved data\nem\nem_news.csv


  df = df.fillna(False)


Saved data\nem\nem_images.csv
Saved image\nem_search_trends.png
Fetching data for: fcx


  df = df.fillna(False)


Saved data\fcx\fcx_web.csv


  df = df.fillna(False)


Saved data\fcx\fcx_youtube.csv


  df = df.fillna(False)


Saved data\fcx\fcx_news.csv
Saved data\fcx\fcx_images.csv
Saved image\fcx_search_trends.png


  df = df.fillna(False)


In [16]:
bucket_name = "eda"

def upload_directory_to_minio(local_path, bucket, minio_path=""):
    for root, _, files in os.walk(local_path):
        for file in files:
            local_file = os.path.join(root, file)
            relative_path = os.path.relpath(local_file, local_path)
            minio_file_path = os.path.join(minio_path, relative_path).replace("\\", "/")

            try:
                client.fput_object(bucket, minio_file_path, local_file)
                print(f"Uploaded '{local_file}' to '{minio_file_path}'")
            except S3Error as e:
                print(f"Failed to upload '{local_file}': {e}")

# Upload 'data' and 'image' directories to MinIO under the 'google-trend' folder
upload_directory_to_minio(output_dir, bucket_name, 'google-trend/data')
upload_directory_to_minio(image_dir, bucket_name, 'google-trend/image')

print("All files uploaded to MinIO under 'google-trend' folder.")


Uploaded 'data\abbv\abbv_images.csv' to 'google-trend/data/abbv/abbv_images.csv'
Uploaded 'data\abbv\abbv_news.csv' to 'google-trend/data/abbv/abbv_news.csv'
Uploaded 'data\abbv\abbv_web.csv' to 'google-trend/data/abbv/abbv_web.csv'
Uploaded 'data\abbv\abbv_youtube.csv' to 'google-trend/data/abbv/abbv_youtube.csv'
Uploaded 'data\adm\adm_images.csv' to 'google-trend/data/adm/adm_images.csv'
Uploaded 'data\adm\adm_news.csv' to 'google-trend/data/adm/adm_news.csv'
Uploaded 'data\adm\adm_web.csv' to 'google-trend/data/adm/adm_web.csv'
Uploaded 'data\adm\adm_youtube.csv' to 'google-trend/data/adm/adm_youtube.csv'
Uploaded 'data\apple\apple_images.csv' to 'google-trend/data/apple/apple_images.csv'
Uploaded 'data\apple\apple_news.csv' to 'google-trend/data/apple/apple_news.csv'
Uploaded 'data\apple\apple_web.csv' to 'google-trend/data/apple/apple_web.csv'
Uploaded 'data\apple\apple_youtube.csv' to 'google-trend/data/apple/apple_youtube.csv'
Uploaded 'data\bp\bp_images.csv' to 'google-trend/da

----------------------

## Data Processing

In [50]:
# resp = client.get_object("eda", "gdelt/2019/01/01.export.CSV.zip")
# from io import BytesIO

# df = pd.read_csv(BytesIO(resp.data))

In [102]:
for y in range(19, 25):
    for m in range(1, 13):
        if m < 10:
            m = "0" + str(m)
        for d in range(1, 32):
            if d < 10:
                d = "0" + str(d)
            try:
                response = client.get_object("eda", "gdelt-parquet/20{0}/{1}/{2}.snappy.parquet".format(y,m,d))
                # Read data from response.
            except:
                print("File doesn't exist! 20{0}/{1}/{2}".format(y,m,d))
                try:
                    response = client.get_object("eda", "gdelt/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d))
                    client.fget_object("eda", "gdelt/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d), 
                                       "./data/gdelt/20{0}{1}{2}.export.CSV.zip".format(y,m,d))
                    with zipfile.ZipFile("./data/gdelt/20{0}{1}{2}.export.CSV.zip".format(y,m,d),"r") as zip_ref:
                        zip_ref.extractall("./data/gdelt/temp")
                    data = pd.read_csv("./data/gdelt/temp/20{0}{1}{2}.export.CSV".format(y,m,d), sep='\t', header=None, names=gdelt_headers.keys(), dtype=gdelt_headers, on_bad_lines='warn')
                    result = client.put_object(
                        "eda", "gdelt-parquet/20{0}/{1}/{2}.snappy.parquet".format(y,m,d), io.BytesIO(data.to_parquet(compression='snappy')), length=-1, part_size=10*1024*1024,
                    )
                    print(
                        "created {0} object; etag: {1}, version-id: {2}".format(
                            result.object_name, result.etag, result.version_id,
                        ),
                    )
                    os.remove("./data/gdelt/temp/20{0}{1}{2}.export.CSV".format(y,m,d))
                    os.remove("./data/gdelt/20{0}{1}{2}.export.CSV.zip".format(y,m,d))
                except:
                    print("This date does not exist!!!")

File doesn't exist! 2019/01/01
created gdelt-parquet/2019/01/01.snappy.parquet object; etag: 653b38a41f6ddd26dc179d6d9fba3900, version-id: None
File doesn't exist! 2019/02/29
This date does not exist!!!
File doesn't exist! 2019/02/30
This date does not exist!!!
File doesn't exist! 2019/02/31
This date does not exist!!!
File doesn't exist! 2019/04/31
This date does not exist!!!
File doesn't exist! 2019/06/31
This date does not exist!!!
File doesn't exist! 2019/09/31
This date does not exist!!!
File doesn't exist! 2019/11/31
This date does not exist!!!
File doesn't exist! 2020/02/30
This date does not exist!!!
File doesn't exist! 2020/02/31
This date does not exist!!!
File doesn't exist! 2020/04/31
This date does not exist!!!
File doesn't exist! 2020/06/31
This date does not exist!!!
File doesn't exist! 2020/09/31
This date does not exist!!!
File doesn't exist! 2020/11/31
This date does not exist!!!
File doesn't exist! 2021/02/29
This date does not exist!!!
File doesn't exist! 2021/02/30

In [49]:
data

Unnamed: 0,GlobalEventID,Day,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_Fullname,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DateAdded,SOURCEURL
0,824358302,20180217,201802,2018,2018.1288,AUSAGR,AUSTRALIA,AUS,,,...,-1555351,4.0,"Melbourne, Victoria, Australia",AS,AS07,-37.816700,144.967,-1586844,20190217.0,https://www.canberratimes.com.au/national/vict...
1,824358303,20180217,201802,2018,2018.1288,AUSAGR,MELBOURNE,AUS,,,...,-1586844,4.0,"Melbourne, Victoria, Australia",AS,AS07,-37.816700,144.967,-1586844,20190217.0,https://www.canberratimes.com.au/national/vict...
2,824358304,20180217,201802,2018,2018.1288,CHN,CHINA,CHN,,,...,-2960561,4.0,"Tehran, Tehran, Iran",IR,IR26,35.750000,51.5148,10074674,20190217.0,http://www.reflector.com/National-Business/201...
3,824358305,20180217,201802,2018,2018.1288,CHN,CHINA,CHN,,,...,10074674,4.0,"Tehran, Tehran, Iran",IR,IR26,35.750000,51.5148,10074674,20190217.0,https://www.americanpress.com/wire/the-latest-...
4,824358306,20180217,201802,2018,2018.1288,CHN,CHINA,CHN,,,...,10074674,4.0,"Tehran, Tehran, Iran",IR,IR26,35.750000,51.5148,10074674,20190217.0,http://www.reflector.com/National-Business/201...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103350,824518312,20190217,201902,2019,2019.1288,asy,ASSYRIAN,,,asy,...,,1.0,United States,US,US,39.828175,-98.5795,US,20190217.0,https://medium.com/@christina.nyoseph/on-the-c...
103351,824518313,20190217,201902,2019,2019.1288,chm,MARI,,,chm,...,,4.0,"Kambuzuma, Harare, Zimbabwe",ZI,ZI10,-17.854400,30.9672,124324,20190217.0,https://www.herald.co.zw/zim-writers-mourn-fal...
103352,824518314,20190217,201902,2019,2019.1288,chm,MARI,,,chm,...,-2007087,4.0,"Farai, Nigeria (general), Nigeria",NI,NI00,9.450000,12.1167,-2007087,20190217.0,https://www.herald.co.zw/zim-writers-mourn-fal...
103353,824518315,20190217,201902,2019,2019.1288,chr,CHEROKEE,,,chr,...,531871,3.0,"Washington, District of Columbia, United States",US,USDC,38.895100,-77.0364,531871,20190217.0,http://wondradio.com/abc-politics/e8063973baa8...


----------------------

## Data Analysis

In [2]:
import duckdb as ddb

In [3]:
%load_ext sql

In [4]:
%sql ROLLBACK

UsageError: No active connection.

To fix it:

Pass a valid connection string:
    Example: %sql postgresql://username:password@hostname/dbname

OR

Set the environment variable $DATABASE_URL

For more details, see: https://jupysql.ploomber.io/en/latest/connecting.html
If you need help solving this issue, send us a message: https://ploomber.io/community


In [5]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [6]:
%sql duckdb:///eda.ddb

In [7]:
%%sql
INSTALL httpfs;
LOAD httpfs;
SET s3_region='us-east-1';
SET s3_url_style='path';
SET s3_endpoint='localhost:9000';
SET s3_access_key_id='ROOTNAME' ;
SET s3_secret_access_key='CHANGEME123';
SET s3_use_ssl = false;

Unnamed: 0,Success


In [91]:
%%sql
SELECT *
FROM 's3://eda/gdelt-parquet/2019/01/01.snappy.parquet'

Unnamed: 0,GlobalEventID,Day,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_Fullname,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DateAdded,SOURCEURL
0,813415774,20180101,201801,2018,2018.0027,,,,,,...,-561990,4.0,"Calgary, Alberta, Canada",CA,CA01,51.08330,-114.0830,-561990,20190101.0,https://www.weyburnreview.com/cn-rail-and-cp-r...
1,813415775,20180101,201801,2018,2018.0027,,,,,,...,648428,3.0,"Nashwauk, Minnesota, United States",US,USMN,47.38020,-93.1683,648428,20190101.0,http://www.startribune.com/arcelormittal-takes...
2,813415776,20180101,201801,2018,2018.0027,AGR,FARMER,,,,...,,4.0,"Calgary, Alberta, Canada",CA,CA01,51.08330,-114.0830,-561990,20190101.0,https://www.weyburnreview.com/cn-rail-and-cp-r...
3,813415777,20180101,201801,2018,2018.0027,AGR,FARMER,,,,...,,4.0,"Calgary, Alberta, Canada",CA,CA01,51.08330,-114.0830,-561990,20190101.0,https://www.weyburnreview.com/cn-rail-and-cp-r...
4,813415778,20180101,201801,2018,2018.0027,CAN,CANADA,CAN,,,...,,4.0,"Winnipeg, Manitoba, Canada",CA,CA03,49.88330,-97.1667,-576096,20190101.0,https://globalnews.ca/news/4805430/professor-p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87325,813542562,20190101,201901,2019,2019.0027,gba,BAYA,,,gba,...,-2250716,4.0,"Kilifi, Coast, Kenya",KE,KE02,-3.63045,39.8499,-2250716,20190101.0,https://www.standardmedia.co.ke/article/200130...
87326,813542563,20190101,201901,2019,2019.0027,gba,BAYA,,,gba,...,-2250716,4.0,"Kilifi, Coast, Kenya",KE,KE02,-3.63045,39.8499,-2250716,20190101.0,https://www.standardmedia.co.ke/article/200130...
87327,813542564,20190101,201901,2019,2019.0027,gba,BAYA,,,gba,...,-2254423,4.0,"Magarini, Coast, Kenya",KE,KE02,-3.03566,40.0713,-2254423,20190101.0,https://www.standardmedia.co.ke/article/200130...
87328,813542565,20190101,201901,2019,2019.0027,gba,BAYA,,,gba,...,-2250716,4.0,"Kilifi, Coast, Kenya",KE,KE02,-3.63045,39.8499,-2250716,20190101.0,https://www.standardmedia.co.ke/article/200130...


In [11]:
%sql DROP TABLE gdelt

RuntimeError: If using snippets, you may pass the --with argument explicitly.
For more details please refer: https://jupysql.ploomber.io/en/latest/compose.html#with-argument


Original error message from DB driver:
(duckdb.duckdb.CatalogException) Catalog Error: Table with name gdelt does not exist!
Did you mean "pg_description"?
[SQL: DROP TABLE gdelt]
(Background on this error at: https://sqlalche.me/e/20/f405)

If you need help solving this issue, send us a message: https://ploomber.io/community


In [12]:
%sql CREATE TABLE gdelt AS SELECT * FROM read_parquet('s3://eda/gdelt-parquet/2019/*/*.snappy.parquet', union_by_name=True);

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [None]:
%sql SELECT * FROM gdelt

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [24]:
%sql SELECT count(*) FROM gdelt

Unnamed: 0,count_star()
0,260658282


In [13]:
%sql create table yfinance as select * from read_parquet("s3://eda/yfinance/*/*.snappy.parquet")

Unnamed: 0,Success


In [14]:
%sql SELECT count(*) FROM yfinance

Unnamed: 0,count_star()
0,36248


In [15]:
%sql INSERT INTO gdelt SELECT * FROM read_parquet("s3://eda/gdelt-parquet/2020/*/*.snappy.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [17]:
%sql INSERT INTO gdelt SELECT * FROM read_parquet("s3://eda/gdelt-parquet/2021/*/*.snappy.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [19]:
%sql INSERT INTO gdelt SELECT * FROM read_parquet("s3://eda/gdelt-parquet/2022/*/*.snappy.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [21]:
%sql INSERT INTO gdelt SELECT * FROM read_parquet("s3://eda/gdelt-parquet/2023/*/*.snappy.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success


In [23]:
%sql INSERT INTO gdelt SELECT * FROM read_parquet("s3://eda/gdelt-parquet/2024/*/*.snappy.parquet")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Success
