# Data Engineering EDA Project

By Aommy, Austin, Ling, Tath (yippee)

-----------------------

## Installing Dependencies

### Minio Object Store dependencies

1. Install minio cli if not already installed.
2. Create client
3. Make bucket (if needed)

In [3]:
! pip install minio --quiet

In [11]:
from minio import Minio

# Create client with access and secret key.
# client = Minio("s3.amazonaws.com", "ACCESS-KEY", "SECRET-KEY")

# Create client with access key and secret key with specific region.
client = Minio(
    "localhost:9000",
    access_key="ROOTNAME",
    secret_key="CHANGEME123",
    secure=False
)

if client.bucket_exists("eda"):
    print("Bucket exists.")
else:
    client.make_bucket("eda")
    print("Bucket created.")

Bucket exists.


## Importing libraries

In [9]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

import io
from urllib.request import urlopen

In [7]:
!pip install --quiet yfinance --upgrade --no-cache-dir

---------------------

## Data Collection

### Data from Global Events

Installing data from http://data.gdeltproject.org/events/index.html for global event data!

In [5]:
for y in range(19, 25):
    for m in range(1, 13):
        if m < 10:
            m = "0" + str(m)
        for d in range(1, 32):
            if d < 10:
                d = "0" + str(d)
            try:
                response = client.get_object("eda", "gdelt/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d))
                # Read data from response.
            except:
                print("File doesn't exist!")
                try:
                    data = urlopen(
                        "http://data.gdeltproject.org/events/20{0}{1}{2}.export.CSV.zip".format(y,m,d),
                    )
                    print("Uploading file!")
                    result = client.put_object(
                        "eda", "gdelt/20{0}/{1}/{2}.export.CSV.zip".format(y,m,d), data, length=-1, part_size=10*1024*1024,
                    )
                    print(
                        "created {0} object; etag: {1}, version-id: {2}".format(
                            result.object_name, result.etag, result.version_id,
                        ),
                    )
                except:
                    print("No file to download!")
            finally:
                response.close()
                response.release_conn()

File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
Uploading file!
created gdelt/2020/04/09.export.CSV.zip object; etag: 5cf7553a7e25189741f5f55711b3a5f2, version-id: None
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download!
File doesn't exist!
No file to download

### Data from YahooFinance

In [30]:
import yfinance as yf

companies = {
    "tech": ['AAPL', 'MSFT', 'NVDA'],
    "energy": ['XOM', 'CVX', 'BP', 'COP'],
    "food": ['KO', 'PEP', 'PG'],
    "health": ['JNJ', 'PFE', 'ABBV'],
    "finance": ['JPM', 'V', 'MA'],
    "agriculture": ['DE', 'CTVA', 'ADM'],
    "telecom": ['T', 'VZ', 'TMUS'],
    "materials": ['DOW', 'NEM', 'FCX']
}

In [37]:
uploaded = False
for sector in companies:
    for stock in companies[sector]:
        try:
            response = client.get_object("eda", "yfinance/{0}/{1}.snappy.parquet".format(sector,stock))
            # Read data from response.
        except:
            print("File doesn't exist!")
            data = yf.download(stock, start="2019-01-01", end="2024-10-15", progress=False)
            result = client.put_object(
                "eda", "yfinance/{0}/{1}.snappy.parquet".format(sector,stock), io.BytesIO(data.to_parquet(compression='snappy')), length=-1, part_size=10*1024*1024,
            )
            print(
                "created {0} object; etag: {1}, version-id: {2}".format(
                    result.object_name, result.etag, result.version_id,
                ),
            )
            uploaded = True
        finally:
            response.close()
            response.release_conn()
if (not uploaded):
    print("No changes to make.")

No changes to make.


### Data from Google Trend

In [1]:
! pip install pytrends --quiet

In [12]:
import os
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from minio import Minio
from minio.error import S3Error

# Companies and platforms data
companies = {
    'apple': ['Apple Inc.', 'Apple', 'AAPL', 'Apple Corporation'],
    'microsoft': ['Microsoft', 'MSFT', 'Microsoft Corporation'],
    'nvda': ['Nvidia', 'NVDA', 'Nvidia Corporation'],
    'exxon': ['Exxon Mobil', 'Exxon', 'Exxon Mobil Corporation'],
    'chevron': ['Chevron', 'Chevron Corporation'],
    'bp': ['BP', 'BP plc'],
    'cop': ['ConocoPhillips', 'ConocoPhillips Company'],
    'coca_cola': ['Coca-Cola', 'Coca-Cola Company', 'KO'],
    'pepsi': ['Pepsi', 'PepsiCo', 'PepsiCo Inc.'],
}

platforms = {
    'web': '',
    'youtube': 'youtube',
    'news': 'news',
    'images': 'images',
}

# Initialize Pytrends
pytrends = TrendReq(hl='en-US', tz=360)

output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
image_dir = 'image'
os.makedirs(image_dir, exist_ok=True)

# Loop through each company and platform to get trends data
for company, keywords in companies.items():
    print(f"Fetching data for: {company}")
    
    # Create a directory for the company
    company_dir = os.path.join(output_dir, company)
    os.makedirs(company_dir, exist_ok=True)

    trends_data = {}  # Store trends for all platforms

    # Fetch trends data for each platform
    for platform, gprop in platforms.items():
        pytrends.build_payload(kw_list=keywords, timeframe='2019-01-01 2024-10-15', geo='', gprop=gprop)
        data = pytrends.interest_over_time()

        # print(f"{platform} data:\n", data[keywords])

        data.fillna(0, inplace=True)

        # Sum all keyword trends for each platform
        trends_data[platform] = data[keywords].sum(axis=1)

        # Save platform data to CSV
        csv_path = os.path.join(company_dir, f'{company}_{platform}.csv')
        data.to_csv(csv_path)
        print(f"Saved {csv_path}")

    # Plot trends for each platform
    plt.figure(figsize=(12, 8))
    for platform, trend in trends_data.items():
        plt.plot(trend.index, trend, label=platform)

    plt.title(f'Search Trends for {company} (2019-2024)')
    plt.xlabel('Date')
    plt.ylabel('Search Interest')
    plt.legend(loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid()

    # Save the plot as an image
    image_path = os.path.join(image_dir, f'{company}_search_trends.png')
    plt.savefig(image_path)
    plt.close()
    print(f"Saved {image_path}")

Fetching data for: apple


  df = df.fillna(False)


Saved data\apple\apple_web.csv


  df = df.fillna(False)


Saved data\apple\apple_youtube.csv


  df = df.fillna(False)


Saved data\apple\apple_news.csv


  df = df.fillna(False)


Saved data\apple\apple_images.csv
Saved image\apple_search_trends.png
Fetching data for: microsoft


  df = df.fillna(False)


Saved data\microsoft\microsoft_web.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_youtube.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_news.csv


  df = df.fillna(False)


Saved data\microsoft\microsoft_images.csv
Saved image\microsoft_search_trends.png
Fetching data for: nvda


  df = df.fillna(False)


Saved data\nvda\nvda_web.csv


  df = df.fillna(False)


Saved data\nvda\nvda_youtube.csv


  df = df.fillna(False)


Saved data\nvda\nvda_news.csv


  df = df.fillna(False)


Saved data\nvda\nvda_images.csv
Saved image\nvda_search_trends.png
Fetching data for: exxon


  df = df.fillna(False)


Saved data\exxon\exxon_web.csv


  df = df.fillna(False)


Saved data\exxon\exxon_youtube.csv


  df = df.fillna(False)


Saved data\exxon\exxon_news.csv


  df = df.fillna(False)


Saved data\exxon\exxon_images.csv
Saved image\exxon_search_trends.png
Fetching data for: chevron


  df = df.fillna(False)


Saved data\chevron\chevron_web.csv


  df = df.fillna(False)


Saved data\chevron\chevron_youtube.csv


  df = df.fillna(False)


Saved data\chevron\chevron_news.csv


  df = df.fillna(False)


Saved data\chevron\chevron_images.csv
Saved image\chevron_search_trends.png
Fetching data for: bp


  df = df.fillna(False)


Saved data\bp\bp_web.csv


  df = df.fillna(False)


Saved data\bp\bp_youtube.csv


  df = df.fillna(False)


Saved data\bp\bp_news.csv


  df = df.fillna(False)


Saved data\bp\bp_images.csv
Saved image\bp_search_trends.png
Fetching data for: cop


  df = df.fillna(False)


Saved data\cop\cop_web.csv


  df = df.fillna(False)


Saved data\cop\cop_youtube.csv


  df = df.fillna(False)


Saved data\cop\cop_news.csv


  df = df.fillna(False)


Saved data\cop\cop_images.csv
Saved image\cop_search_trends.png
Fetching data for: coca_cola


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_web.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_youtube.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_news.csv


  df = df.fillna(False)


Saved data\coca_cola\coca_cola_images.csv
Saved image\coca_cola_search_trends.png
Fetching data for: pepsi


  df = df.fillna(False)


Saved data\pepsi\pepsi_web.csv


  df = df.fillna(False)


Saved data\pepsi\pepsi_youtube.csv


  df = df.fillna(False)


Saved data\pepsi\pepsi_news.csv
Saved data\pepsi\pepsi_images.csv
Saved image\pepsi_search_trends.png


  df = df.fillna(False)


In [13]:
bucket_name = "eda"

def upload_directory_to_minio(local_path, bucket, minio_path=""):
    for root, _, files in os.walk(local_path):
        for file in files:
            local_file = os.path.join(root, file)
            relative_path = os.path.relpath(local_file, local_path)
            minio_file_path = os.path.join(minio_path, relative_path).replace("\\", "/")

            try:
                client.fput_object(bucket, minio_file_path, local_file)
                print(f"Uploaded '{local_file}' to '{minio_file_path}'")
            except S3Error as e:
                print(f"Failed to upload '{local_file}': {e}")

# Upload 'data' and 'image' directories to MinIO under the 'google-trend' folder
upload_directory_to_minio(output_dir, bucket_name, 'google-trend/data')
upload_directory_to_minio(image_dir, bucket_name, 'google-trend/image')

print("All files uploaded to MinIO under 'google-trend' folder.")


Uploaded 'data\apple\apple_images.csv' to 'google-trend/data/apple/apple_images.csv'
Uploaded 'data\apple\apple_news.csv' to 'google-trend/data/apple/apple_news.csv'
Uploaded 'data\apple\apple_web.csv' to 'google-trend/data/apple/apple_web.csv'
Uploaded 'data\apple\apple_youtube.csv' to 'google-trend/data/apple/apple_youtube.csv'
Uploaded 'data\bp\bp_images.csv' to 'google-trend/data/bp/bp_images.csv'
Uploaded 'data\bp\bp_news.csv' to 'google-trend/data/bp/bp_news.csv'
Uploaded 'data\bp\bp_web.csv' to 'google-trend/data/bp/bp_web.csv'
Uploaded 'data\bp\bp_youtube.csv' to 'google-trend/data/bp/bp_youtube.csv'
Uploaded 'data\chevron\chevron_images.csv' to 'google-trend/data/chevron/chevron_images.csv'
Uploaded 'data\chevron\chevron_news.csv' to 'google-trend/data/chevron/chevron_news.csv'
Uploaded 'data\chevron\chevron_web.csv' to 'google-trend/data/chevron/chevron_web.csv'
Uploaded 'data\chevron\chevron_youtube.csv' to 'google-trend/data/chevron/chevron_youtube.csv'
Uploaded 'data\coca_

----------------------

## Data Processing

In [50]:
# resp = client.get_object("eda", "gdelt/2019/01/01.export.CSV.zip")
# from io import BytesIO

# df = pd.read_csv(BytesIO(resp.data))

In [49]:
# df = pd.read_csv(urlopen("http://data.gdeltproject.org/events/20{0}{1}{2}.export.CSV.zip".format(19,'01','01'),))

----------------------

## Data Analysis