# Data Cleaning Branch
I will work here before merging with main, so we have a backup in case someone messes up a merge with their independent branch.

In [37]:
# import modules
import pandas as pd
import pandas_datareader as pdr
import requests
import os
from dotenv import load_dotenv
import csv
import numpy as np

# Modules for fetching data
import yfinance as yf

In [38]:
# Load variables from a local .env file (if any) into the process environment
load_dotenv()

# Alpha Vantage API key; None when the variable is not set
AV_api_key = os.getenv("ALPHA_VANTAGE_API_KEY")
if not AV_api_key:
    # Surface the problem here instead of at the first Alpha Vantage request,
    # which would otherwise be built with apikey=None
    print("Warning: ALPHA_VANTAGE_API_KEY is not set; Alpha Vantage requests are expected to fail.")

## Helper functions

### Data retrieval and cleanup helper functions

In [29]:

def DeleteEmptyCSVs(CSVToCheck, min_lines=3):
    """Delete a CSV created by an attempted data retrieval if it holds no data.

    Parameters
    ----------
    CSVToCheck : str
        Path of the CSV file to inspect.
    min_lines : int, optional
        Minimum number of CSV rows the file must contain to be kept.
        The default of 3 treats a file holding only a header and one
        further line as empty.

    Returns
    -------
    None
        The outcome is reported with print(); a file judged empty is removed.
    """
    if not os.path.isfile(CSVToCheck):
        print(f"{CSVToCheck} does not exist.")
        return

    # Count the rows; newline='' lets the csv module handle line endings itself
    with open(CSVToCheck, 'r', newline='') as file:
        line_count = sum(1 for _ in csv.reader(file))

    print(f"The file {CSVToCheck} has {line_count} lines.")

    # Fewer rows than min_lines means the retrieval produced no usable data
    if line_count < min_lines:
        print(f"{CSVToCheck} seems empty. Deleting the file.")
        os.remove(CSVToCheck)
    else:
        print(f"{CSVToCheck} seems to have data.")

In [68]:
def getYahoo(tckr, tckr_csv, start=None, end=None):
    """Download Yahoo Finance data for a ticker and cache it as a CSV.

    Nothing is downloaded when `tckr_csv` already exists.

    Parameters
    ----------
    tckr : str
        Yahoo Finance ticker symbol to download (e.g. 'MSFT', '^VIX').
    tckr_csv : str
        Path of the CSV file used as the on-disk cache.
    start, end : str, optional
        Date range passed to yf.download. When omitted, the notebook-level
        START_DATE_yahoo / END_DATE_yahoo values are used.

    Returns
    -------
    None
        Progress and failures are reported with print().
    """
    if os.path.isfile(tckr_csv):
        print(tckr+" data already saved in csv")
        return

    # Fall back to the shared Yahoo date window defined in the configuration cell
    if start is None:
        start = START_DATE_yahoo
    if end is None:
        end = END_DATE_yahoo

    print("Retrieving "+tckr+" Data from Yahoo...")
    try:
        # Download the requested ticker (previously hard-wired to the S&P 500 variables)
        tckr_data = yf.download(tckr, start=start, end=end)
        if tckr_data.empty:
            # Avoid leaving a header-only CSV behind when nothing was returned
            print(f"No data returned for {tckr}; nothing saved.")
            return
        print("saving data to csv")
        tckr_data.to_csv(tckr_csv)
    except Exception as e:
        print(f"Failed to retrieve data: {e}")



In [91]:
def getDividends(tckr, data_folder=os.path.join('.', 'data_files')):
    """Save the dividend series of a ticker to '<tckr>dividends.csv'.

    Parameters
    ----------
    tckr : str
        Yahoo Finance ticker symbol.
    data_folder : str, optional
        Folder the CSV is written to; created if it does not exist.
        Defaults to the 'data_files' folder under the working directory.

    Returns
    -------
    None
    """
    dividends_series = yf.Ticker(tckr).dividends

    # Build the path with os.path.join rather than hand-written backslashes,
    # so it also resolves correctly outside Windows
    os.makedirs(data_folder, exist_ok=True)
    csvFile = os.path.join(data_folder, tckr + 'dividends.csv')
    dividends_series.to_csv(csvFile)

In [75]:
def GetEarningsFromAlphaVantage(stock_ticker, data_folder = 'data_files'):
    """Fetch quarterly earnings from Alpha Vantage and save them as a CSV.

    The CSV is named '<stock_ticker>quarterlyEarnings.csv' inside
    `data_folder`. If that file already exists, no request is made.

    Parameters
    ----------
    stock_ticker : str
        Symbol to request from the Alpha Vantage EARNINGS endpoint.
    data_folder : str, optional
        Folder the CSV is written to; created if it does not exist.

    Raises
    ------
    KeyError
        If the response has no 'quarterlyEarnings' entry, or a record lacks
        one of the expected fields.
    """
    quarterly_csv = os.path.join(data_folder, stock_ticker + 'quarterlyEarnings.csv')

    # Check the cache first so no API request is spent when the CSV already exists
    if os.path.isfile(quarterly_csv):
        print("Quarterly Earnings CSV already present")
        return

    # Request from API
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': 'EARNINGS', 'symbol': stock_ticker, 'apikey': AV_api_key},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # Include the payload in the error so the reason for the missing data is visible
    if 'quarterlyEarnings' not in data:
        raise KeyError(
            f"'quarterlyEarnings' missing from Alpha Vantage response for {stock_ticker}: {data}"
        )

    # Build all rows before opening the file so a malformed record cannot
    # leave a partially written CSV behind
    rows = [
        [entry['fiscalDateEnding'], entry['estimatedEPS'], entry['reportedEPS']]
        for entry in data['quarterlyEarnings']
    ]

    if data_folder:
        os.makedirs(data_folder, exist_ok=True)
    with open(quarterly_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Fiscal Date','Estimated EPS', 'Reported EPS'])  # Write header row
        writer.writerows(rows)

    # Report save (only reached when the file was actually written)
    print(f" Quarterly Earnings data saved to {quarterly_csv}")
    
    

In [73]:
def BalanceSheetFromAlphaVantage(stock_ticker, data_folder = 'data_files'):
    """Fetch quarterly balance sheets from Alpha Vantage and save them as a CSV.

    The CSV is named '<stock_ticker>quarterlyBalSheet.csv' inside
    `data_folder`. If that file already exists, no request is made. The
    column order follows the keys of the first quarterly report.

    Parameters
    ----------
    stock_ticker : str
        Symbol to request from the Alpha Vantage BALANCE_SHEET endpoint.
    data_folder : str, optional
        Folder the CSV is written to; created if it does not exist.

    Raises
    ------
    KeyError
        If the response contains no 'quarterlyReports' data.
    """
    quarterly_csv = os.path.join(data_folder, stock_ticker + 'quarterlyBalSheet.csv')

    # Check the cache first so no API request is spent when the CSV already exists
    if os.path.isfile(quarterly_csv):
        print("Quarterly Balance Sheet CSV already present")
        return

    # Request from API for the requested symbol with the configured key
    # (previously hard-coded to the IBM demo query)
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={'function': 'BALANCE_SHEET', 'symbol': stock_ticker, 'apikey': AV_api_key},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    # Include the payload in the error so the reason for the missing data is visible
    quarterly_reports = data.get('quarterlyReports')
    if not quarterly_reports:
        raise KeyError(
            f"No 'quarterlyReports' data in Alpha Vantage response for {stock_ticker}: {data}"
        )

    # Header row comes from the first report; later reports are aligned to it
    balance_sheet_items = list(quarterly_reports[0].keys())

    if data_folder:
        os.makedirs(data_folder, exist_ok=True)
    with open(quarterly_csv, mode='w', newline='') as file:
        writer = csv.DictWriter(
            file,
            fieldnames=balance_sheet_items,
            restval='',             # tolerate a report missing one of the columns
            extrasaction='ignore',  # drop keys that are absent from the header
        )
        writer.writeheader()
        writer.writerows(quarterly_reports)

    # Report save (only reached when the file was actually written)
    print(f" Quarterly Balance Sheet data saved to {quarterly_csv}")
    
    

## Fetching Initial Data

### Sourced from Yahoo

In [78]:
# Shared date window for every Yahoo Finance download
START_DATE_yahoo ="1990-01-01"
END_DATE_yahoo = "2024-05-30"

# Folder holding the cached CSVs. Paths are built with os.path.join instead of
# hand-written backslashes, which contained invalid escape sequences such as
# '\d' and '\s' and only resolved to the intended folder on Windows.
DATA_DIR = os.path.join('.', 'data_files')

# Larger market trends

# S&P 500
sp500_ticker = '^GSPC'
sp500_csv = os.path.join(DATA_DIR, 'sp500_RawData.csv')
START_DATE_sp500 = START_DATE_yahoo
END_DATE_sp500 = END_DATE_yahoo

# CBOE Volatility Index (VIX)
vix_ticker = '^VIX'
vix_csv = os.path.join(DATA_DIR, 'VIX_RawData.csv')
START_DATE_vix = START_DATE_yahoo
END_DATE_vix = END_DATE_yahoo

# Large Cap Stocks

# Microsoft (MSFT)
micro_ticker = 'MSFT'
micro_csv = os.path.join(DATA_DIR, 'MSFT_RawData.csv')
START_DATE_msft = START_DATE_yahoo
END_DATE_msft = END_DATE_yahoo

# General Electric (GE)
general_elec_ticker = 'GE'
ge_csv = os.path.join(DATA_DIR, 'GE_RawData.csv')
START_DATE_ge = START_DATE_yahoo
END_DATE_ge = END_DATE_yahoo

# Johnson and Johnson (JNJ)
jj_ticker = 'JNJ'
jj_csv = os.path.join(DATA_DIR, 'JNJ_RawData.csv')
START_DATE_jj = START_DATE_yahoo
END_DATE_jj = END_DATE_yahoo

# Coca Cola (KO)
cc_ticker = 'KO'
cc_csv = os.path.join(DATA_DIR, 'KO_RawData.csv')
START_DATE_cc = START_DATE_yahoo
END_DATE_cc = END_DATE_yahoo


## S&P 500 (representative of larger market trends)

In [35]:

# Use the cached S&P 500 CSV when it exists; otherwise download it from Yahoo
if os.path.isfile(sp500_csv):
    print("S&P data already saved in csv")
else:
    print("Retrieving S&P Data from Yahoo...")
    try:
        sp500_data = yf.download(
            sp500_ticker,
            start=START_DATE_sp500,
            end=END_DATE_sp500,
        )
        print("saving data to csv")
        sp500_data.to_csv(sp500_csv)
    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"Failed to retrieve data: {e}")

# Remove the CSV again if the retrieval left it without data
DeleteEmptyCSVs(sp500_csv)

S&P data already saved in csv
The file .\data_files\sp500_RawData.csv has 11197 lines.
.\data_files\sp500_RawData.csv seems to have data.


## VIX (overall market volatility)

In [36]:

# Use the cached VIX CSV when it exists; otherwise download it from Yahoo
if not os.path.isfile(vix_csv):
    # Message previously lacked a space after the ticker ("^VIXfrom");
    # wording now matches the message printed by getYahoo
    print("Retrieving "+vix_ticker+" Data from Yahoo...")
    try:
        vix_data = yf.download(vix_ticker, start=START_DATE_vix, end=END_DATE_vix)
        print("saving data to csv")
        vix_data.to_csv(vix_csv)
    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"Failed to retrieve data: {e}")
else:
    print("VIX data already saved in csv")

# Remove the CSV again if the retrieval left it without data
DeleteEmptyCSVs(vix_csv)

VIX data already saved in csv
The file .\data_files\VIX_RawData.csv has 8669 lines.
.\data_files\VIX_RawData.csv seems to have data.


In [None]:

# NOTE: this cell repeats the S&P 500 retrieval performed earlier in the notebook.
# Use the cached S&P 500 CSV when it exists; otherwise download it from Yahoo
if os.path.isfile(sp500_csv):
    print("S&P data already saved in csv")
else:
    print("Retrieving S&P Data from Yahoo...")
    try:
        sp500_data = yf.download(
            sp500_ticker,
            start=START_DATE_sp500,
            end=END_DATE_sp500,
        )
        print("saving data to csv")
        sp500_data.to_csv(sp500_csv)
    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"Failed to retrieve data: {e}")

# Remove the CSV again if the retrieval left it without data
DeleteEmptyCSVs(sp500_csv)

In [79]:
# Fetch Johnson and Johnson data; getYahoo does nothing if the CSV already exists
getYahoo(tckr=jj_ticker, tckr_csv=jj_csv)

[*********************100%%**********************]  1 of 1 completed

Retrieving JNJ Data from Yahoo...
saving data to csv





In [69]:
# Fetch Microsoft data; getYahoo does nothing if the CSV already exists.
# The same call was previously issued twice in this cell; one call is enough.
getYahoo(micro_ticker, micro_csv)

Retrieving MSFT Data from Yahoo...


[*********************100%%**********************]  1 of 1 completed

saving data to csv





In [89]:
# Save the Johnson and Johnson dividend series to its CSV
getDividends(tckr=jj_ticker)

In [92]:
# Save the Microsoft dividend series to its CSV
getDividends(tckr=micro_ticker)

# Alpha Vantage Accounting Data

In [None]:
# Microsoft accounting-data pulls are currently disabled (commented out).
# NOTE(review): presumably to conserve Alpha Vantage API requests - confirm before re-enabling.
#BalanceSheetFromAlphaVantage(stock_ticker=micro_ticker)
#GetEarningsFromAlphaVantage(stock_ticker= micro_ticker)

In [77]:
# Pull Johnson and Johnson accounting data from Alpha Vantage:
# quarterly balance sheet first, then quarterly earnings
BalanceSheetFromAlphaVantage(jj_ticker)
GetEarningsFromAlphaVantage(jj_ticker)

 Quarterly Earnings data saved to data_files\JNJquarterlyBalSheet.csv
 Quarterly Earnings data saved to data_files\JNJquarterlyEarnings.csv
