In [1]:
#This notebook exists to compile stock data that machine learning can run off of, and save it to a .json file
#for each stock. The rough idea of all of the data that the scraper will collect is as follows:
#[Date, Open, High, Low, Close, Adj Close, Volume, State of the Market (bear, bull, normal), 
#State of the Economy (Dow Jones, S&P500, NASDAQ Composite), inflation %, 
#whether or not the company is releasing earnings in the next day]

In [2]:
#Notebook instructions

#The purpose of this scraper is to collect data that can be fed into ML algorithms. If your ML algorithm is working
#correctly, this program can run overnight, every night (It finishes within that 8 hour window. If you aren't getting
#8 hours of sleep per night, consider it please) to collect updated data. The update functions are for updating
#State of the Market (bear, bull, normal), inflation %, or earnings data separately from the historical price 
#data. If you are not trying to do that, running the whole notebook every day will automatically update those metrics
#when they are updated by their respective agencies.

#The recommended paths I use for this notebook are just recommended. I saved everything within the Jupyter notebook 
#file, but you do not have to do that. Just make sure you update the paths.

#This scraper returns about 7.5 GB of data.

#There are certain stocks that fail to scrape every time. The failed-scrape functions exist to check whether
#or not the same stocks are failing to scrape on every run.

In [3]:
#This cell contains all necessary import statements
import copy
import csv
import datetime
import io
import json
import math
import os
import re
import socket
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import requests
import yfinance as yf 
from bs4 import BeautifulSoup
from keras.layers import Dense, LSTM
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler

plt.style.use('fivethirtyeight')

In [4]:
#This cell defines the functions necessary to read and write JSON files
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def write_json(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON, indented by two spaces.

    Any existing file at ``path`` is overwritten.
    """
    with open(path, 'w', encoding = "utf-8") as handle:
        handle.write(json.dumps(data, indent = 2))
def read_json(path):
    """Return the Python object decoded from the UTF-8 JSON file at ``path``."""
    with open(path, encoding = "utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [5]:
#This cell defines a function that will continuously check your internet connection after every stock scrape, ensuring
#if you leave the scraper on as a background process it will not fail to scrape stocks due to loss of connectivity
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def check_wifi():
    """Block until the machine's hostname resolves to a non-loopback address.

    The check is a heuristic: the local hostname is resolved and the machine is
    treated as offline while the answer is "127.0.0.1". While offline, a message
    is printed and the check is retried every 20 seconds.

    A loop is used instead of recursion so that a long outage during an
    unattended run cannot exhaust the interpreter's recursion limit.

    Returns:
        None, once a non-loopback address is reported.
    """
    while socket.gethostbyname(socket.gethostname()) == "127.0.0.1":
        print('Oops, no internet. Please try again')
        time.sleep(20)
    return None

In [6]:
#This function will retrieve the yyyy-mm-dd formatted date of any MONTH/D style date
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def get_date(investor_sentiment_reported_date):
    """Convert a "Mon D" style date string (e.g. "Jan 5", "Dec 28") to yyyy-mm-dd.

    The month is taken from the first three characters and the day from the
    first run of one or two digits that follows, so both single- and
    double-digit days are handled. The year is always the current calendar year.

    NOTE(review): because the year is taken from today's date, a December
    report processed in January is stamped with the new year - confirm whether
    callers can hit that case.

    Args:
        investor_sentiment_reported_date: string starting with an English
            three-letter month abbreviation followed by the day of the month.

    Returns:
        The date as a "yyyy-mm-dd" string.

    Raises:
        ValueError: if the month abbreviation is unknown or no day is found.
    """
    month_numbers = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }
    month_token = investor_sentiment_reported_date[0:3]
    if month_token not in month_numbers:
        raise ValueError('Unrecognised month in reported date: {!r}'.format(investor_sentiment_reported_date))
    month = month_numbers[month_token]
    # Look for the day after the month token so "Dec 28" yields 28, not 2.
    day_match = re.search(r'\d{1,2}', investor_sentiment_reported_date[3:])
    if day_match is None:
        raise ValueError('No day of month found in reported date: {!r}'.format(investor_sentiment_reported_date))
    day = day_match.group().zfill(2)
    year = datetime.date.today().strftime("%Y")
    return year + '-' + month + '-' + day

In [7]:
#This function creates a custom business day calendar that excludes days the market isn't open
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def custom_business(start, end):
    """Return business-day date strings between ``start`` and one week past ``end``.

    The calendar is a Monday-Friday custom business-day range with the
    following dates removed:

    * ``start`` itself and the padded end date (``end`` + 7 days), which are
      passed to pandas as holidays.
      NOTE(review): excluding the boundary dates is preserved from the
      original implementation - confirm it is intentional.
    * a fixed list of New Year closure dates (see ``extra_closures``).

    The fixed dates are removed with a filter rather than by popping from the
    list while indexing into it; the old approach raised IndexError when more
    than two of those dates fell in the range and never checked the last two
    entries.

    Args:
        start: first date, "yyyy-mm-dd".
        end: last requested date, "yyyy-mm-dd"; the range is extended one week
            beyond it.

    Returns:
        List of "yyyy-mm-dd" strings in ascending order.
    """
    extra_closures = {
        '2004-12-31',
        '2006-01-02',
        '2010-12-31',
        '2012-01-02',
        '2017-01-02',
        '2021-12-31',
        '2023-01-02',
    }
    start_boundary = datetime.datetime.strptime(start, "%Y-%m-%d")
    padded_end_boundary = datetime.datetime.strptime(end[0:10], "%Y-%m-%d") + datetime.timedelta(weeks=1)
    padded_end = padded_end_boundary.strftime("%Y-%m-%d")
    holidays = [start_boundary, padded_end_boundary]
    dates = pd.bdate_range(start=start, end=padded_end, freq = 'C', holidays = holidays).strftime("%Y-%m-%d").tolist()
    return [day for day in dates if day not in extra_closures]

In [8]:
#This cell turns the scraped data into a DataFrame
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def get_data_frame(rows, columns):
    """Extract the 'Symbol' column from scraped HTML table rows.

    Each row's ``<td>`` cells are paired positionally with ``columns``; the
    rows are then assembled into a DataFrame in a single step. The previous
    implementation grew the frame with ``DataFrame.append``, which was removed
    in pandas 2.0 and copied the whole frame for every row.

    Args:
        rows: iterable of objects exposing ``find_all('td')`` whose results
            have a ``.text`` attribute (e.g. BeautifulSoup ``<tr>`` tags).
        columns: list of column titles; must contain 'Symbol'.

    Returns:
        List with the 'Symbol' value of every row, in row order.

    Raises:
        KeyError: if 'Symbol' is not one of ``columns``.
        IndexError: if a row has more cells than there are columns.
    """
    records = [
        {columns[position]: cell.text for position, cell in enumerate(row.find_all('td'))}
        for row in rows
    ]
    stocks_df = pd.DataFrame(records, columns=columns)
    return list(stocks_df['Symbol'])

In [9]:
#This function will compile a sorted list of every publicly traded stock on the market (NYSE and NASDAQ) and save it
#   to a JSON file. It returns None; load the saved list with obtain_stock_tuple_list. The source is the EODDATA website.
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def _scrape_eoddata_symbols(page_url):
    """Download one EODData listing page and return its (symbol, name) tuples.

    Raises:
        AssertionError: if the response status is not 200, the 'quotes' table
            is missing or empty, or the table does not have the expected eight
            header cells.
        ValueError: if the header row has no 'Code' or 'Name' column.
    """
    page = requests.get(page_url)
    assert page.status_code == 200, 'Unexpected HTTP status {} for {}'.format(page.status_code, page_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    quotes_table = soup.find('table', {'class': 'quotes'})
    # Check for None explicitly; calling len() on a missing table raised TypeError before.
    assert quotes_table is not None and len(quotes_table) > 0, 'No quotes table found at {}'.format(page_url)
    header_titles = [th.text for th in quotes_table.find_all('th')]
    assert len(header_titles) == 8, 'Expected 8 header cells at {}, found {}'.format(page_url, len(header_titles))
    header_titles.pop(-1)
    # Resolve the column positions once instead of searching every cell of every row.
    code_position = header_titles.index('Code')
    name_position = header_titles.index('Name')
    pairs = []
    for row in quotes_table.find_all('tr')[1:]:  # the first <tr> is the header row
        cells = row.find_all('td')
        if len(cells) <= max(code_position, name_position):
            # Short rows carry no symbol/name pair; skip rather than reuse stale values.
            continue
        pairs.append((cells[code_position].text, cells[name_position].text))
    return pairs


def make_stock_tuple_list(): 
    """Scrape every NYSE and NASDAQ symbol from EODData and save them to JSON.

    For each letter A-Z the NYSE and NASDAQ listing pages are downloaded, the
    (symbol, name) pairs of both pages are de-duplicated, and the function
    sleeps 20 seconds before the next letter. The combined list is sorted, its
    length is printed, and it is written to
    ``List_of_Stocks/nyse_and_nasdaq_stock_symbol_list.json`` (the directory is
    created if it does not exist).

    Returns:
        None. The result is only available from the JSON file.
    """
    url = 'https://eoddata.com/stocklist/NYSE/{}.htm'
    url_nas = 'https://eoddata.com/stocklist/NASDAQ/{}.htm'
    complete_stock_symbol_list = []
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        nyse_list = _scrape_eoddata_symbols(url.format(letter))
        nas_list = _scrape_eoddata_symbols(url_nas.format(letter))
        # A set removes pairs that appear on both exchanges' pages for this letter.
        complete_stock_symbol_list.extend(set(nyse_list + nas_list))
        time.sleep(20)
    complete_stock_symbol_list.sort()
    print(len(complete_stock_symbol_list))
    # Make sure the target directory exists so the scrape is not lost at the write step.
    os.makedirs('List_of_Stocks', exist_ok=True)
    path_name = os.path.join('List_of_Stocks', 'nyse_and_nasdaq_stock_symbol_list.json')
    write_json(path_name, complete_stock_symbol_list)
    return None

In [10]:
#This function will pull the complete stock tuple list from the JSON format.
#Recommended Path is os.path.join('List_of_Stocks', 'nyse_and_nasdaq_stock_symbol_list.json')
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def obtain_stock_tuple_list(path):
    """Load and return the saved stock list stored as JSON at ``path``."""
    return read_json(path)

In [11]:
#This function will scrape the historical yahoo stock data for a given stock tuple
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def obtain_historical_pricing(stock_tuple):
    """Fetch Yahoo historical prices for ``stock_tuple[0]``.

    A first request is made without a start date; if it returns no rows a
    second request is made starting from 2004-01-01. The function sleeps 0.2
    seconds after fetching.

    Returns:
        The price DataFrame re-indexed by the first ten characters of each
        original index label (index name 'string_index'), or ``stock_tuple``
        itself when both requests return no rows.
    """
    yf.pdr_override()
    symbol = stock_tuple[0]
    prices = web.get_data_yahoo(symbol)
    if prices.shape[0] == 0:
        # Retry once with an explicit start date.
        prices = web.get_data_yahoo(symbol, start='2004-01-01')
    time.sleep(0.2)
    if prices.shape[0] == 0:
        return stock_tuple
    day_labels = [str(stamp)[0:10] for stamp in prices.index]
    return prices.assign(string_index=day_labels).set_index('string_index')

In [12]:
#This function will retrieve the investor sentiment data, while mapping it ahead to the current date.
#Recommended Path is os.path.join('Market_Trend_Data', 'Bullish_Bearish_Neutral_Data')
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def obtain_investor_sentiment(bull_path):
    """Load the investor sentiment history and carry the last row forward to today.

    The file at ``bull_path`` holds a JSON-encoded string produced by
    ``DataFrame.to_json(orient='columns')``. After loading, the index is
    converted to "yyyy-mm-dd" strings. Every business day after the last
    stored date, up to and including today, is added with the values of the
    last stored row (first three columns, labelled Bullish / Neutral / Bearish).

    Changes from the earlier version: the JSON text is wrapped in ``StringIO``
    (passing literal JSON to ``read_json`` is deprecated), values are read with
    ``.iloc`` instead of integer keys on a label-indexed Series, and the first
    generated date is dropped only when it really is the last stored date.

    Returns:
        DataFrame indexed by "yyyy-mm-dd" strings.
    """
    json_string = read_json(bull_path)
    bull_df = pd.read_json(io.StringIO(json_string), orient = 'columns')
    bull_df['string_index'] = [stamp.strftime("%Y-%m-%d") for stamp in bull_df.index]
    bull_df = bull_df.set_index('string_index')
    current_date = datetime.date.today()
    last_reported_date = bull_df.index[-1]
    date_list = pd.date_range(last_reported_date, current_date, freq = 'B').strftime("%Y-%m-%d").tolist()
    # Only drop the first entry when it duplicates the stored date; if the stored
    # date is not a business day the range already starts after it.
    if date_list and date_list[0] == last_reported_date:
        date_list.pop(0)
    if len(date_list) == 0:
        return bull_df
    n = len(date_list)
    last_row = bull_df.iloc[-1]
    supplemental_data_dictionary = {'Date': date_list,
                                    'Bullish': [last_row.iloc[0]]*n,
                                    'Neutral': [last_row.iloc[1]]*n,
                                    'Bearish': [last_row.iloc[2]]*n}
    supplemental_data_frame = pd.DataFrame(supplemental_data_dictionary).set_index('Date')
    return pd.concat([bull_df, supplemental_data_frame], axis = 0)

In [13]:
#This function will retrieve the inflation % data, while mapping it ahead to the current date
#Recommended Path is os.path.join('Market_Trend_Data', 'Inflation_Percentage_History')
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def obtain_inflation(inf_path):
    """Load the inflation history and carry the last value forward to today.

    The file at ``inf_path`` holds a JSON-encoded string produced by
    ``DataFrame.to_json(orient='columns')``. After loading, the index is
    converted to "yyyy-mm-dd" strings. Every business day after the last
    stored date, up to and including today, is added with the first-column
    value of the last stored row, labelled 'Inflation %'.

    Changes from the earlier version: the JSON text is wrapped in ``StringIO``
    (passing literal JSON to ``read_json`` is deprecated), the value is read
    with ``.iloc``, and an empty date range no longer raises IndexError.

    Returns:
        DataFrame indexed by "yyyy-mm-dd" strings.
    """
    inf_json_string = read_json(inf_path)
    inf_df = pd.read_json(io.StringIO(inf_json_string), orient='columns')
    inf_df['string_index'] = [stamp.strftime("%Y-%m-%d") for stamp in inf_df.index]
    inf_df = inf_df.set_index('string_index')
    current_date = datetime.date.today()
    last_reported_date_inf = inf_df.index[-1]
    date_list_inf = pd.date_range(last_reported_date_inf, current_date, freq = 'B').strftime("%Y-%m-%d").tolist()
    # Guard against an empty range (e.g. the stored date and today both fall on a weekend).
    if date_list_inf and date_list_inf[0] == last_reported_date_inf:
        date_list_inf.pop(0)
    if len(date_list_inf) == 0:
        return inf_df
    n = len(date_list_inf)
    last_value = inf_df.iloc[-1].iloc[0]
    supplemental_data_dictionary_inf = {'string_index': date_list_inf, 'Inflation %': [last_value]*n}
    supplemental_data_frame_inf = pd.DataFrame(supplemental_data_dictionary_inf).set_index('string_index')
    return pd.concat([inf_df, supplemental_data_frame_inf], axis = 0)

In [14]:
#These functions will retrieve the yahoo data for the DOW, S&P500, and NASDAQ COMPOSITE indices
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def _obtain_index_close(ticker, column_name):
    """Fetch Yahoo data for ``ticker`` and return its 'Close' column as a frame.

    The single column is renamed to ``column_name`` and the index is replaced
    by the first ten characters of each original index label (index name
    'string_index'). Sleeps 0.2 seconds before returning.
    """
    yf.pdr_override()
    quotes = web.get_data_yahoo(ticker)
    closes = pd.DataFrame(quotes['Close']).rename(columns ={'Close': column_name})
    day_labels = [str(stamp)[0:10] for stamp in closes.index]
    closes = closes.assign(string_index=day_labels).set_index('string_index')
    time.sleep(0.2)
    return closes


def obtain_dow():
    """Return the Dow Jones ('^DJI') closes as a one-column frame 'Dow_Close'."""
    return _obtain_index_close('^DJI', 'Dow_Close')


def obtain_sp():
    """Return the S&P 500 ('^GSPC') closes as a one-column frame 'SP500_Close'."""
    return _obtain_index_close('^GSPC', 'SP500_Close')


def obtain_nasdaq():
    """Return the NASDAQ Composite ('^IXIC') closes as a one-column frame 'NASDAQ_Close'."""
    return _obtain_index_close('^IXIC', 'NASDAQ_Close')

In [15]:
#This cell will obtain earnings data from the JSON format
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def obtain_earnings(earnings_path):
    """Load and return the earnings data stored as JSON at ``earnings_path``."""
    return read_json(earnings_path)

In [16]:
#This cell contains the function that will map the earnings reports onto the historical stock jsons, having values 
#  of either 0 (false) or 1 (true)
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def map_earnings(earnings_dict, stock_df, stock_symbol):
    """Flag the trading day that precedes each earnings release of a stock.

    Every date in ``stock_df``'s index gets a 0/1 value in the column
    'Releasing Earnings Tomorrow'. A date is set to 1 when the *next* row's
    date lists ``stock_symbol`` in ``earnings_dict``.

    The first row has no preceding row, so an earnings release on the first
    date flags nothing. (Previously ``index[int_pos-1]`` wrapped around to the
    last row and wrongly flagged the most recent date.)

    Args:
        earnings_dict: mapping of "yyyy-mm-dd" -> collection of symbols.
        stock_df: DataFrame whose index labels start with a "yyyy-mm-dd" date.
        stock_symbol: ticker to look up.

    Returns:
        DataFrame indexed by date string with the single 0/1 column.
    """
    day_labels = [str(label)[0:10] for label in stock_df.index]
    boolean_dictionary = {}
    for int_pos, day in enumerate(day_labels):
        if day in boolean_dictionary:
            # Duplicate date in the index: keep the first occurrence's result.
            continue
        boolean_dictionary[day] = 0
        if int_pos > 0 and stock_symbol in earnings_dict.get(day, ()):
            boolean_dictionary[day_labels[int_pos - 1]] = 1
    boolean_df = pd.DataFrame.from_dict(boolean_dictionary, orient='index', columns = ['Releasing Earnings Tomorrow'])
    return boolean_df

In [17]:
#This function will save the failed stock symbols into a json for later comparison
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def save_failed_scrapes(failed_stock_retrievals):
    """Write today's failed stock list to List_of_Stocks.

    The file is named 'failed_stock_symbol_list_' followed by today's date in
    yyyy-mm-dd form, so each day's run gets its own file.
    """
    dated_name = 'failed_stock_symbol_list_' + datetime.date.today().isoformat()
    write_json(os.path.join('List_of_Stocks', dated_name), failed_stock_retrievals)
    return None

In [18]:
#This function will merge all of the dataframes that have been created into one master dataframe
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def merge_dfs(stock_df, market_trend_data_df, inflation_data_df, dow_df, sp_df, nasdaq_df, earnings_df):
    """Join all per-date frames onto ``stock_df`` by index.

    The frames are merged one after another, in argument order, using pandas'
    default (inner) join on the index, so only dates present in every frame
    survive.
    """
    merged = stock_df
    for extra_frame in (market_trend_data_df, inflation_data_df, dow_df, sp_df, nasdaq_df, earnings_df):
        merged = pd.merge(merged, extra_frame, left_index=True, right_index=True)
    return merged

In [19]:
#This function will save the final master dataframe of a stock to its proper json
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def master_json(final_df, stock_tuple):
    """Save a stock's merged frame to Stock_Jsons/<symbol>_final_data.json.

    The frame is first serialised with ``to_json(orient='columns')`` and that
    string is then written through ``write_json``.
    """
    serialized_frame = final_df.to_json(orient = 'columns')
    file_name = stock_tuple[0]+'_final_data.json'
    write_json(os.path.join('Stock_Jsons', file_name), serialized_frame)
    return None

In [20]:
#This function will compare the failed stock retrievals of all of the scrapes
#THIS FUNCTION HAS NOT BEEN DEBUGGED
def failed_scrape_analysis():
    """Compare each saved failed-scrape list against the earliest one.

    Every file in 'List_of_Stocks' whose name starts with 'failed' is loaded.
    The files are processed in sorted name order; the first is the baseline.

    Returns:
        Dict keyed by ``(file_name, number_of_failed_stocks)``. The baseline
        file maps to the string 'No Exclusives for First Failed Scrapes'; every
        other file maps to the list of its entries that are not in the
        baseline. An empty dict is returned when no failed-scrape files exist
        (this case used to raise IndexError).
    """
    path = 'List_of_Stocks'
    failed_list = sorted(name for name in os.listdir(path) if name.startswith('failed'))
    if not failed_list:
        return {}
    dictionary_failed_stocks = {name: read_json(os.path.join(path, name)) for name in failed_list}
    initial_failed_stocks = dictionary_failed_stocks[failed_list[0]]
    failed_stock_info = {}
    for position, name in enumerate(failed_list):
        failed_stock_list = dictionary_failed_stocks[name]
        key = (name, len(failed_stock_list))
        if position == 0:
            failed_stock_info[key] = 'No Exclusives for First Failed Scrapes'
        else:
            failed_stock_info[key] = [stock for stock in failed_stock_list
                                      if stock not in initial_failed_stocks]
    return failed_stock_info

In [21]:
#This function will update investor sentiment data when released 
#This cell should be run every thursday
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def _percent_text_to_fraction(percent_text):
    """Convert text such as '30.5%' to the fraction 0.305."""
    return float(percent_text.strip().rstrip('%')) / 100


def update_sentiment_data(bull_path):
    """Scrape the latest AAII sentiment survey row and append it to the stored history.

    The first data row of the first table on the AAII results page is taken as
    the latest survey. If its Bullish / Neutral / Bearish values already match
    the last stored row, a message is printed and nothing is written.
    Otherwise the stored history is truncated at one week before the latest
    reported date, the latest values are added for each business day after
    that date up to the reported date, and the result is written back to
    ``bull_path``.

    Changes from the earlier version:
      * rows are collected into a list and turned into a DataFrame once
        (``DataFrame.append`` was removed in pandas 2.0);
      * the "already updated" check compares numbers; it used to compare the
        stored float with a text slice, which could never be equal;
      * percentages are parsed by stripping '%' rather than slicing four
        characters, so values like '5.5%' parse;
      * the stored JSON text is wrapped in ``StringIO`` for ``read_json``.

    NOTE(review): the request headers include a hardcoded session cookie
    captured from a browser; it is kept unchanged here, but it should be
    removed or loaded from configuration once confirmed unnecessary.
    NOTE(review): stored values are assumed to be fractions (e.g. 0.305), which
    is what this function writes - confirm the seed file uses the same unit.

    Returns:
        None.

    Raises:
        AssertionError: if the page does not return HTTP 200.
        ValueError: if the scraped table has no data rows.
    """
    url = 'https://www.aaii.com/sentimentsurvey/sent_results'
    headers = {'authority': 'www.aaii.com',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
               'accept-language': 'en-US,en;q=0.9',
               'referer': 'https://www.aaii.com/sentimentsurvey/sent_results',
               'DNT': '1',
               'sec-ch-ua-mobile': '?0',
               'sec-ch-ua-platform': '"Windows"',
               'sec-fetch-dest': 'style',
               'sec-fetch-mode': 'no-cors',
               'sec-fetch-site': 'same-origin',
               'Clear-Site-Data': '"cache", "cookies", "storage"',
               'cookie': 'CFID=23072366; CFTOKEN=2d012472349cc840-670874C0-0C4D-829C-F16D771B042838EE; JSESSIONID=5B858A89E32DC8109B3C7D6BD096D587.cfusion; HASLOGGEDIN=""; nlbi_1875454=Imi+eMas7C+8eZxO5D1AxQAAAACpFtuMdTiO2aODVUW3LBx/; visid_incap_1875454=SGqJ7WPmTQeFxRqZj5ywwuDouGMAAAAAQUIPAAAAAACVIyIktevmwmwDks+02l1+; incap_ses_617_1875454=CMJLK3ymw0bE8Sx7wgaQCOHouGMAAAAAU/XbeMfpYNqUCN+VUsaWNg==; _ga_M82LSVK8P9=GS1.1.1673062625.7.0.1673062625.0.0.0'}
    page = requests.get(url, headers = headers)
    assert page.status_code == 200
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find('table')
    table_rows = table.find_all('tr')
    # Iterating the header <tr> yields its child nodes; bare newline nodes are skipped.
    columns = [child.text for child in table_rows[0] if child.text != '\n']
    print(columns)
    records = [
        {columns[position]: cell.text for position, cell in enumerate(row.find_all('td'))}
        for row in table_rows[1:]
    ]
    if not records:
        raise ValueError('The sentiment table contained no data rows')
    new_df = pd.DataFrame(records, columns=columns)
    latest_row = new_df.iloc[0]
    bull_bullish = _percent_text_to_fraction(latest_row['Bullish'])
    bull_neutral = _percent_text_to_fraction(latest_row['Neutral'])
    bull_bearish = _percent_text_to_fraction(latest_row['Bearish'])

    json_string = read_json(bull_path)
    bull_df = pd.read_json(io.StringIO(json_string), orient = 'columns')
    bull_df['string_index'] = [stamp.strftime("%Y-%m-%d") for stamp in bull_df.index]
    bull_df = bull_df.set_index('string_index')

    stored_row = bull_df.iloc[-1]
    already_stored = all(
        math.isclose(float(stored_row[name]), scraped, abs_tol=1e-9)
        for name, scraped in (('Bullish', bull_bullish), ('Neutral', bull_neutral), ('Bearish', bull_bearish))
    )
    if already_stored:
        print('THE INVESTOR SENTIMENT DATA IS ALREADY UPDATED')
        return None

    latest_reported_date = get_date(latest_row['Reported Date'])
    latest_day = datetime.datetime.strptime(latest_reported_date, "%Y-%m-%d").date()
    second_latest_reported_date = (latest_day - datetime.timedelta(weeks=1)).strftime("%Y-%m-%d")
    date_list = pd.date_range(second_latest_reported_date, latest_reported_date, freq = 'B').strftime("%Y-%m-%d").tolist()
    # The previous report date stays in the stored history, so drop it from the new rows.
    if date_list and date_list[0] == second_latest_reported_date:
        date_list.pop(0)
    bull_dictionary = {'Date': date_list, 'Bullish': bull_bullish, 'Neutral': bull_neutral,
                       'Bearish': bull_bearish}
    bull_df = bull_df.loc[bull_df.head(1).index[0]:second_latest_reported_date]
    new_bull_df = pd.DataFrame(bull_dictionary).set_index('Date')
    appended_bull_df = pd.concat([bull_df, new_bull_df], axis = 0)
    write_json(bull_path, appended_bull_df.to_json(orient = 'columns'))
    return None
    

In [22]:
#This function will update the inflation data with the current month's inflation statistics
#This cell should be run every 12th/13th of every month
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def update_inflation_data(inf_path):
    """Interactively add a range of inflation values to the stored history.

    Prompts for a start date, an end date (both yyyy-mm-dd) and an inflation
    percentage. The stored history is truncated at the day before the start
    date, the entered percentage is added for every business day from start to
    end, and the result is written back to ``inf_path``. The truncated history
    (before the new rows are added) is printed.

    The stored JSON text is wrapped in ``StringIO`` because passing literal
    JSON to ``read_json`` is deprecated.

    Returns:
        None.

    Raises:
        ValueError: if a date is not in yyyy-mm-dd form or the percentage is
            not a number.
    """
    start_date = str(input('Please Input Start Date of Inflation Data yyyy-mm-dd'))
    end_date = str(input('Please Input End Date of Inflation Data yyyy-mm-dd'))
    inflation_percent = float(input('Please input inflation % 0.0'))
    date_list = pd.date_range(start_date, end_date, freq = 'B').strftime("%Y-%m-%d").tolist()
    start_day = datetime.datetime.strptime(start_date[0:10], "%Y-%m-%d").date()
    # Existing rows are kept up to and including the day before the new range starts.
    last_entry_date = (start_day - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    n = len(date_list)
    supplemental_data_dictionary_inf = {'string_index': date_list, 'Inflation %': [inflation_percent]*n}
    supplemental_data_frame_inf = pd.DataFrame(supplemental_data_dictionary_inf).set_index('string_index')
    stored_json = read_json(inf_path)
    inf_df = pd.read_json(io.StringIO(stored_json), orient='columns')
    inf_df['string_index'] = [stamp.strftime("%Y-%m-%d") for stamp in inf_df.index]
    inf_df = inf_df.set_index('string_index')
    inf_df = inf_df.loc[inf_df.head(1).index[0]:last_entry_date]
    appended_inf_df = pd.concat([inf_df, supplemental_data_frame_inf], axis = 0)
    print(inf_df)
    json_string = appended_inf_df.to_json(orient='columns')
    write_json(inf_path, json_string)
    
    
    
#This code is available in the future if I have too much time on my hands and want to automate a process that only 
#   requires manual input 1/month

#     url = 'https://data.bls.gov/timeseries/CUUR0000SA0L1E?output_view=pct_12mths'
#     headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',\
#               }
#     page = requests.get(url, headers=headers)
#     assert page.status_code == 200
#     soup = BeautifulSoup(page.text, 'html.parser')
#     table = soup.find_all('table')[1]
#     print(table)
#     headers = table.find_all('tr')[0]
#     headers_list = []
#     for header in headers:
#         header_title = header.text
#         if header_title == '\n':
#             continue
#         else:
#             headers_list.append(header_title)
#     columns = headers_list
#     print(columns)
#     rows = table.find_all('tr')[1:]
#     new_df = pd.DataFrame(columns=columns)
#     for row in rows:
#         elems = row.find('th')
#         elems.extend(row.find_all('td'))
#         print(elems)
#         dict_to_add = {}
#         for i,elem in enumerate(elems):
#             dict_to_add[columns[i]] = elem.text
#         new_df = new_df.append(dict_to_add, ignore_index=True)
#     return new_df

In [23]:
#This cell scrapes data from yahoo earnings calendar and returns a dictionary with date: [companies]
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def scraper_function(date_object, url, headers, dates):
    """Scrape one day of the Yahoo earnings calendar.

    The symbols listed for ``date_object`` are attached to ``date_object``
    itself when it is in ``dates``; otherwise to the first of the following six
    calendar days that is in ``dates``.

    Changes from the earlier version: the six-level copy-pasted lookup is a
    loop (its ``extend`` branches were unreachable because the result dict is
    always new), ``None`` is tested with ``is``, the ``headers`` argument is no
    longer overwritten, and the week window for a Sunday now starts on that
    Sunday so it contains the requested day.
    NOTE(review): the Sunday-to-Saturday window is inferred from the example
    referer URL used by the caller - confirm against the live site.

    Args:
        date_object: the day to scrape, "yyyy-mm-dd".
        url: URL template with three ``{}`` slots: week start, week end, day.
        headers: HTTP request headers.
        dates: collection of "yyyy-mm-dd" strings treated as trading days.

    Returns:
        ``None`` if the page has no table; otherwise a dict with at most one
        entry ``{trading_day: [symbols]}`` (empty when no trading day is found
        within six days).
    """
    today = datetime.datetime.strptime(date_object, "%Y-%m-%d").date()
    # weekday() is 0 for Monday .. 6 for Sunday; the modulo keeps a Sunday in its own week.
    week_start = today - datetime.timedelta(days=(today.weekday() + 1) % 7)
    week_end = week_start + datetime.timedelta(days=6)
    page_url = url.format(str(week_start), str(week_end), str(today))
    print(page_url)
    page = requests.get(page_url, headers=headers)
    print(page.status_code)
    if page.status_code == 404:
        print(date_object)
    soup = BeautifulSoup(page.text, 'html.parser')
    time.sleep(1)
    table = soup.find('table')
    if table is None:
        return None
    columns = [header_cell.text for header_cell in table.find_all('th')]
    rows = table.tbody.find_all('tr')
    stock_df_symbols_list = get_data_frame(rows, columns)
    if date_object in dates:
        return {date_object: stock_df_symbols_list}
    # Not a trading day: roll the symbols forward to the next trading day within six days.
    for offset in range(1, 7):
        candidate = str(today + datetime.timedelta(days=offset))
        if candidate in dates:
            return {candidate: stock_df_symbols_list}
    return {}

In [24]:
#This function updates the earnings reports for the week that it is called for 
#This function should be run every week
#ALL FUNCTIONS CONTAINED IN THIS CELL ARE DEBUGGED
def update_earnings_reports(start, end, earnings_path):
    """Scrape the earnings calendar for every day from ``start`` to ``end`` and save it.

    Each calendar day is scraped with ``scraper_function``. Days that are not
    in the trading calendar returned by ``custom_business`` have their symbols
    rolled forward to a later trading day, so several scraped days can map to
    the same key; their symbol lists are now merged. (Previously
    ``dict.update`` let the last scraped day overwrite the earlier ones.)

    The stored history at ``earnings_path`` is then updated: keys scraped in
    this run replace the stored entries for the same dates.

    Args:
        start: first day, "yyyy-mm-dd".
        end: last day, "yyyy-mm-dd".
        earnings_path: path of the JSON file holding ``{date: [symbols]}``.

    Returns:
        None.
    """
    dates = custom_business(start, end)
    print(dates)
    unsifted_dates = pd.date_range(start = start, end = end, freq ='D').strftime("%Y-%m-%d").tolist()
    url = 'https://finance.yahoo.com/calendar/earnings?from={}&to={}&day={}'
    headers = {'authority': 's.yimg.com',
               'method': 'GET',
               'path': '/uc/finance/dd-site/fonts/YahooSansFinancial-Regular-Web.woff2',
               'scheme': 'https',
               'accept': '*/*',
               'accept-encoding': 'gzip, deflate, br',
               'accept-language': 'en-US,en;q=0.9',
               'cache-control': 'no-cache',
               'origin': 'https://finance.yahoo.com',
               'pragma': 'no-cache',
               'referer': 'https://finance.yahoo.com/calendar/earnings?from=2023-12-24&to=2023-12-30&day=2023-12-28',
               'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
               'sec-ch-ua-mobile': '?0',
               'sec-ch-ua-platform': '"Windows"',
               'sec-fetch-dest': 'font',
               'sec-fetch-mode': 'cors',
               'sec-fetch-site': 'cross-site',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
               'Clear-Site-Data': '"cache", "cookies", "storage"'}
    earning_dict = {}
    for date_object in unsifted_dates:
        additive_dict = scraper_function(date_object, url, headers, dates)
        if additive_dict is None:
            continue
        for trading_day, symbols in additive_dict.items():
            # Merge instead of overwrite so rolled-forward weekend/holiday symbols are kept.
            earning_dict.setdefault(trading_day, []).extend(symbols)
    earnings_historical = read_json(earnings_path)
    # Remove then re-add refreshed dates, as before, so they end up last in the saved file.
    for key in earning_dict:
        earnings_historical.pop(key, None)
    earnings_historical.update(earning_dict)
    write_json(earnings_path, earnings_historical)
    return None

In [25]:
#This cell contains all of the overarching paths I need for my functions
# Saved list of (symbol, name) pairs for every NYSE / NASDAQ stock.
stock_tuple_path = os.path.join('List_of_Stocks', 'nyse_and_nasdaq_stock_symbol_list.json')
# The three market-wide data files all live in the Market_Trend_Data folder.
bull_path, inf_path, earnings_path = (
    os.path.join('Market_Trend_Data', leaf_name)
    for leaf_name in ('Bullish_Bearish_Neutral_Data', 'Inflation_Percentage_History', 'Earnings_Reports')
)


In [None]:
#This for loop will loop through all of the stock tuples and save the stock json data to json
# Load the stock list and the market-wide frames once; they are shared by every stock.
stock_tuples = obtain_stock_tuple_list(stock_tuple_path)
failed_stock_retrievals = []
dow_df = obtain_dow()
sp_df = obtain_sp()
nasdaq_df = obtain_nasdaq()
investor_sentiment_df = obtain_investor_sentiment(bull_path)
inflation_df = obtain_inflation(inf_path)
earnings_dict = obtain_earnings(earnings_path)

for stock_tuple in stock_tuples:
    check_wifi()
    historical_pricing_df = obtain_historical_pricing(stock_tuple)
    # A result of 90 entries or fewer (including the bare stock tuple returned
    # when nothing was fetched) is recorded as a failed retrieval.
    if len(historical_pricing_df) <= 90:
        failed_stock_retrievals.append(stock_tuple)
        continue
    earnings_df = map_earnings(earnings_dict, historical_pricing_df, stock_tuple[0])
    master_df = merge_dfs(
        historical_pricing_df,
        investor_sentiment_df,
        inflation_df,
        dow_df,
        sp_df,
        nasdaq_df,
        earnings_df,
    )
    master_json(master_df, stock_tuple)
    print('json_created')

print(len(failed_stock_retrievals))
save_failed_scrapes(failed_stock_retrievals)