####  Imports and Credentials

In [1]:
# Stop whatever Glue session is currently attached to this kernel before
# (re)configuring; the settings below only apply to a newly created session.
%stop_session

# Extra pip packages to install on the session, pinned to exact versions.
%additional_python_modules pandas_market_calendars==4.4.1, opencv-python==4.10.0.84, bs4==0.0.2

# Session settings: idle timeout in minutes, Glue runtime version, worker type, worker count.
%idle_timeout 15
%glue_version 4.0
%worker_type G.1X
%number_of_workers 10

# First regular statement in the cell; running it is what creates the session.
print('Start session')

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
There is no current session.
Additional python modules to be included:
pandas_market_calendars==4.4.1
opencv-python==4.10.0.84
bs4==0.0.2
Current idle_timeout is None minutes.
idle_timeout has been set to 15 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 10
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 10
Idle Timeout: 15
Session ID: 1f282af4-1511-46e0-a187-105513991d1b
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog t

In [2]:
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime
import re
import boto3
import pandas_market_calendars as mcal
import cv2 
from bs4 import BeautifulSoup
from io import StringIO




In [3]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Standard Glue bootstrap: obtain the SparkContext (reusing an existing one if
# present), wrap it in a GlueContext, and expose the Spark session and Job
# handle under the names the rest of the notebook uses.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)




In [None]:
from awsglue.utils import getResolvedOptions

# API keys and login credentials are supplied as job arguments instead of being
# written into the notebook; unpack them into the module-level names used below.
arguments = getResolvedOptions(sys.argv, ['apikey', 'tiingo_token', 'username', 'password'])
apikey, tiingo_token, username, password = (
    arguments[option] for option in ('apikey', 'tiingo_token', 'username', 'password')
)




#### Company Profile Functions

In [5]:
# Tickers whose Tiingo record is accepted even when the automatic exchange/name
# validation fails. Built once instead of on every call; the comments keep the
# original grouping labels.
_TIINGO_TICKER_EXCEPTIONS = frozenset([
    "DPZ","CPAY","CBOE","CB","ORLY","DOC","BXP","EL","LH","BF-B","GE","NSC","SLB",
    #500s
    "XOM","FRCB","SIVBQ","DISCK","MRKT","DINO","AGN",
    #600s
    "GGP","LVLT","DD","BBBYQ","MNKKQ","FTR","ENDPQ","TE","VAL","CNX","ALTR1",
    "CMCSA","PLL1","DTV1",
    #700s
    "LIFE2","DELL1","FHN","DFODQ","BIGGQ","RRD","SUN1","ATGE","SHLDQ","MMI1","SUNEQ","NSM1",
    "Q1","PTV","MIL1","XTO","BDK","SGP","WINMQ",
    #800s
    "WYE","CTX1","EQ1","ROH","SOV","UST1","AW","ABI1","BUD1","ASH","WWY","WEN","SAF2","FNMA","FMCC",
    "IAC","OMX1","CCTYQ","CBH1","CZR","DJ","AT1","BOL","AV1","TXU","ASN","SLR","CBSS","KSE",
    "BMET","MEL","PD1","PGL","APCC","EOP","SBL","BLS","NFB",
    #900s
    "FSH","GTW","CTB","EC1","ABS","CHIR1","CIN","MYG","BR1","RBK","KRB","DPHIQ","G1","MAY",
    "NXTL","TOY","GLK","VRTS1","S1","WLP1","SOTR","AWE","ONE1","AM1","FBF","CE1","BIIB","QTRN",
    "PHA","EHC","RATL","INCLF","AL1","SHEL","PDG","IMNX","CNXT1","WLL1","MEA1","KM","HM",
    "RAL","ENRNQ","GPU","TX1",
    #1000s
    "TOS","WB2","AGC1","ETS","CEN1","OK","SUB1","UK1","CGP","SMI1","PRD","VO1","FJ","ACKH",
    "PWJ","CG1","EFU1","UCM","MKG","YNR","NCE","RADCQ","GTE1","MZIAQ","WLA","CHA1","USW",
    "CSR1","TMC-A","SMS","MIR1","JOS","CBS1","ARC1","PNU","PBY1","FLTWQ","CNG","RNB","PPW",
    "CSE1","CYM","DGN","AIT1","PHB1","MBWM","RYC","NLC1","BFI","TA2","PVT1",
    #1100s
    "ATI1","ASND1","MWI","FMY1","AMP1","TCOMA","HBOC","SAI","PZE1","CCI1","GSX1","USS","FCN1",
    "AHM1","DI1","MNR1","BAY1","DIGI2","WMX","SK1","JH","BEV","SFS1","INGR1",
])


def get_tiingo_company_regular_data(ticker, company_name, company_profile):
    """Fill ``company_profile`` from Tiingo's daily metadata record for ``ticker``.

    The record is accepted when the ticker is in ``_TIINGO_TICKER_EXCEPTIONS``,
    or when it is listed on NYSE/NASDAQ and every word of ``company_name`` is
    either a generic filler word ("inc", "corp", ...), a substring of Tiingo's
    company name, or one of the first 20 words of Tiingo's description.

    Args:
        ticker: Ticker symbol to look up.
        company_name: Expected company name, used to validate the record.
        company_profile: Dict updated in place with ``company_name``,
            ``is_delisted``, ``description`` and ``exchange`` on success.

    Returns:
        True if the record was accepted and ``company_profile`` was updated.
        False if Tiingo had no record or the record failed validation
        (a message is printed).
        None if ``company_name`` is None, or if validation failed for one of
        the tickers whose failure is deliberately not reported.
    """
    if company_name is None:
        return None

    headers = {'Content-Type': 'application/json'}
    tiingo_URL = "https://api.tiingo.com/tiingo/daily/" + ticker + "?token=" + tiingo_token
    tiingo_data = requests.get(tiingo_URL, headers=headers).json()
    if tiingo_data == {'detail': 'Not found.'}:
        print("No Tiingo data retrieved for: " + ticker)
        return False

    exchange_code = tiingo_data["exchangeCode"]
    raw_description = tiingo_data["description"]
    raw_name = tiingo_data["name"]

    # SBNY was later delisted from NYSE/NASDAQ but is still treated as valid.
    is_valid_exchange = exchange_code in ["NASDAQ", "NYSE"] or ticker in ["SBNY"]

    # Normalise punctuation so names and descriptions can be compared word by word.
    description = (raw_description.replace(',', '').replace('. ', ' ').replace('.', ' ').lower()
                   if raw_description is not None else "")
    company_name = (company_name.lower().replace(',', '').replace('. ', ' ').replace('.', ' ').replace("'", '`')
                    .replace('(', "").replace(')', "").strip())

    filler_words = ["inc","co","corp","corporation","company","companies","the","-","&",
                    "int`l","plc","ltd","llc",'(the)']
    is_right_company = False
    if raw_name is not None:
        tiingo_name = raw_name.lower()
        description_head = description.split()[:20]
        is_right_company = all(word in filler_words or word in tiingo_name or word in description_head
                               for word in company_name.split())

    if ticker in _TIINGO_TICKER_EXCEPTIONS or (is_valid_exchange and is_right_company):
        company_profile["company_name"] = raw_name.title()
        if raw_description is None:
            # Tiingo may return a null description; previously this path raised
            # TypeError when slicing None.
            company_profile["is_delisted"] = False
            company_profile["description"] = None
        else:
            company_profile["is_delisted"] = "delisted" in raw_description[:15].lower()
            company_profile["description"] = raw_description.replace("DELISTED - ", '')
        company_profile["exchange"] = exchange_code if exchange_code in ["NYSE", "NASDAQ"] else None
        # A description that merely repeats the name carries no information.
        if company_profile["description"] == raw_name:
            company_profile["description"] = None
        return True

    if ticker not in ["BK","BF-B","LLY","GE","IBM"]:
        print("Invalid Tiingo data for ticker symbol: " + ticker)
        return False
    # NOTE(review): for the tickers above the original fell through and returned
    # None implicitly (no message). Kept as-is; confirm callers expect None here
    # rather than False.
    return None

    #DPS company data is not found on meta for tiingo or on FMP


# Position in the Tiingo metadata list at which tickers starting with each
# letter begin. These offsets are tied to one specific snapshot of that list.
_TIINGO_META_LETTER_OFFSETS = {
    'A': 0, 'B': 1787, 'C': 2763, 'D': 4607, 'E': 5244, 'F': 6052, 'G': 6866,
    'H': 7687, 'I': 8349, 'J': 9170, 'K': 9383, 'L': 9730, 'M': 10382, 'N': 11485,
    'O': 12269, 'P': 12752, 'Q': 13899, 'R': 14030, 'S': 14729, 'T': 16316, 'U': 17321,
    'V': 17665, 'W': 18158, 'X': 18638, 'Y': 18773, 'Z': 18851}


def get_tiingo_company_metadata(ticker, company_profile, meta_data_list):
    """Copy sector and industry for ``ticker`` from the Tiingo metadata list.

    Only the slice of ``meta_data_list`` belonging to the ticker's first letter
    is searched (see ``_TIINGO_META_LETTER_OFFSETS``). If the first character
    has no known offset, the whole list is searched instead.

    Args:
        ticker: Ticker symbol (compared lower-cased against each entry's
            ``"ticker"`` value).
        company_profile: Dict updated in place with ``sector``, ``industry``
            and ``source`` ("tiingo") on success.
        meta_data_list: List of dicts with ``ticker``, ``sector`` and
            ``industry`` keys.

    Returns:
        True if a matching entry was found (or the ticker is a hard-coded
        special case), otherwise False after printing a message.
    """
    #weird edge cases
    if ticker == "MDR":
        company_profile["sector"] = "Energy"
        company_profile["industry"] = "Oil & Gas Equipment & Services"
        company_profile["source"] = "tiingo"
        return True
    if ticker == "MYG": ticker = "MYG1"
    if ticker == "MII": ticker = "MII1"

    letters = list(_TIINGO_META_LETTER_OFFSETS)
    first_letter = ticker[:1]
    if first_letter in _TIINGO_META_LETTER_OFFSETS:
        starting_index = _TIINGO_META_LETTER_OFFSETS[first_letter]
        next_position = letters.index(first_letter) + 1
        # For the last letter the slice must run to the end of the list. The
        # previous stop of -1 silently dropped the final entry.
        stopping_index = (_TIINGO_META_LETTER_OFFSETS[letters[next_position]]
                          if next_position < len(letters) else None)
        candidates = meta_data_list[starting_index:stopping_index]
    else:
        # No offset known for this first character: search everything.
        candidates = meta_data_list

    wanted = ticker.lower()
    for meta_data in candidates:
        if wanted == meta_data["ticker"]:
            company_profile["sector"] = meta_data["sector"]
            company_profile["industry"] = meta_data["industry"]
            company_profile["source"] = "tiingo"
            return True

    print("No tiingo meta data found for: " + ticker)
    return False


def get_fmp_metadata(ticker, company_name, company_profile, index):
    """Fill ``company_profile`` from Financial Modeling Prep's profile endpoint.

    The first profile returned for ``ticker`` is accepted when the ticker is a
    listed exception for the range ``index`` falls in, or when the profile is
    on NYSE/NASDAQ and its company name matches ``company_name`` (every
    non-filler word is a substring of FMP's name, or the first eight characters
    of ``company_name`` appear in it).

    Args:
        ticker: Ticker symbol to look up.
        company_name: Expected company name, used to validate the profile.
        company_profile: Dict updated in place with ``company_name``,
            ``sector``, ``industry``, ``is_delisted``, ``description``,
            ``source`` ("fmp") and ``exchange``.
        index: Integer used only to decide which ticker exceptions apply.

    Returns:
        True if the profile was accepted and has a sector; False otherwise
        (a message is printed).
    """
    request = requests.get("https://financialmodelingprep.com/api/v3/profile/" + ticker + "?apikey=" + apikey)
    fmp_bio_list = request.json()
    if len(fmp_bio_list) == 0:
        print("No FMP data retrieved for: " + ticker)
        return False

    fmp_data = fmp_bio_list[0]
    is_valid_exchange = fmp_data["exchangeShortName"] in ["NASDAQ", "NYSE"]
    company_name = company_name.replace('. ', ' ').replace('.', ' ') #do not remove commas, just "."

    filler_words = ["inc","co","corp","corporation","company","companies","the","-","&",
                    "int`l","plc","ltd","llc",'(the)']
    is_right_company = False
    if fmp_data["companyName"] is not None:
        fmp_name = fmp_data["companyName"].lower()
        is_right_company = all(word in filler_words or word in fmp_name
                               for word in company_name.lower().split())
        if not is_right_company:
            # Fallback: accept when the first eight characters of the expected
            # name appear in FMP's (dot-normalised) name.
            is_right_company = company_name.lower()[:8] in fmp_name.replace('. ', ' ').replace('.', ' ')

    # Per-range ticker exceptions that bypass validation. The first list
    # previously contained "IMB", a typo for "IBM".
    has_exception = (
        (index < 500 and ticker in ["BK","BF-B","LLY","GE","IBM"])              #<500; unneeded
        or (500 <= index < 600 and ticker in ["XOM"])                           #500s; unneeded
        #nothing in 600s
        or (800 <= index < 900 and ticker in ["FNMA", "FMCC", "IAC"])           #800s
        or (900 <= index < 1000 and ticker in ["SHEL"])                         #900s
        #nothing in 1000s, 1100s
    )

    if not (has_exception or (is_valid_exchange and is_right_company)):
        print("Invalid FMP data for ticker symbol: " + ticker)
        return False

    company_profile["company_name"] = fmp_data["companyName"]
    company_profile["sector"] = fmp_data["sector"]
    company_profile["industry"] = fmp_data["industry"]
    company_profile["is_delisted"] = not fmp_data["isActivelyTrading"]
    company_profile["description"] = fmp_data["description"]
    company_profile["source"] = "fmp"
    company_profile["exchange"] = fmp_data["exchangeShortName"] if fmp_data["exchangeShortName"] in ["NASDAQ", "NYSE"] else None

    # NOTE(review): the profile has already been written when this check fails,
    # so a False return can leave FMP values in company_profile. Order kept
    # from the original; confirm callers overwrite them.
    if fmp_data["sector"] in [None, ""]:
        print("Issue with fmp metadata: " + ticker)
        return False
    return True

#will return the company profile metadata for edge cases or return None
def get_company_metadata_for_edge_cases(ticker, index):
    """Return a hard-coded company profile for a known edge case, or None.

    Profiles are keyed by the exact ``(ticker, index)`` pair, so the same
    ticker at a different index does not match. Every entry is marked
    ``is_delisted: True`` with ``description: None``. Some entries carry
    ``"source": "tiingo"``; the others have no ``"source"`` key at all.

    Args:
        ticker: Ticker symbol.
        index: Integer paired with the ticker to form the lookup key
            (presumably the company's position in the list being processed;
            confirm against the caller).

    Returns:
        The matching profile dict, or None when the pair is not listed.
    """
    # The table is rebuilt on every call, so each caller gets a fresh dict.
    company_profiles = {
        ("INFO", 545): {
            "company_name": "IHS Markit Ltd","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Information Technology Services","exchange": "NYSE"},
        ("STI", 586): {
            "company_name": "SunTrust Banks Inc","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","source": "tiingo","exchange": "NYSE"},
        ("LLL", 595): {
            "company_name": "L3 Communications Holdings Inc","is_delisted": True,"description": None,
            "sector": "Industrials","industry": "Aerospace & Defense","source": "tiingo","exchange": "NYSE"},
        ("CA", 611): {
            "company_name": "CA Inc","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Software - Infrastructure","source": "tiingo","exchange": "NYSE"},
        ("XL", 612): {
            "company_name": "XL Group Ltd","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Insurance - Property & Casualty","exchange": "NYSE"},
        ("DPS", 614): {
            "company_name": "Dr Pepper Snapple Group Inc","is_delisted": True,"description": None,
            "sector": "Consumer Defensive","industry": "Beverages - Non-Alcoholic","exchange": "NYSE"},
        ("WYND", 620): {
            "company_name": "Wyndham Worldwide Corp","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Lodging","exchange": "NYSE"},
        ("DOW", 629): {
            "company_name": "Dow Chemical Company","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Chemicals","exchange": "NYSE"},
        ("HAR", 648): {
            "company_name": "Harman International Industries IncDE","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Consumer Electronics","exchange": "NYSE"},
        ("SE", 652): {
            "company_name": "Spectra Energy Corp","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Gas","exchange": "NYSE"},
        ("EMC", 659): {
            "company_name": "EMC Corporation","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Hardware, Equipment & Parts","exchange": "NYSE"},
        ("FRX", 714): {
            "company_name": "Forest Laboratories Inc","is_delisted": True,"description": None,
            "sector": "Healthcare","industry": "Biotechnology","source": "tiingo","exchange": "NYSE"},
        ("LSI", 716): {
            "company_name": "LSI Corp","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Semiconductors","source": "tiingo","exchange": "NASDAQ"},
        ("BEAM", 718): {
            "company_name": "Beam Suntory Inc","is_delisted": True,"description": None,
            "sector": "Consumer Defensive","industry": "Beverages - Wineries & Distilleries","exchange": "NYSE"},
        ("JCP", 726): {
            "company_name": "JCPenney","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Department Stores","exchange": "NYSE"},
        ("NYX", 727): {
            "company_name": "NYSE Euronext","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Asset Management","source": "tiingo","exchange": "NYSE"},
        ("S", 732): {
            "company_name": "Sprint Corp","is_delisted": True,"description": None,
            "sector": "Communication Services","industry": "Telecom Services","source": "tiingo","exchange": "NYSE"},
        ("PGN", 750): {
            "company_name": "Progress Energy Inc.","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Electric","source": "tiingo","exchange": "NYSE"},
        ("NVLS", 752): {
            "company_name": "Novellus Systems Inc","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Semiconductor Equipment & Materials","source": "tiingo","exchange": "NASDAQ"},
        ("EP", 753): {
            "company_name": "El Paso Corp","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas E&P","source": "tiingo","exchange": "NYSE"},
        ("CEG", 757): {
            "company_name": "Constellation Energy Group Inc","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Electric","source": "tiingo","exchange": "NYSE"},
        ("CPWR", 758): {
            "company_name": "Compuware Corporation","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Software - Services","exchange": "NASDAQ"},
        ("MI", 767): {
            "company_name": "Marshall & Ilsley Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","source": "tiingo","exchange": "NYSE"},
        ("SII", 782): {
            "company_name": "Smith International","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas Equipment & Services","exchange": "NYSE"},
        ("STR", 784): {
            "company_name": "Questar Corp","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas E&P","source": "tiingo","exchange": "NYSE"},
        ("JAVA", 792): {
            "company_name": "Sun Microsystems Inc.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Scientific & Technical Instruments","source": "tiingo","exchange": "NASDAQ"},
        ("DYN", 797): {
            "company_name": "Dynegy Inc.","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Electric","source": "tiingo","exchange": "NYSE"},
        ("HPC", 828): {
            "company_name": "Hercules Inc.","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Specialty Chemicals","source": "tiingo","exchange": "NYSE"},
        ("EDS", 842): {
            "company_name": "Electronic Data Systems","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Information Technology Services","exchange": "NYSE"},
        ("TEK", 861): {
            "company_name": "Tektronix Inc.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Semiconductor Equipment & Materials","source": "tiingo","exchange": "NYSE"},
        ("ADCT", 879): {
            "company_name": "ADC Telecommunications Inc.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Communication Equipment","source": "tiingo","exchange": "NASDAQ"},
        ("MEDI", 880): {
            "company_name": "MedImmune Inc.","is_delisted": True,"description": None,
            "sector": "Healthcare","industry": "Biotechnology","source": "tiingo","exchange": "NASDAQ"},
        ("CMX", 885): {
            "company_name": "Caremark Rx Inc.","is_delisted": True,"description": None,
            "sector": "Consumer Defensive","industry": "Pharmaceutical Retailers","source": "tiingo","exchange": "NYSE"},
        ("LU", 897): {
            "company_name": "Lucent Technologies Inc.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Communication Equipment","source": "tiingo","exchange": "NYSE"},
        ("MERQ", 924): {
            "company_name": "Mercury Interactive Corp","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Software - Application","source": "tiingo","exchange": "NASDAQ"},
        ("GP", 927): {
            "company_name": "Georgia-Pacific Corporation","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Paper & Paper Products","exchange": "NYSE"},
        ("PVN", 931): {
            "company_name": "Providian Financial Corporation","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","exchange": "NYSE"},
        ("PWER", 942): {
            "company_name": "Power-One, Inc","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Semiconductor Equipment & Materials","exchange": "NASDAQ"},
        ("PSFT", 943): {
            "company_name": "People Soft Inc","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Information Technology Services","exchange": "NASDAQ"},
        ("UPC", 952): {
            "company_name": "Union Planters Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","source": "tiingo","exchange": "NYSE"},
        ("PCS", 956): {
            "company_name": "Sprint PCS Group","is_delisted": True,"description": None,
            "sector": "Communication Services","industry": "Telecom Services","exchange": "NYSE"},
        ("NSI", 995): {
            "company_name": "National Service Industries Inc","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Personal Services","source": "tiingo","exchange": "NYSE"},
        ("OAT", 1007): {
            "company_name": "Quaker Oats Co","is_delisted": True,"description": None,
            "sector": "Consumer Defensive","industry": "Packaged Foods","exchange": "NYSE"},
        ("H", 1010): {
            "company_name": "Harcourt General Inc.","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Publishing","exchange": "NYSE"},
        ("FPC", 1032): {
            "company_name": "Florida Progress Corp","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Electric","source": "tiingo","exchange": "NYSE"},
        ("COMS", 1047): {
            "company_name": "3Com Corp.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Communication Equipment","source": "tiingo","exchange": "NASDAQ"},
        ("USW", 1051): {
            "company_name": "US West Inc.","is_delisted": True,"description": None,
            "sector": "Communication Services","industry": "Telecommunications Services","exchange": "NYSE"},
        ("NLV", 1074): {
            "company_name": "NextLevel Systems Inc.","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Communication Equipment","exchange": "NYSE"},
        ("LI", 1077): {
            "company_name": "Laidlaw International, Inc.","is_delisted": True,"description": None,
            "sector": "Industrials","industry": "General Transportation","exchange": "NYSE"},
        ("TEN", 1083): {
            "company_name": "Tenneco Inc.","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Auto Parts","source": "tiingo","exchange": "NYSE"},
        ("AR", 1084): {
            "company_name": "Asarco Inc.","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Other Industrial Metals & Mining","source": "tiingo","exchange": "NYSE"},
        ("SNT", 1085): {
            "company_name": "Sonat Inc.","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas Midstream","source": "tiingo","exchange": "NYSE"},
        ("BT", 1105): {
            "company_name": "Bankers Trust Corp.","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","source": "tiingo","exchange": "NYSE"},
        ("AN", 1117): {
            "company_name": "Amoco Corp.","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas Refining & Marketing","source": "tiingo","exchange": "NYSE"},
        ("GRN", 1119): {
            "company_name": "General Reinsurance Corporation","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Reinsurance","exchange": "NYSE"},
        ("AS", 1120): {
            "company_name": "Armco Inc.","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Steel","source": "tiingo","exchange": "NYSE"},
        ("STO", 1121): {
            "company_name": "Stone Container Corp.","is_delisted": True,"description": None,
            "sector": "Basic Materials","industry": "Paper & Paper Products","exchange": "NYSE"},
        ("C", 1122): {
            "company_name": "Chrysler Corp","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Auto Manufacturers","source": "tiingo","exchange": "NYSE"},
        ("MCIC", 1130): {
            "company_name": "MCI Communications Corp","is_delisted": True,"description": None,
            "sector": "Communication Services","industry": "Telecommunications Services","exchange": "NASDAQ"},
        ("MST", 1133): {
            "company_name": "Mercantile Stores Inc","is_delisted": True,"description": None,
            "sector": "Consumer Cyclical","industry": "Department Stores","exchange": "NYSE"},
        ("WAI", 1134): {
            "company_name": "Western Atlas Inc.","is_delisted": True,"description": None,
            "sector": "Energy","industry": "Oil & Gas Equipment & Services","source": "tiingo","exchange": "NYSE"},
        ("BNL", 1138): {
            "company_name": "Beneficial Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Credit Services","source": "tiingo","exchange": "NYSE"},
        ("GNT", 1140): {
            "company_name": "Green Tree Financial Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Mortgage Finance","exchange": "NYSE"},
        ("PET", 1141): {
            "company_name": "Pacific Enterprises","is_delisted": True,"description": None,
            "sector": "Utilities","industry": "Utilities - Regulated Gas","source": "tiingo","exchange": "NYSE"},
        ("DEC", 1142): {
            "company_name": "Digital Equipment Corp","is_delisted": True,"description": None,
            "sector": "Technology","industry": "Information Technology Services","exchange": "NYSE"},
        ("CFL", 1143): {
            "company_name": "Corestates Financial Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","source": "tiingo","exchange": "NYSE"},
        ("FG", 1144): {
            "company_name": "USF&G Corp","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Insurance - Life","exchange": "NYSE"},
        ("CBB", 1150): {
            "company_name": "Caliber System Inc.","is_delisted": True,"description": None,
            "sector": "Industrials","industry": "Integrated Freight & Logistics","exchange": "NYSE"},
        ("BBI", 1152): {
            "company_name": "Barnett Banks Inc.","is_delisted": True,"description": None,
            "sector": "Financial Services","industry": "Banks - Regional","exchange": "NYSE"}
    }
    # Both parts of the key must match; .get() yields None for unlisted pairs.
    key = (ticker, index)
    return company_profiles.get(key)






#will return the stock exchange(NYSE or NASDAQ) for edge cases or return None
def get_stock_exchange_for_edge_cases(index):
    """Look up the hard-coded exchange for an edge-case index.

    Args:
        index: Integer key identifying the edge case.

    Returns:
        "NASDAQ" or "NYSE" when the index is listed, otherwise None.
    """
    # Indices overridden to NASDAQ, one row per hundred-range.
    nasdaq_indices = (
        129, 527,
        635, 650, 683, 692,
        721, 728, 748,
        872, 874, 889,
        915, 935, 940, 948, 962, 968, 982, 983,
        1101, 1112, 1114, 1132,
    )
    # Indices overridden to NYSE, grouped by hundred-range.
    nyse_indices = (
        # 600s
        636, 662, 691, 699,
        # 700s
        736, 739, 744, 754, 762, 766, 772, 781, 783, 785, 787,
        # 800s
        800, 803, 806, 818, 824, 826, 827, 835, 838, 840, 841, 848, 853, 854,
        859, 860, 863, 864, 866, 868, 873, 876, 886, 888, 890, 892, 893, 898,
        # 900s
        901, 907, 909, 911, 916, 918, 923, 925, 930, 932, 933, 938, 939, 941,
        947, 949, 953, 954, 958, 960, 964, 976, 978, 981, 988, 990, 991, 993,
        994, 996, 997, 999,
        # 1000s
        1000, 1003, 1006, 1015, 1016, 1017, 1018, 1019, 1022, 1024, 1027, 1033,
        1035, 1036, 1037, 1038, 1040, 1043, 1046, 1048, 1052, 1053, 1056, 1057,
        1058, 1059, 1061, 1063, 1064, 1066, 1067, 1068, 1069, 1071, 1073, 1075,
        1079, 1081, 1086, 1087, 1088, 1089, 1094, 1096, 1097, 1098, 1099,
        # 1100s
        1100, 1106, 1107, 1110, 1116, 1118, 1123, 1124, 1125, 1126, 1127, 1128,
        1129, 1131, 1135, 1146, 1147,
    )
    exchange_by_index = dict.fromkeys(nyse_indices, "NYSE")
    exchange_by_index.update(dict.fromkeys(nasdaq_indices, "NASDAQ"))
    return exchange_by_index.get(index)




#### Company Marketcap Functions

In [6]:
#get market cap data from fmp
def get_fmp_market_cap_data(original_ticker, index):
    """Download historical market-cap data for a ticker from FMP.

    Requests are made in five-year windows. The first window is chosen from
    ``index`` (larger indices start further back in time); each following
    request moves the window five years earlier, and paging stops at the first
    empty response.

    Args:
        original_ticker: Ticker symbol as used by FMP.
        index: Integer that selects the most recent window to request.

    Returns:
        List of ``{"date": ..., "market_cap": ...}`` dicts, reversed from the
        order in which they were fetched so that it runs earliest to latest.
    """
    start_year = 2020
    end_year = 2024
    if index > 570:
        start_year = 2016
        end_year = 2020
    if index > 680:
        start_year = 2012
        end_year = 2016
    if index > 750:
        # One further four-year step back for every additional hundred indices.
        years_to_subtract = ((index - 550) // 100) * 4
        start_year = 2020 - years_to_subtract
        end_year = 2024 - years_to_subtract
    if index > 1200:
        # Was 1922, which made a 75-year window that overlapped every later
        # page; every other window here spans five calendar years.
        start_year = 1992
        end_year = 1996

    fmp_market_cap_data = []
    while True:
        fmp_url = f"https://financialmodelingprep.com/api/v3/historical-market-capitalization/{original_ticker}?from={start_year}-01-01&to={end_year}-12-31&apikey={apikey}"
        response = requests.get(fmp_url)
        result = response.json()
        if not isinstance(result, list):
            # A non-list body (e.g. an error object) is never "empty", so it
            # previously kept this loop running forever.
            print("Unexpected FMP market cap response for: " + original_ticker)
            break
        if len(result) == 0:
            break
        fmp_market_cap_data += result
        start_year -= 5
        end_year -= 5

    fmp_market_cap_data = [{"date": data["date"], "market_cap": data["marketCap"]} for data in fmp_market_cap_data]
    fmp_market_cap_data.reverse() #reverse the list to go from earliest to latest date like tiingo data
    return fmp_market_cap_data


#get market cap data from tiingo
def get_tiingo_market_cap_data(ticker, added_date,marketcap_metadata):
    """Fetch daily market-cap data from Tiingo, back-filling from prices if needed.

    Tiingo's fundamentals endpoint is read first. If its first non-null row is
    later than both ``added_date`` and 1998-01-02, the daily price history is
    downloaded and the earlier market caps are estimated as
    ``close * (first known market cap / close on that date)``.

    Args:
        ticker: Ticker symbol used in both Tiingo URLs.
        added_date: ``datetime`` from which data is wanted.
        marketcap_metadata: Dict updated in place; the key
            ``"num_trading_days_to_calculate"`` is written when the price
            history is used for back-filling.

    Returns:
        list[dict]: ``{"date": "YYYY-MM-DD", "market_cap": ...}`` records,
        earliest first; an empty list when Tiingo has no non-null market cap.
    """
    earliest_date_needed = datetime.strptime("1998-01-02", "%Y-%m-%d")
    headers = {'Content-Type': 'application/json'}
    tiingo_URL = "https://api.tiingo.com/tiingo/fundamentals/" + ticker +"/daily?token=" + tiingo_token
    requestResponse = requests.get(tiingo_URL, headers=headers)
    tiingo_market_cap_data = [{"date":data["date"].split("T")[0],"market_cap":data["marketCap"]}
                              for data in requestResponse.json()]

    #get rid of possible market cap data with null market cap values at start
    # If every value is null nothing usable is left, so the result is empty
    # rather than a list of null rows.
    first_valid = next((i for i, data in enumerate(tiingo_market_cap_data)
                        if data["market_cap"] is not None), None)
    tiingo_market_cap_data = [] if first_valid is None else tiingo_market_cap_data[first_valid:]

    if len(tiingo_market_cap_data) == 0:
        print("Empty tiingomarket cap data for: " + ticker)
        return tiingo_market_cap_data

    #if market cap data already has enough data, return it early; else, look into stock price data
    first_date_in_market_cap_data = datetime.strptime(tiingo_market_cap_data[0]["date"], "%Y-%m-%d")
    if (first_date_in_market_cap_data <= added_date
        or first_date_in_market_cap_data <= earliest_date_needed):
        return tiingo_market_cap_data

    tiingo_URL = ("https://api.tiingo.com/tiingo/daily/" + ticker +
                "/prices?startDate=1950-01-02&token=" + tiingo_token)
    requestResponse = requests.get(tiingo_URL, headers=headers)
    tiingo_stock_price_data = requestResponse.json()

    if (len(tiingo_stock_price_data) > len(tiingo_market_cap_data)):
        date_needed = max(added_date, earliest_date_needed)

        nyse = mcal.get_calendar('NYSE')
        # Get the market open days within the specified range
        missing_trading_days = nyse.valid_days(start_date=date_needed, end_date=first_date_in_market_cap_data)
        num_missing_trading_days = len(missing_trading_days)

        marketcap_metadata["num_trading_days_to_calculate"] = num_missing_trading_days
        if num_missing_trading_days > 250:
            print(str(first_date_in_market_cap_data) + " - " + str(date_needed))
            print("\tTiingo: needed to calculate # of additional trading days: " + str(num_missing_trading_days))

        # Collect the price rows that precede the first market-cap date and
        # derive the market-cap/price ratio from the first row on or after it.
        earlier_price_rows = []
        last_stock_market_cap_ratio = None
        for daily_stock_price_data in tiingo_stock_price_data:
            current_date = datetime.strptime(daily_stock_price_data["date"].split("T")[0], "%Y-%m-%d")
            if current_date >= first_date_in_market_cap_data:
                last_stock_market_cap_ratio = tiingo_market_cap_data[0]["market_cap"] / daily_stock_price_data["close"]
                if current_date != first_date_in_market_cap_data:
                    print("Same date match not found")
                break
            earlier_price_rows.append(daily_stock_price_data)

        if last_stock_market_cap_ratio is None:
            # No price row reaches the first market-cap date, so there is no
            # ratio to scale with; keep the market-cap data as it is.
            print("Tiingo data error: no stock price on or after first market cap date: " + ticker)
        else:
            new_market_cap_data = [{"date": data["date"].split("T")[0]
                                    ,"market_cap": round(data["close"]*last_stock_market_cap_ratio, 2)}
                                   for data in earlier_price_rows]
            tiingo_market_cap_data = new_market_cap_data + tiingo_market_cap_data

        if tiingo_stock_price_data[-1]["date"].split("T")[0] != tiingo_market_cap_data[-1]["date"]:
            print("Tiingo data error: Ending Dates aren't the same: " + ticker)

    return tiingo_market_cap_data


#4 cases
def get_misc_market_cap_data(index, ticker, marketcap_metadata, s3):
    """Build market-cap data from a historical price CSV stored in S3.

    The CSV ``historical_stock_price/<index>_<ticker>.csv`` is read from the
    project bucket and every price is scaled by a fixed market-cap/price
    ratio, the reference market cap being multiplied by 1e9 in the output.

    * Any ticker other than ``"DELL"``: columns ``Date`` (month/day/year) and
      ``Price`` are used, the ratio is anchored on the first CSV row, and the
      rows are returned in reverse file order.
    * ``"DELL"``: the file is split into sections by a fixed notice row. Between
      two notice rows each row holds two date/price pairs (``Date`` /
      ``Stock Close Price`` and ``Date.1`` / ``Stock Close Price.1``); after the
      last notice row only the first pair is used. Rows are kept in file order,
      the first pair column of a section before the second.

    Args:
        index: Company index; part of the S3 key and, for non-DELL tickers,
            the key into the reference market-cap table.
        ticker: Ticker symbol; part of the S3 key.
        marketcap_metadata: Dict updated in place with the ``"source"``.
        s3: Client object exposing ``get_object(Bucket=..., Key=...)``.

    Returns:
        list[dict]: ``{"date": "YYYY-MM-DD", "market_cap": ...}`` records.
    """
    def to_iso_date(us_date):
        # month/day/year -> year-month-day. zfill pads single digits and leaves
        # already padded fields alone, so both input styles give YYYY-MM-DD.
        parts = us_date.split("/")
        return parts[2] + "-" + parts[0].zfill(2) + "-" + parts[1].zfill(2)

    bucket_name = 'sp500-historical-analysis-project'
    file_key = "historical_stock_price/" + str(index) + "_" + ticker + ".csv"
    response = s3.get_object(Bucket=bucket_name, Key=file_key) # Get the object from S3
    csv_content = response['Body'].read().decode('utf-8') # Read the file content as a string
    data = pd.read_csv(StringIO(csv_content))
    marketcaps_dict = {89:51.8, 90:51.8, 162:47, 728:24.4}

    if ticker != "DELL":
        prices = data["Price"].astype(float)
        marketcap_metadata["source"] = "investing.com"
        marketcap_ratio = marketcaps_dict[index] / float(prices.iloc[0])
        marketcap_data = [{"date": to_iso_date(raw_date)
                           ,"market_cap": round(price * marketcap_ratio * 1000000000, 2)}
                          for raw_date, price in zip(data["Date"], prices)]
        marketcap_data.reverse()
        return marketcap_data

    #for 728, DELL
    marketcap_metadata["source"] = "https://i.dell.com/sites/csdocuments/Corporate_secure_Documents/en/dell-closing-costs.pdf"
    marketcap_ratio = marketcaps_dict[728] / 13.73

    def to_record(raw_date, close_price):
        return {"date": to_iso_date(raw_date)
                ,"market_cap": round(float(close_price) * marketcap_ratio * 1000000000, 2)}

    # Rows holding this notice separate the sections of the file.
    indices = list(data[data["Date"] == "Please note that these closing prices reflect the Cumulative Split-Adjusted Price."].index)
    marketcap_data = []
    for section_start, section_stop in zip(indices[:-1], indices[1:]): #4 columns of data to process, Date, Price, Date2, Price 2
        rows = data.iloc[section_start + 1:section_stop]
        first_pair = [to_record(d, p) for d, p in zip(rows["Date"], rows["Stock Close Price"])]
        second_pair = [to_record(d, p) for d, p in zip(rows["Date.1"], rows["Stock Close Price.1"])]
        marketcap_data = marketcap_data + first_pair + second_pair

    #for last part of data, only 2 columns
    tail_rows = data.iloc[indices[-1] + 1:]
    marketcap_data += [to_record(d, p) for d, p in zip(tail_rows["Date"], tail_rows["Stock Close Price"])]
    return marketcap_data


#about 36 uses
def get_companiesmarketcap_market_cap_data(index, start_date, end_date, marketcap_metadata):
    """Scrape market-cap history from companiesmarketcap.com and resample it per trading day.

    The page for ``index`` is downloaded, the ``data = [{...}];`` array is
    extracted from its first ``text/javascript`` script tag, and a value is
    produced for every NYSE trading day between ``start_date`` and ``end_date``:

    * days before the current pair of data points use the first point's value,
    * days on or between the two points are linearly interpolated,
    * days after the last available point use the last point's value.

    Args:
        index: Company index; must be a key of the URL table below.
        start_date: Start date as ``"%B %d, %Y"`` text (e.g. "January 2, 1998");
            dates on or before 1998-01-02 are replaced by that date.
        end_date: End date text; replaced by "September 30, 2024" for the
            indexes 366, 377 and 499.
        marketcap_metadata: Dict updated in place with the ``"source"``.

    Returns:
        list[dict] | None: ``{"date": "YYYY-MM-DD", "market_cap": ...}`` per
        trading day, or ``None`` (after printing a message) when the data
        array cannot be found in the page.
    """
    #modify start date as needed; get later of current start date and start of 1998
    if datetime.strptime(start_date, "%B %d, %Y") <= datetime.strptime("1998-01-02", "%Y-%m-%d"):
        start_date = "January 2, 1998"
    #modify end date as needed for current SP 500 companies
    if index in [366, 377, 499]: end_date = "September 30, 2024"

    URL_dict = {366:"https://companiesmarketcap.com/paramount/marketcap/", 377:"https://companiesmarketcap.com/linde/marketcap/"
            ,499:"https://companiesmarketcap.com/sp-global/marketcap/"
            ,618:"https://companiesmarketcap.com/monsanto/marketcap/", 693:"https://companiesmarketcap.com/noble-corp/marketcap/"
            ,726:"https://companiesmarketcap.com/jcpenney/marketcap/", 732:"https://companiesmarketcap.com/sprint-corporation/marketcap/"
            ,766:"https://companiesmarketcap.com/national-semiconductor/marketcap/", 772:"https://companiesmarketcap.com/qwest-communications-international/marketcap/"
            ,792:"https://companiesmarketcap.com/sun-microsystems/marketcap/", 799:"https://companiesmarketcap.com/schering-plough/marketcap/"
            ,800:"https://companiesmarketcap.com/wyeth/marketcap/", 812:"https://companiesmarketcap.com/noble-corp/marketcap/"
            ,839:"https://companiesmarketcap.com/lehman-brothers/marketcap/", 852:"https://companiesmarketcap.com/bear-stearns/marketcap/"
            ,869:"https://companiesmarketcap.com/ncr-corporation/marketcap/", 871:"https://companiesmarketcap.com/first-data-corporation/marketcap/"
            ,893:"https://companiesmarketcap.com/bellsouth/marketcap/", 894:"https://companiesmarketcap.com/par-technology/marketcap/"
            ,897:"https://companiesmarketcap.com/lucent-technologies/marketcap/"
            ,907:"https://companiesmarketcap.com/gateway-inc/marketcap/", 929:"https://companiesmarketcap.com/att/marketcap/"
            ,935:"https://companiesmarketcap.com/nextel-communications/marketcap/", 940:"https://companiesmarketcap.com/veritas-technologies/marketcap/"
            ,964:"https://companiesmarketcap.com/pharmacia/marketcap/", 966:"https://companiesmarketcap.com/healthsouth/marketcap/"
            ,975:"https://companiesmarketcap.com/nortel-networks/marketcap/", 985:"https://companiesmarketcap.com/worldcom/marketcap/"
            ,987:"https://companiesmarketcap.com/compaq-computer/marketcap/", 996:"https://companiesmarketcap.com/enron/marketcap/"
            ,998:"https://companiesmarketcap.com/global-crossing/marketcap/"
            ,1027:"https://companiesmarketcap.com/seagram/marketcap/", 1056:"https://companiesmarketcap.com/warner-lambert/marketcap/"
            ,1058:"https://companiesmarketcap.com/mediaone-group/marketcap/", 1068:"https://companiesmarketcap.com/pharmacia-upjohn/marketcap/"
            ,1122:"https://companiesmarketcap.com/chrysler-corporation/marketcap/"
            }

    page_url = URL_dict[index]
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find("script",{"type": "text/javascript"})
    script_text = script_tag.string if script_tag is not None else None

    # Use regex to find the data variable
    match = re.search(r"data\s*=\s*(\[\{.*?\}\]);", script_text) if script_text else None
    if not match:
        print("Data not found: companiesmarketcap.com")
        return None

    data = json.loads(match.group(1))
    for point in data:
        point["market_cap"] = point.pop("m") * (10 ** 5) #should be 10 ** 6, but data is off by 10

    #get relevant market cap range to return data
    nyse = mcal.get_calendar('NYSE') # Create a calendar for the New York Stock Exchange
    market_open_days = nyse.valid_days(start_date=start_date, end_date=end_date) # Get the market open days within the specified range
    market_cap_data = []
    cursor = 0                 # position of the first point of the current pair
    last_pair = len(data) - 2  # the last two points are never left behind
    for trading_day in market_open_days:
        date = trading_day.strftime("%Y-%m-%d")
        current_unix_time = int(datetime.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S").timestamp())

        #get correct two date ranges from available data
        # The bounds are inclusive: a day that falls exactly on a data point
        # must keep that point instead of skipping past it.
        while cursor < last_pair:
            first_data, second_data = data[cursor], data[cursor + 1]
            before_pair = current_unix_time < first_data["d"] and current_unix_time < second_data["d"]
            inside_pair = first_data["d"] <= current_unix_time <= second_data["d"]
            if before_pair or inside_pair:
                break
            cursor += 1
        first_data, second_data = data[cursor], data[cursor + 1]

        #less than available date range
        if current_unix_time < first_data["d"] and current_unix_time < second_data["d"]:
            market_cap_data.append({"date":date, "market_cap":round(first_data["market_cap"], 2)})
        #between chosen date ranges
        elif first_data["d"] <= current_unix_time <= second_data["d"]:
            #value between 0-1 that tells where you are between the two chosen dates
            span = second_data["d"] - first_data["d"]
            x = (current_unix_time - first_data["d"]) / span if span else 0
            market_cap_range = second_data["market_cap"] - first_data["market_cap"]
            market_cap = round((market_cap_range * x) + first_data["market_cap"], 2)
            market_cap_data.append({"date":date, "market_cap":market_cap})
        #larger than avaiable date range
        elif current_unix_time > first_data["d"] and current_unix_time > second_data["d"]:
            market_cap_data.append({"date":date, "market_cap":round(second_data["market_cap"], 2)})
        else:
            print("Unknown case")

    marketcap_metadata["source"] = "companiesmarketcap.com"
    return market_cap_data


#about 25 uses
def get_kibot_market_cap_data(index, ticker, start_date, end_date, marketcap_metadata):
    """Build market-cap data from Kibot daily prices and a known final market cap.

    Daily rows (Date, Open, High, Low, Close, Volume) are downloaded for the
    date range and every close is scaled by
    ``final market cap / close of the last row``; the table value is
    multiplied by 1e9 in the output.

    Args:
        index: Company index; must be a key of the final market-cap table.
        ticker: Ticker symbol to request (index 855 is requested as "HET").
        start_date: Start date as ``"%B %d, %Y"`` text; dates before
            1998-01-02 are replaced by that date.
        end_date: End date as ``"%B %d, %Y"`` text.
        marketcap_metadata: Dict updated in place with the ``"source"``.

    Returns:
        list[dict]: ``{"date": ..., "market_cap": ...}`` in the order of the
        downloaded rows; the date is rebuilt as year-month-day from the
        month/day/year text Kibot sends.
    """
    marketcap_metadata["source"] = "kibot"

    if index == 855: ticker = "HET" #edge case; changed ticker symbol later; HET -> CZR (Caesar's Entertainment)

    #need to fix start and end date to right format: year-month-day (xxxx-xx-xx)
    earliest_start = datetime.strptime("1998-01-02", "%Y-%m-%d")
    start_date = max(datetime.strptime(start_date, "%B %d, %Y"), earliest_start).date().isoformat()
    end_date = datetime.strptime(end_date, "%B %d, %Y").date().isoformat()

    # Log in first; only the request itself matters, the response is not used.
    # NOTE(review): the credentials travel in the query string of a plain http URL.
    requests.get("http://api.kibot.com/?action=login&user=" + username + "&password=" + password)

    headers = ["Date", "Open", "High", "Low", "Close", "Volume"]
    kibot_request = requests.get("http://api.kibot.com/?action=history&symbol=" + str(ticker) + "&interval=daily"
                                 + "&startdate=" + str(start_date) + "&enddate=" + str(end_date))

    # One comma-separated row per line; blank lines carry no data and are skipped.
    stock_price_data = [dict(zip(headers, line.split(',')))
                        for line in kibot_request.text.splitlines() if line.strip()]

    # Known market cap at the end of each company's range, keyed by index.
    ending_market_cap_dict = {620:6.11, 646:9, 648:7.8, 656:4, 657:1.6, 658:13.06, 664:14.5, 683:16.34
                            , 711:3.65, 725:6.93, 735:23.41, 743:12.05, 788:3.2
                            , 804:0.294, 809:1.05, 853:0.69, 854:7.03, 855:17.4, 857:1.82
                            , 926:0.8, 967:0.23
                            , 1041:5.69, 1071:0.63
                            , 1149:2.52}
    marketcap_price_ratio = ending_market_cap_dict[index] / float(stock_price_data[-1]["Close"])

    market_cap_data = []
    for row in stock_price_data:
        month, day, year = row["Date"].split("/")[:3]
        market_cap = round(float(row["Close"]) * marketcap_price_ratio * (10 ** 9), 2)
        market_cap_data.append({"date": year + "-" + month + "-" + day, "market_cap": market_cap})
    return market_cap_data


def get_finchat_market_cap_data(index, ticker, start_date, end_date, marketcap_metadata, s3):
    """Build per-trading-day market caps from a finchat.io chart image (or fixed values).

    For the indexes 1015, 1148 and 1152 a short hard-coded list of values is
    assigned to the trading days of the range. For every other index the chart
    image ``finchat_market_cap_images/<index>_<ticker>.png`` is read from S3,
    converted into ``[unix_time, market_cap]`` reference points by one of the
    two image processors, and interpolated linearly for each NYSE trading day
    (taken at 16:00). Values are multiplied by 1e9 in the output.

    A trading day outside the reference points (before the first one, or after
    only the last one is left) takes the nearest end point's value when it is
    at most a week away; otherwise a message is printed and the day is skipped.

    Args:
        index: Company index; selects edge-case dates, the hard-coded values
            and the chart's market-cap range.
        ticker: Ticker symbol; part of the S3 key.
        start_date: Start date as ``"%B %d, %Y"`` text; dates on or before
            1998-01-02 are replaced by that date.
        end_date: End date text, or ``None`` for "September 30, 2024".
        marketcap_metadata: Dict updated in place with ``"image_type"`` and
            ``"source"``.
        s3: Client object exposing ``get_object(Bucket=..., Key=...)``.

    Returns:
        list[dict]: ``{"date": "YYYY-MM-DD", "market_cap": ...}`` records.
    """
    #modify start date as needed; get later of current start date and start of 1998
    if datetime.strptime(start_date, "%B %d, %Y") <= datetime.strptime("1998-01-02", "%Y-%m-%d"):
        start_date = "January 2, 1998"
    if end_date is None: end_date = "September 30, 2024"

    #edge cases
    if index == 956: start_date = "March 13, 2001" #limited data
    if index == 989: start_date = "March 18, 1999" #limited data
    if index == 1084: end_date = "November 17, 1999" #more data than needed
    if index == 1086: end_date = "December 2, 1999" #more data than needed

    #these will be quickly processed instead of normal function
    hardcoded_market_caps = {
        #for CEN, 1015, 13 days of relevant data; needed
        1015: [2.48, 2.26, 2.11, 2.04, 2.19, 2.22, 2.18, 1.89, 1.82, 2.01, 1.97, 1.95, 2.05]
        #for ITT, 1148
        ,1148: [3.69, 3.71, 3.68, 3.62, 3.66, 3.37, 3.62, 3.59, 3.66, 3.66, 3.67, 3.69, 3.62, 3.67, 3.68, 3.56, 3.55, 3.67, 3.74, 3.67
               ,3.81, 3.73, 3.74, 3.79, 3.78, 3.86, 3.92, 3.86, 3.81, 3.78, 3.83, 3.83, 3.83, 3.91]
        #for BBI, 1152
        ,1152: [13.92, 14.18, 14.07, 14.07, 13.96, 13.43]
    }
    if index in hardcoded_market_caps:
        if index == 1015:
            start_date = "March 14, 2001"
        market_caps = hardcoded_market_caps[index]
        nyse = mcal.get_calendar('NYSE') # Create a calendar for the New York Stock Exchange
        market_open_days = nyse.valid_days(start_date=start_date, end_date=end_date) # Get the market open days within the specified range
        # One hard-coded value per trading day, in order.
        market_cap_data = [{"date": trading_day.strftime("%Y-%m-%d")
                            ,"market_cap": round(market_caps[i] * (10 ** 9), 2)}
                           for i, trading_day in enumerate(market_open_days)]
        marketcap_metadata["image_type"] = "screenshot"
        marketcap_metadata["source"] = "finchat.io"
        return market_cap_data

    image_file = "finchat_market_cap_images/" + str(index) + "_" + ticker + ".png"
    # bucket_name is not defined in this function; it has to exist at module level.
    response = s3.get_object(Bucket=bucket_name, Key=image_file) # Get the object from S3
    # Read the image content and convert it to a NumPy array
    image_content = response['Body'].read()
    image_np_array = np.frombuffer(image_content, np.uint8)
    image = cv2.imdecode(image_np_array, cv2.IMREAD_COLOR) # Decode the image with OpenCV

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    length, height = gray_image.shape[1], gray_image.shape[0]
    min_market_cap, max_market_cap =  get_finchat_market_cap_range(index)

    #process the image data to get pixel data scaled properly(x-coord scales to unix time and y-coord scales to market_caps)
    if (length == 4000 and height == 1600) or index in [784]:
        marketcap_metadata["image_type"] = "download"
        gray_image = gray_image[130:-209, 20:-50] #basic cropping of image
        length, height = gray_image.shape[1], gray_image.shape[0] #get new dimensions
        reference_list = process_finchat_image_download(gray_image, length, height, start_date, end_date, min_market_cap, max_market_cap)
    else:
        marketcap_metadata["image_type"] = "screenshot"
        reference_list = process_finchat_image_screenshot(gray_image, length, height, start_date, end_date, min_market_cap, max_market_cap)

    nyse = mcal.get_calendar('NYSE') # Create a calendar for the New York Stock Exchange
    market_open_days = nyse.valid_days(start_date=start_date, end_date=end_date) # Get the market open days within the specified range
    market_cap_data = []
    stock_price_list = reference_list.copy() #consumed from the front as the days advance
    one_week = 86400 * 7
    for trading_day in market_open_days:
        date = trading_day.strftime("%Y-%m-%d")
        unix_time = int(datetime.strptime(date + " 16:00:00", "%Y-%m-%d %H:%M:%S").timestamp())

        while stock_price_list:
            first_point = stock_price_list[0]
            if len(stock_price_list) == 1 or unix_time < first_point[0]:
                # Outside the sampled range: either only the last point is left
                # or the day precedes the first point. Points are kept in the
                # second case so that later days can still be interpolated.
                if abs(unix_time - first_point[0]) <= one_week: #at most a week of time difference
                    market_cap_data.append({"date":date, "market_cap":round(first_point[1] * 1000000000, 2)})
                elif len(stock_price_list) == 1:
                    print("Last date is more than a day apart: " + date)
                else:
                    print("First date is more than a week before the chart data: " + date)
                break

            second_point = stock_price_list[1]
            if first_point[0] <= unix_time <= second_point[0]:
                ratio = (second_point[1] - first_point[1]) / (second_point[0] - first_point[0])
                market_cap = (unix_time - first_point[0]) * ratio + first_point[1]
                market_cap_data.append({"date":date, "market_cap":round(market_cap * 1000000000, 2)})
                break
            stock_price_list.pop(0) #day is past this pair; move on to the next one
    marketcap_metadata["source"] = "finchat.io"
    return market_cap_data


def process_finchat_image_download(gray_image, length, height, start_date, end_date, min_market_cap, max_market_cap):
    """Convert a downloaded (already cropped) finchat.io chart into reference points.

    The chart line is taken to be the pixels with gray value 105. The first
    run of consecutive columns containing such a pixel is the line; for each
    of its columns the top-most line pixel is used. Columns are scaled
    linearly to unix time between ``start_date`` and ``end_date`` (both at
    16:00), rows to the range ``min_market_cap``..``max_market_cap``.

    Args:
        gray_image: 2-D numpy array of gray values, indexed ``[y, x]``.
        length: Number of columns to scan.
        height: Number of rows to scan.
        start_date: Date of the left end of the line, ``"%B %d, %Y"`` text.
        end_date: Date of the right end of the line, ``"%B %d, %Y"`` text.
        min_market_cap: Market cap at the bottom of the image.
        max_market_cap: Market cap at the top of the image.

    Returns:
        list[list]: ``[unix_time, market_cap]`` pairs, one per line column
        except the last one; an empty list when no usable line is found or
        the line stops 100 or more columns before the right edge.
    """
    line_color = 105 #gray value of the chart line
    line_mask = gray_image[:height, :length] == line_color
    has_line = line_mask.any(axis=0)         #per column: does it contain a line pixel
    first_line_row = line_mask.argmax(axis=0) #per column: top-most line pixel (0 when there is none)

    #get the range: start and ending x-coord locations of the stock line
    columns_with_line = np.flatnonzero(has_line)
    if columns_with_line.size == 0:
        print("No line found in image")
        return []
    x_start = int(columns_with_line[0])
    gap_columns = np.flatnonzero(~has_line[x_start:])
    if gap_columns.size:
        x_end = x_start + int(gap_columns[0]) - 1 #last column before the first gap
        if length - x_end >= 100: #check if line detection ends too early
            print("Line ended too early?")
            return []
    else:
        x_end = length - 1 #the line runs up to the right edge of the image
    if x_end == x_start:
        # A one-column line has no width to spread the date range over.
        print("Line is only one pixel wide")
        return []

    #scale the pixel coordinates properly to dates(x-axis) and the marketcap range(y-axis)
    unix_added_date = int(datetime.strptime(start_date + " 16:00:00", "%B %d, %Y %H:%M:%S").timestamp())
    unix_removed_date = int(datetime.strptime(end_date + " 16:00:00", "%B %d, %Y %H:%M:%S").timestamp())
    x_slope = (unix_removed_date - unix_added_date) / (x_end - x_start) #new axis
    y_slope = (max_market_cap - min_market_cap) / height

    reference_list = []
    for column in range(x_start, x_end): #x_end itself is not included
        rows_from_bottom = height - int(first_line_row[column]) #need to reverse coordinates
        x_coord = x_slope * (column - x_start) + unix_added_date
        y_coord = (y_slope * rows_from_bottom) + min_market_cap
        reference_list.append([x_coord, y_coord])

    return reference_list


def process_finchat_image_screenshot(gray_image, length, height, start_date, end_date, min_market_cap, max_market_cap):
    """Convert a screenshotted finchat.io chart into [unix_time, market_cap] points.

    Steps, in order:
      1. Scan the rows from 60% of the image height downwards, recording the
         x positions of pure-white (255) pixels per row, and stop at the first
         row that is more than 75% white.
      2. Derive the lower, left, top and right boundaries of the plot area
         from those white pixels.
      3. Print a warning for every boundary that is outside its expected range.
      4. If there is no run of 16 white pixels left of the plot at both the top
         and the bottom boundary, widen min/max market cap using the spacing
         of the runs that are found.
      5. Crop to the plot area and, for every column, take the first pixel
         from the top whose gray value is 170 or 132.
      6. Scale columns linearly to unix time between start_date and end_date
         (both at 16:00) and rows to the (possibly adjusted) market-cap range.

    Args:
        gray_image: 2-D array of gray values, indexed [y, x] / [y][x].
        length: Image width in pixels.
        height: Image height in pixels.
        start_date: Date of the left edge of the plot, "%B %d, %Y" text.
        end_date: Date of the right edge of the plot, "%B %d, %Y" text.
        min_market_cap: Market cap of the lowest axis label.
        max_market_cap: Market cap of the highest axis label.

    Returns:
        list[list]: [unix_time, market_cap] pairs, one per column in which a
        line pixel was found, in the same units as min/max market cap.
    """
    #get boundaries for graph, excluding pixels containing the x-axis and y-axis
    lower_boundary, left_boundary, right_boundary, top_boundary = None, None, None, None

    white_frequency = {} #per row y: number of white pixels minus one (the first white pixel sets the count to 0)
    white_occurences = {} #per row y: x positions of every white pixel, left to right {y_0:[x_0, x_1, ...], y_1...}
    for y in range(round(height*0.6),height):
        for x in range(length):
            color_value = gray_image[y, x]
            if color_value == 255:
                if white_frequency.get(y) is None:
                    white_frequency[y] = 0
                    white_occurences[y] = [x]
                else:
                    white_frequency[y] += 1
                    white_occurences[y].append(x)
        if white_frequency.get(y) is not None and white_frequency[y]/length > 0.75: #found the x-axis here; row is more than 75% white pixels
            break

    # Rows that contained at least one white pixel, in scan order (top to bottom).
    y_values = list(white_occurences.keys())
    #the last y_value should be the top layer/y-value of the x-axis
    #the second-to-last y_value shoud have values of the x-axis where the axis intersect
        #also the lower_boundary, excluding the x-axis
    lower_boundary = y_values[-2] #the pixels above the x-axis; y-value
    left_boundary = white_occurences[y_values[-2]][-1] + 1 #the pixels after the y-axis; x-value (one past the last white pixel of that row)
    # Walk up the column just left of the plot until the white pixels stop;
    # the row below the first non-white pixel is the top boundary.
    for y in range(lower_boundary, -1, -1):
        color = gray_image[y][left_boundary - 1]
        if color != 255:
            top_boundary = y + 1
            break
    # NOTE(review): this lookup assumes row lower_boundary + 1 is the x-axis row
    # recorded above (i.e. y_values[-1] == lower_boundary + 1) - confirm.
    for x in reversed(white_occurences[lower_boundary + 1]): #look at pixels on line directly above x-axis in reverse
        color = gray_image[lower_boundary][x]
        if color != 45: #45 is presumably the gray value of the area right of the plot - TODO confirm
            right_boundary = x
            break
    
    #perform basic checks for boundary locations
    # NOTE(review): top_boundary / right_boundary are still None here when the
    # loops above never hit their break; the comparisons below would then fail.
    if (left_boundary < (0.1*length)) == False: print("Possible issue with left boundary: " + str(left_boundary))
    if (right_boundary > (0.9*length)) == False: print("Possible issue with right boundary: " + str(right_boundary))
    if (200<top_boundary<400) == False: print("Possible issue with upper boundary: " + str(top_boundary))
    if (600<lower_boundary<900) == False: print("Possible issue with lower boundary: " + str(lower_boundary))


    #double check y-axis for right market caps based on ticker locations
    # A "ticker" (presumably an axis tick mark) is detected as a run of 16 white
    # pixels directly left of the plot area.
    top = all([gray_image[top_boundary][x] == 255 for x in range(left_boundary - 16, left_boundary)])
    bottom = all([gray_image[lower_boundary+1][x] == 255 for x in range(left_boundary - 16, left_boundary)])
    #if tickers are not on top and/or bottom, adjustments are needed
    if (top and bottom) == False:
    #collect the y positions of all tickers between the top and lower boundary to determine the ticker interval length
        num_tickers = 0
        last_ticker_y_coord = None
        ticker_y_locations = []
        for y in range(top_boundary, lower_boundary + 5):
            #ticker width can be up to 5(assumed); ignore the succeeding y-coords after finding a ticker
            if last_ticker_y_coord != None and y - last_ticker_y_coord < 5: continue
            ticker_check = all([gray_image[y][x] == 255 for x in range(left_boundary - 16, left_boundary)])
            if ticker_check:
                num_tickers += 1
                last_ticker_y_coord = y
                ticker_y_locations.append(y)
        ticker_distances = [ticker_y_locations[i+1] - ticker_y_locations[i] for i in range(len(ticker_y_locations) - 1)]
        # NOTE(review): with fewer than two tickers found, ticker_distances is
        # empty and the two divisions below divide by zero.
        average_ticker_interval = sum(ticker_distances) / len(ticker_distances) #tickers can have varying pixel distances of 1-5
        market_cap_ratio = (max_market_cap - min_market_cap) / len(ticker_distances) #market cap amount per ticker interval
        if top == False: #adjust upper market cap
            x = abs(top_boundary - ticker_y_locations[0]) / average_ticker_interval #distance relative to a full ticker interval
            max_market_cap = max_market_cap + (x * market_cap_ratio)
        if bottom == False: #adjust lower market cap
            x = abs(lower_boundary - ticker_y_locations[-1]) / average_ticker_interval #distance relative to a full ticker interval
            min_market_cap = min_market_cap - (x * market_cap_ratio)
    # print("Market caps: " + str((min_market_cap, max_market_cap)))


    #with calculated boundaries, crop the image and do final processing to get reference list
    cropped_image = gray_image[top_boundary:lower_boundary, left_boundary:right_boundary]
    length = cropped_image.shape[1]
    height = cropped_image.shape[0]

    # For every column keep only the first line-coloured pixel from the top;
    # columns without such a pixel contribute no point.
    pixel_coordinates = []
    for x in range(length):
        for y in range(height):
            if cropped_image[y,x] in [170, 132]: #is the value of green and red after being gray-scaled respectively
                pixel_coordinates.append([x,height - y]) #need to reverse y-value since it is read top-down
                break

    reference_list = []

    # Linear scaling: columns -> unix time over the cropped width,
    # rows (counted from the bottom) -> market cap over the cropped height.
    unix_added_date = int(datetime.strptime(start_date + " 16:00:00", "%B %d, %Y %H:%M:%S").timestamp())
    unix_removed_date = int(datetime.strptime(end_date + " 16:00:00", "%B %d, %Y %H:%M:%S").timestamp())
    x_slope = (unix_removed_date - unix_added_date) / length #new axis
    y_slope = (max_market_cap - min_market_cap) / height
    for i in range(len(pixel_coordinates)):
        temp_x_coord = pixel_coordinates[i][0]
        x_coord = x_slope * temp_x_coord + unix_added_date
        temp_y_coord = pixel_coordinates[i][1]
        y_coord = (y_slope * temp_y_coord) + min_market_cap
        reference_list.append([x_coord,y_coord])
    return reference_list


#return the min, max of the marketcap range in the finchat image 
def get_finchat_market_cap_range(index):
    """Look up the hand-recorded market cap axis range of a finchat chart image.

    Two tables are kept, one for downloaded images and one for screenshotted
    images; the download table is consulted first.

    index: dataset index of the company whose finchat image is being read
    returns: (min, max) tuple of the chart's market cap range, or None (after
             printing an error) when neither table has an entry for index
    """
    #store min and max of y-axis
    #will make code to double-check/adjust for screenshotted data
    downloaded_ranges = {
        545: (15, 60), 670: (3, 12), 680: (0, 45),
        #700s, fixed 767 image
        721: (1, 14), 727: (2, 26), 737: (0, 11), 743: (7, 14), 754: (5, 13), 762: (0, 24),
        765: (3.75, 6.5), 767: (0, 14), 769: (0, 8), 771: (2, 24), 782: (2, 18), 784: (3, 14),
        #800s
        806: (3, 11), 861: (0, 4.5), 867: (6, 15), 868: (0, 35), 883: (1, 8),
        #900s
        901: (3, 12), 903: (3, 11), 904: (0.25, 3.75), 905: (2, 26), 910: (3, 6.5), 911: (1.5, 5),
        914: (6, 30), 915: (1, 13), 916: (2, 10), 917: (0, 7), 918: (0, 40), 919: (5, 8.5),
        922: (0, 55), 923: (0, 4), 925: (8, 40), 927: (1, 14), 931: (0, 20), 932: (20, 75),
        933: (4, 17), 937: (5, 19), 939: (0.75, 3.5), 941: (4, 26), 943: (2, 16), 947: (2, 22),
        948: (2, 16), 949: (5, 55), 950: (3, 11), 952: (3, 7.5), 953: (20, 80), 955: (7, 15),
        956: (0, 30), 958: (15, 55), 960: (2, 20), 962: (0.75, 4), 968: (0.5, 5), 969: (3, 9),
        973: (0, 40), 982: (8, 18), 989: (1.9, 3.1), 990: (2, 5), 993: (0.75, 3.25), 994: (4, 13),
        995: (0.5, 2.75), 997: (2.5, 6.5), 999: (22, 42),
        #1000s, fixed 1078
        1000: (3, 9), 1007: (5, 14), 1011: (2, 14), 1016: (2.5, 6.5), 1017: (3, 10), 1019: (4, 22),
        1022: (0.3, 1.2), 1023: (0.3, 1.3), 1025: (6, 15), 1032: (3.8, 5.6), 1033: (3, 12), 1034: (2, 20),
        1036: (4, 13), 1037: (3.5, 6), 1038: (0.7, 1.8), 1039: (6, 10.5), 1040: (1.2, 3.4), 1042: (10, 21),
        1043: (2.6, 4.6), 1044: (0.7, 1.7), 1046: (2.5, 5.75), 1049: (1, 7), 1052: (40, 80), 1057: (2, 8),
        1060: (2.5, 6.5), 1063: (1.75, 5.25), 1064: (0.55, 1.05), 1065: (2.25, 5.25), 1066: (10, 55),
        1067: (16, 34), 1073: (3.75, 6.75), 1075: (3.5, 8.5), 1078: (45, 90), 1080: (1.4, 3.2),
        1081: (1, 5.5), 1084: (0.4, 1.3), 1085: (2.5, 5.25), 1086: (0.7, 2), 1087: (0.3, 1.3),
        1088: (40, 90), 1089: (4, 11), 1090: (7, 18), 1091: (0, 2.75), 1092: (3, 11), 1094: (1.5, 4),
        1095: (0.2, 1.8), 1097: (4, 8), 1098: (5.5, 10),
        #1100s, fixed 1100, 1146
        1100: (15, 70), 1103: (2.5, 5.5), 1105: (4, 14), 1106: (0.7, 1.6), 1107: (5, 11), 1108: (2, 6.5),
        1109: (0.4, 2.2), 1110: (5, 13), 1111: (3, 6), 1113: (1, 3), 1114: (8, 17), 1117: (36, 60),
        1119: (14, 21), 1120: (0.25, 0.8), 1125: (1.8, 4), 1126: (17, 29), 1127: (4, 10), 1128: (4, 10),
        1129: (1.4, 2.7), 1130: (28, 52), 1131: (4.5, 8.5), 1133: (2.1, 3), 1136: (1.9, 2.7),
        1137: (2.1, 3.4), 1138: (3, 9), 1140: (2, 6.5), 1142: (4.5, 9.5), 1146: (1.52, 1.72),
        1147: (0.43, 0.7),
    }
    screenshotted_ranges = {
        117: (0, 40), 286: (0, 40), 586: (0, 40), 595: (5, 20), 609: (0, 40), 612: (0, 15),
        649: (5, 25), 652: (10, 30), 659: (0, 250), 660: (0, 140), 662: (2, 7), 679: (0, 70),
        691: (2, 14), 695: (0, 70),
        #700s
        714: (0, 35), 716: (0, 30), 718: (2, 16), 731: (5, 20), 733: (0, 20), 744: (2, 14),
        747: (0, 20), 749: (0, 15), 750: (5, 20), 752: (0, 10), 753: (0, 40), 756: (10, 35),
        757: (5, 20), 758: (0, 15), 759: (0, 35), 761: (0, 8), 770: (0, 15), 774: (0, 12),
        775: (2, 12), 776: (0.5, 3), 780: (2, 12), 781: (1, 5), 786: (2, 16), 787: (2, 8),
        790: (10, 40), 791: (3, 9), 797: (0, 15),
        #800s
        801: (2, 12), 803: (0, 10), 810: (4, 16), 817: (2, 14), 818: (2, 12), 819: (20, 100),
        820: (0, 120), 821: (5, 30), 822: (4, 8), 824: (1, 7), 826: (0, 40), 827: (20, 55),
        828: (1, 5), 835: (10, 20), 838: (2, 9), 842: (5, 40), 847: (5, 30), 848: (1, 4.5),
        850: (0, 10), 859: (2, 6), 860: (10, 30), 862: (1, 5), 863: (1, 5), 864: (0, 8),
        866: (0, 40), 872: (5, 11), 873: (4, 8), 874: (2, 14), 876: (10, 25), 877: (0, 7),
        879: (0, 40), 880: (5, 20), 881: (4, 16), 884: (4, 14), 885: (15, 30), 886: (0, 30),
        887: (4, 7), 888: (1, 2), 889: (2, 10), 890: (10, 22), 891: (0, 20), 892: (2, 8),
        895: (1, 4), 898: (4, 16),
        #900s
        900: (1, 5), 912: (10, 25), 921: (0, 15), 924: (2, 14), 930: (0, 14), 936: (4, 12),
        938: (2, 10), 942: (0, 8), 951: (0.5, 4), 954: (0.5, 4), 963: (0, 2.5), 965: (10, 35),
        971: (12, 20), 983: (0, 14), 988: (2, 7),
        #1000s
        1001: (20, 70), 1003: (10, 25), 1006: (2, 10), 1009: (0.6, 1.8), 1010: (2.5, 5), 1013: (1, 7),
        1014: (0.6, 1.8), 1018: (4, 10), 1021: (1.5, 5), 1045: (0.6, 1.6), 1047: (5, 40), 1050: (4, 12),
        1054: (1, 5), 1059: (3, 7), 1061: (1, 2.5), 1069: (0.5, 2), 1079: (5, 9), 1082: (1.5, 5),
        1083: (2, 9), 1093: (0.015, 0.05),
        #1100s
        1101: (40, 120), 1102: (5, 12), 1118: (1.5, 3.5), 1121: (1, 3), 1123: (40, 90), 1132: (2, 4.5),
        1134: (3, 5.5), 1135: (7, 14), 1139: (0.8, 1.6), 1141: (2.8, 3.8), 1143: (14, 22),
        1144: (2.2, 3.2), 1145: (0.4, 0.6), 1150: (1.8, 2.3),
    }

    #downloaded images take priority over screenshotted ones
    for range_table in (downloaded_ranges, screenshotted_ranges):
        market_cap_range = range_table.get(index)
        if market_cap_range is not None:
            return market_cap_range

    print("Error: Market cap range not found for finchat image: " + str(index))
    return None




#### Main Functions

In [22]:
def get_company_data(tiingo_ticker, company_profile, company_name, meta_data_list, original_ticker, index, s3):
    """Fill in the profile of one company and store it in S3 as JSON.

    Hand-maintained edge-case metadata is used when it exists for the company;
    otherwise Tiingo is tried first and FMP is the fallback. An exchange other
    than NYSE/NASDAQ is replaced by the edge-case exchange lookup.

    tiingo_ticker: ticker used for the Tiingo requests
    company_profile: dict that is filled in place
    company_name: passed to the Tiingo/FMP lookups
    meta_data_list: passed to the Tiingo metadata lookup
    original_ticker: ticker as it appears in the dataset; used for FMP, messages and the S3 key
    index: dataset index of the company
    s3: S3 client used for the upload

    returns: the same company_profile dict that was passed in
    """
    accepted_exchanges = ["NYSE","NASDAQ"]

    edge_case_metadata = get_company_metadata_for_edge_cases(original_ticker,index)
    if edge_case_metadata is None:
        #only ask tiingo for metadata when the regular tiingo lookup succeeded
        got_tiingo_metadata = False
        if get_tiingo_company_regular_data(tiingo_ticker, company_name, company_profile) == True:
            got_tiingo_metadata = get_tiingo_company_metadata(tiingo_ticker, company_profile, meta_data_list)
        #fall back to fmp when tiingo could not provide the metadata
        if got_tiingo_metadata == False:
            get_fmp_metadata(original_ticker, company_name, company_profile, index)
    else:
        company_profile.update(edge_case_metadata)

    if company_profile.get("exchange") not in accepted_exchanges:
        company_profile["exchange"] = get_stock_exchange_for_edge_cases(index)

    #report whatever is still missing after all sources were tried
    sector_missing = company_profile.get("sector") is None
    industry_missing = company_profile.get("industry") is None
    if sector_missing or industry_missing:
        print("Missing sector and industry in profile data: " + original_ticker)
    if company_profile.get("exchange") not in accepted_exchanges:
        print("Missing exchange(NYSE, NASDAQ) in profile data: " + original_ticker)

    #store the profile as company_profiles/<index>_<ticker>.json
    s3.put_object(
        Bucket=bucket_name,
        Key="company_profiles/" + str(index) + "_" + original_ticker + ".json",
        Body=json.dumps(company_profile, indent=4),
    )
    return company_profile




In [16]:
def get_market_cap_data(ticker, original_ticker, index, added_date, removal_date, company_name, s3):
    """Fetch the daily market cap series of one company, audit its coverage and store it in S3.

    The data source is picked by index: a special case for 728, finchat.io,
    companiesmarketcap.com or kibot.com for the hand-listed indexes, and
    Tiingo/FMP (whichever covers more days) for everything else. Indexes 89, 90
    and 162 additionally get extra data prepended. The series is written to
    company_market_cap_data/<index>_<original_ticker>.json and a description of
    its origin and coverage to market_cap_metadata/<index>_<original_ticker>.json.

    ticker: ticker used for the Tiingo requests
    original_ticker: ticker as it appears in the dataset; used for FMP and the S3 keys
    index: dataset index of the company
    added_date: date string such as "January 21, 2012"
    removal_date: date string in the same format, or None when the company was not removed
    company_name: passed to the Tiingo/FMP lookups
    s3: S3 client used for the uploads

    returns: list of daily records (dicts with a "date" key in YYYY-MM-DD form)
    """
    long_date_format = "%B %d, %Y" #format: January 21, 2012
    iso_date_format = "%Y-%m-%d" #format: 2006-01-31
    earliest_date = datetime.strptime("January 2, 1998", long_date_format) #no data is needed before this day
    latest_date = datetime.strptime("September 30, 2024", long_date_format) #end date when there is no removal date

    marketcap_metadata = {"index":index,"ticker":original_ticker} #used to store where we get our data source
    market_cap_data = []

    #the string arguments are left untouched because the finchat/companiesmarketcap/kibot helpers receive them as strings
    added_datetime = datetime.strptime(added_date, long_date_format)
    removal_datetime = latest_date if removal_date is None else datetime.strptime(removal_date, long_date_format)
    min_date_needed = max(added_datetime, earliest_date)

    #need to double-check that first day and last_day are valid (not a weekend or holiday); fix if needed
    nyse = mcal.get_calendar('NYSE')
    valid_trading_days = nyse.valid_days(start_date=min_date_needed.strftime(iso_date_format),
                                         end_date=removal_datetime.strftime(iso_date_format))
    first_day_needed = str(valid_trading_days[0].date())
    last_day_needed = str(valid_trading_days[-1].date())

    finchat_download_list = [545, 670, 680, 721, 727, 737, 743, 754, 762, 765, 767, 769, 771, 782, 784, 806, 861, 867, 868, 883, 901, 903, 904, 905, 910, 911, 914, 915, 916, 917, 918, 919, 922, 923, 925, 927, 931, 932, 933, 937, 939, 941, 943, 947, 948, 949, 950, 952, 953, 955, 956, 958, 960, 962, 968, 969, 973, 982, 989, 990, 993, 994, 995, 997, 999, 1000, 1007, 1011, 1016, 1017, 1019, 1022, 1023, 1025, 1032, 1033, 1034, 1036, 1037, 1038, 1039, 1040, 1042, 1043, 1044, 1046, 1049, 1052, 1057, 1060, 1063, 1064, 1065, 1066, 1067, 1073, 1075, 1078, 1080, 1081, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1094, 1095, 1097, 1098, 1100, 1103, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1113, 1114, 1117, 1119, 1120, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1133, 1136, 1137, 1138, 1140, 1142, 1146, 1147]
    finchat_screenshot_list = [117, 286, 586, 595, 609, 612, 649, 652, 659, 660, 662, 679, 691, 695, 714, 716, 718, 731, 733, 744, 747, 749, 750, 752, 753, 756, 757, 758, 759, 761, 770, 774, 775, 776, 780, 781, 786, 787, 790, 791, 797, 801, 803, 810, 817, 818, 819, 820, 821, 822, 824, 826, 827, 828, 835, 838, 842, 847, 848, 850, 859, 860, 862, 863, 864, 866, 872, 873, 874, 876, 877, 879, 880, 881, 884, 885, 886, 887, 888, 889, 890, 891, 892, 895, 898, 900, 912, 921, 924, 930, 936, 938, 942, 951, 954, 963, 965, 971, 983, 988, 1001, 1003, 1006, 1009, 1010, 1013, 1014, 1018, 1021, 1045, 1047, 1050, 1054, 1059, 1061, 1069, 1079, 1082, 1083, 1093, 1101, 1102, 1118, 1121, 1123, 1132, 1134, 1135, 1139, 1141, 1143, 1144, 1145, 1150]
    companiesmarketcap_list = [366, 377, 499, 618, 693, 726, 732, 766, 772, 792, 799, 800, 812, 839, 852, 869, 871, 893, 894, 897, 907, 929, 935, 940, 964, 966, 975, 985, 987, 996, 998, 1027, 1056, 1058, 1068, 1122]
    #NOTE(review): 743 is listed both here and in finchat_download_list; finchat is checked first,
    #so the kibot branch is never taken for 743 - confirm which source is intended
    kibot_list = [620, 646, 648, 656, 657, 658, 664, 683, 711, 725, 735, 743, 788, 804, 809, 853, 854, 855, 857, 926, 967, 1041, 1071, 1149]

    #edge case: 728 DELL
    if index == 728:
        market_cap_data = get_misc_market_cap_data(index, original_ticker, marketcap_metadata, s3)
    #finchat.io
    elif index in ([1015, 1148, 1152] + finchat_download_list + finchat_screenshot_list):
        market_cap_data = get_finchat_market_cap_data(index, original_ticker, added_date, removal_date, marketcap_metadata, s3)
    #companiesmarketcap.com
    elif index in companiesmarketcap_list:
        market_cap_data = get_companiesmarketcap_market_cap_data(index, added_date, removal_date, marketcap_metadata)
    #kibot.com
    elif index in kibot_list:
        market_cap_data = get_kibot_market_cap_data(index,original_ticker,added_date,removal_date, marketcap_metadata)
    #tiingo and fmp
    else:
        def keep_needed_days(daily_records):
            #filter out NULL market caps and dates outside of [min_date_needed, removal date]
            return [record for record in daily_records
                    if record["market_cap"] is not None and
                    min_date_needed <= datetime.strptime(record["date"], iso_date_format) <= removal_datetime]

        tiingo_market_cap_data = []
        fmp_market_cap_data = []
        #get market cap data from tiingo
        if get_tiingo_company_regular_data(ticker,company_name,{}):
            tiingo_market_cap_data = keep_needed_days(
                get_tiingo_market_cap_data(ticker, added_datetime, marketcap_metadata))
        #only the tiingo helper receives marketcap_metadata, so this value cannot change further below
        days_to_calculate = marketcap_metadata.get("num_trading_days_to_calculate")
        #get fmp market cap data if tiingo data is empty or starts after "Added Date"
        if len(tiingo_market_cap_data) == 0 or days_to_calculate not in [None,0]:
            if get_fmp_metadata(original_ticker,company_name,{},index):
                fmp_market_cap_data = keep_needed_days(get_fmp_market_cap_data(original_ticker,index))

        #decide whether to use fmp or tiingo data
        if days_to_calculate: #consider fmp, if tiingo has too many calculations done
            tiingo_day_count = len(tiingo_market_cap_data) - days_to_calculate
            tiingo_has_more_data = (len(fmp_market_cap_data) < tiingo_day_count
                                    or len(fmp_market_cap_data) == 0)
        else:
            tiingo_day_count = len(tiingo_market_cap_data)
            tiingo_has_more_data = len(fmp_market_cap_data) <= tiingo_day_count
        num_days = len(fmp_market_cap_data) - tiingo_day_count

        if tiingo_has_more_data:
            marketcap_metadata["source"] = "tiingo"
            market_cap_data = tiingo_market_cap_data
        else:
            print("Using fmp market data; more by: " + str(num_days))
            marketcap_metadata["source"] = "fmp"
            if days_to_calculate: del marketcap_metadata["num_trading_days_to_calculate"]
            market_cap_data = fmp_market_cap_data

    # add more data from investing.com for these 3 cases
    if index in [89,90,162]:
        add_info = {}
        additional_marketcap_data = get_misc_market_cap_data(index, original_ticker, add_info, s3)
        market_cap_data = additional_marketcap_data + market_cap_data
        marketcap_metadata["source"] += " + " + add_info["source"]

    #handle marketcap metadata
    if len(market_cap_data) > 0:
        marketcap_metadata["first_day_have_vs_needed"] = market_cap_data[0]["date"] + " : " + first_day_needed
        marketcap_metadata["last_day_have_vs_needed"] = market_cap_data[-1]["date"] + " : " + last_day_needed

        first_date_in_data = datetime.strptime(market_cap_data[0]["date"], iso_date_format)
        last_date_in_data = datetime.strptime(market_cap_data[-1]["date"], iso_date_format)
        first_date_needed = datetime.strptime(first_day_needed, iso_date_format)
        last_date_needed = datetime.strptime(last_day_needed, iso_date_format)

        #count the trading days between the first needed day and the first day we have
        if first_date_needed < first_date_in_data:
            missing_days = nyse.valid_days(start_date=first_date_needed, end_date=first_date_in_data)
            if str(first_date_needed.date()) == str(missing_days[0].date()): #exclude first day if same
                missing_days = missing_days[1:]
            days_between = len(missing_days)
            if days_between > 0:
                print(first_day_needed + " : " + str(first_date_in_data.date()))
                print("Missing days of earlier data for market cap: " + str(days_between))
                marketcap_metadata["missing_num_days_before"] = days_between
        #count the trading days between the last day we have and the last needed day
        if last_date_in_data < last_date_needed:
            missing_days = nyse.valid_days(start_date=last_date_in_data, end_date=last_day_needed)
            if str(last_date_in_data.date()) == str(missing_days[0].date()): #exclude first day if same
                missing_days = missing_days[1:]
            days_between = len(missing_days)
            if days_between > 0:
                print(str(last_date_in_data.date()) + " : " + last_day_needed)
                print("Missing days of later data for market cap: " + str(days_between))
                marketcap_metadata["missing_num_days_after"] = days_between
    else:
        marketcap_metadata["is_empty"] = True
        print("No market cap data found: " + ticker)

    marketcap_metadata["num_of_days_data"] = len(market_cap_data)

    file_name = str(index) + "_" + original_ticker + ".json"

    #store market cap data
    s3.put_object(Bucket=bucket_name, Key="company_market_cap_data/" + file_name,
                  Body=json.dumps(market_cap_data, indent=4))

    #store origin of market cap data
    s3.put_object(Bucket=bucket_name, Key="market_cap_metadata/" + file_name,
                  Body=json.dumps(marketcap_metadata, indent=4))

    return market_cap_data




In [17]:
def get_tiingo_ticker(ticker, i):
    """Translate a dataset ticker into the symbol tiingo lists it under.

    Some delisted or moved stocks have modified tickers for tiingo. Each
    override only applies when i falls inside the index range it is listed
    under (lower bound included, upper bound excluded).

    ticker: ticker as it appears in the dataset
    i: dataset index of the company
    returns: the tiingo ticker, or ticker unchanged when no override applies
    """
    overrides_by_index_range = (
        (500, 600, {
            "WRK": "WRK-W", "FRC": "FRCB", "SIVB": "SIVBQ", "STI": "STI-WS-B", "INFO": "MRKT",
        }),
        (600, 700, {
            "XL": "XLGLF", "WYND": "WYN", "BBBY": "BBBYQ", "MNK": "MNKKQ", "ENDP": "ENDPQ",
            "SE": "SE1", "ALTR": "ALTR1", "PLL": "PLL1", "DTV": "DTV1", "WIN": "WINMQ",
        }),
        (700, 800, {
            "LIFE": "LIFE2", "DELL": "DELL1", "BIG": "BIGGQ", "DF": "DFODQ", "SUN": "SUN1",
            "ANR": "ANRZQ", "SHLD": "SHLDQ", "MMI": "MMI1", "NSM": "NSM1", "Q": "Q1",
            "MIL": "MIL1",
        }),
        (800, 900, {
            "CTX": "CTX1", "EQ": "EQ1", "WFT": "WFTIQ", "UST": "UST1", "WB": "WB2",
            "ABI": "ABI1", "BUD": "BUD1", "SAF": "SAF2", "OMX": "OMX1", "BSC": "BSC1",
            "CC": "CCTYQ", "CBH": "CBH1", "AT": "AT1", "AV": "AV1", "PD": "PD1",
            "FSLB": "FSL-B",
        }),
        (900, 1000, {
            "ACV": "ACV1", "ASO": "ASO1", "EC": "EC1", "CHIR": "CHIR1", "BR": "BR1",
            "JP": "JP1", "DPH": "DPHIQ", "G": "G1", "SDS": "SDS1", "VRTS": "VRTS1",
            "S": "S1", "WLP": "WLP1", "CF": "CF1", "ONE": "ONE1", "AM": "AM1",
            "TAP.B": "TAP-B", "TUP": "TUPBQ", "CE": "CE1",
            "BGEN": "BIIB", #merger in 2003;reentered SP500 directly after
            "HI": "HI1", "AMR": "AAMRQ", "COC.B": "COC-B", "NT": "NRTLQ", "AL": "AL1",
            "CNXT": "CNXT1", "U": "UAIRQ", "WCOM": "WCOEQ", "WLL": "WLL1", "NMK": "NMK1",
            "MEA": "MEA1", "TX": "TX1",
        }),
        (1000, 1100, {
            "WB": "WB2", #also in 800s
            "AGC": "AGC1", "AZA.A": "AZA-A", "CEN": "CEN1", "SUB": "SUB1", "UK": "UK1",
            "SMI": "SMI1", "VO": "VO1", "AFS.A": "AFS-A",
            "SEG": "STX", #relisted in NASDAQ from NYSE
            "CG": "CG1", "EFU": "EFU1", "BFO": "BFO1", "GAP": "GAPTQ",
            "RAD": "RADCQ", #unnecessary, but for testing; has fmp data as well
            "GTE": "GTE1", "MZ": "MZIAQ", "CHA": "CHA1",
            "UMG": "USW", #incorrect or older symbol? using usw for something else 1051
            "CSR": "CSR1", "TMC.A": "TMC-A", "MIR": "MIR1", "RLM": "RLM1", "CBS": "CBS1",
            "ARC": "ARC1", "PBY": "PBY1", "FLE": "FLTWQ", "CSE": "CSE1", "AIT": "AIT1",
            "PHB": "PHB1", "FTL.A": "FTL-A", "FRO": "FRO1", "NLC": "NLC1", "TA": "TA2",
            "PVT": "PVT1",
        }),
        (1100, 1200, {
            "ATI": "ATI1", "ASND": "ASND1", "ASC": "ASC1", "HPH": "HRZIQ", "FMY": "FMY1",
            "UCC": "UCC1", "ANV": "ANV1", "AMP": "AMP1", "PZE": "PZE1", "CCI": "CCI1",
            "GSX": "GSX1", "FCN": "FCN1", "AHM": "AHM1", "DI": "DI1", "MNR": "MNR1",
            "BAY": "BAY1", "DIGI": "DIGI2", "GFS.A": "GFS-A", "ECH": "ECH1", "CHRS": "CHRS1",
            "SK": "SK1", "FLM": "FLMIQ",
        }),
    )

    #the ranges do not overlap, so at most one table can apply
    for lower_index, upper_index, overrides in overrides_by_index_range:
        if lower_index <= i < upper_index:
            return overrides.get(ticker, ticker)
    return ticker




#### Main
    #NOTE: a new S3 client must be created inside each Spark call; 
      #a single S3 client is not "distributable" across workers and runs into "lock/threads" issues, since a client is intended for use by "one process"

Spark Times: (default - 3 workers)
    43 seconds for all company_profiles (18 seconds for 10 workers)
    29 minutes for all market_cap_data (2 minutes for 10 workers)
    
    127 seconds for everything(127 seconds for 10 workers)


In [18]:
# Driver-side S3 client used to read the project files
s3 = boto3.client('s3')
# Bucket that holds the project data, and the path of the cleaned CSV inside it
bucket_name = 'sp500-historical-analysis-project'
file_key = 'cleaned_sp_500_dataset.csv'
response = s3.get_object(Bucket=bucket_name, Key=file_key) # Fetch the CSV object
csv_content = response['Body'].read().decode('utf-8') # Whole file as one string
csv_buffer = StringIO(csv_content) # File-like wrapper so pandas can parse the string

df = pd.read_csv(csv_buffer).iloc[:1153] # Keep only the first 1153 rows
df["Index"] = list(range(len(df))) # Positional row number of each company
df = df.where(df.notna(), None) #replace "NAN" values with NULL
print(df) # Display the DataFrame

     Ticker                   Name  ...     Removal_Reason Index
0      DELL      Dell Technologies  ...               None     0
1      ERIE         Erie Indemnity  ...               None     1
2      PLTR  Palantir Technologies  ...               None     2
3        SW       Smurfit WestRock  ...               None     3
4      CRWD            CrowdStrike  ...               None     4
...     ...                    ...  ...                ...   ...
1148    ITT        ITT Corporation  ...  Annual Re-ranking  1148
1149    PAS     Pepsiamericas Inc.  ...  Annual Re-ranking  1149
1150    CBB     Caliber System Inc  ...  Annual Re-ranking  1150
1151    ECO     Echo Bay Mines Ltd  ...  Annual Re-ranking  1151
1152    BBI     Barnett Banks Inc.  ...  Annual Re-ranking  1152

[1153 rows x 7 columns]


In [19]:
# Read the tiingo metadata JSON from S3; get_SP500_data hands it to get_company_data
meta_data_list = None
bucket_name = 'sp500-historical-analysis-project'
file_key = 'misc/tiingo_meta_data.json'
response = s3.get_object(Bucket=bucket_name, Key=file_key) # Fetch the JSON object
file_content = response['Body'].read().decode('utf-8') # Whole file as one string
meta_data_list = json.loads(file_content) # Parse the string as JSON

def get_SP500_data(ticker, company_name, index, added_date, removal_date):
    """Build and store the profile and the market cap series of one company.

    Meant to be called once per dataset row from Spark. The company profile is
    stored by get_company_data; only the market cap series is returned.

    ticker: ticker as it appears in the dataset
    company_name: name of the company
    index: dataset index of the company
    added_date: date string such as "January 21, 2012"
    removal_date: date string in the same format, or None

    returns: whatever get_market_cap_data returns for this company
    """
    #NOTE: must make a new S3 client inside for each spark call;
      #a single S3 client is not "distributable" and will run into "lock/threads" issues since a client is intended for "one process"
    s3 = boto3.client('s3')

    company_profile = {"ticker": ticker}
    tiingo_ticker = get_tiingo_ticker(ticker, index)

    #the profile is written to S3 inside get_company_data, so its return value is not needed here
    get_company_data(tiingo_ticker, company_profile, company_name, meta_data_list, ticker, index, s3)
    return get_market_cap_data(tiingo_ticker, ticker, index, added_date, removal_date, company_name, s3)




In [20]:
# Hand the pandas table to Spark and preview the first rows
spark_df = spark.createDataFrame(df)
spark_df.show(n=5)

+------+--------------------+------------------+------------+--------+--------------+-----+
|Ticker|                Name|        Added_Date|Removed_Date|Replaces|Removal_Reason|Index|
+------+--------------------+------------------+------------+--------+--------------+-----+
|  DELL|   Dell Technologies|September 23, 2024|        null|    ETSY|          null|    0|
|  ERIE|      Erie Indemnity|September 23, 2024|        null|     BIO|          null|    1|
|  PLTR|Palantir Technolo...|September 23, 2024|        null|     AAL|          null|    2|
|    SW|    Smurfit WestRock|     July 05, 2024|        null|     WRK|          null|    3|
|  CRWD|         CrowdStrike|     June 24, 2024|        null|    null|          null|    4|
+------+--------------------+------------------+------------+--------+--------------+-----+
only showing top 5 rows



In [23]:
# Run get_SP500_data once per dataset row through Spark and collect the returned market cap lists
result = (
    spark_df.rdd
    .map(lambda row: get_SP500_data(row["Ticker"], row["Name"], row["Index"], row["Added_Date"], row["Removed_Date"]))
    .collect()
)
result[:5]

[[{'date': '2024-09-23', 'market_cap': 83392886916.81}, {'date': '2024-09-24', 'market_cap': 83208467842.23}, {'date': '2024-09-25', 'market_cap': 85237077662.61}, {'date': '2024-09-26', 'market_cap': 89719879783.17}, {'date': '2024-09-27', 'market_cap': 85272542869.26}, {'date': '2024-09-30', 'market_cap': 84080911925.82}], [{'date': '2024-09-23', 'market_cap': 24864019830.8}, {'date': '2024-09-24', 'market_cap': 25167036792.4}, {'date': '2024-09-25', 'market_cap': 24894968209.5}, {'date': '2024-09-26', 'market_cap': 24845543186.8}, {'date': '2024-09-27', 'market_cap': 24897277790.0}, {'date': '2024-09-30', 'market_cap': 24935154910.2}], [{'date': '2024-09-23', 'market_cap': 84510719860.05}, {'date': '2024-09-24', 'market_cap': 82172478599.1}, {'date': '2024-09-25', 'market_cap': 82662395815.68}, {'date': '2024-09-26', 'market_cap': 82617857886.9}, {'date': '2024-09-27', 'market_cap': 82038864812.76}, {'date': '2024-09-30', 'market_cap': 83307680764.8}], [{'date': '2024-07-08', 'marke

In [24]:
# Number of per-company results that were collected from Spark
len(result)

1153


In [28]:
# Print how many daily market cap records were collected for each dataset index
for position, company_records in enumerate(result):
    print(f"{position}: {len(company_records)}")

0: 6
1: 6
2: 6
3: 60
4: 69
5: 69
6: 69
7: 100
8: 127
9: 127
10: 137
11: 137
12: 198
13: 198
14: 198
15: 240
16: 240
17: 249
18: 262
19: 262
20: 277
21: 323
22: 355
23: 387
24: 390
25: 390
26: 437
27: 445
28: 446
29: 481
30: 495
31: 502
32: 502
33: 512
34: 512
35: 574
36: 574
37: 582
38: 3671
39: 627
40: 650
41: 660
42: 668
43: 699
44: 703
45: 763
46: 763
47: 763
48: 777
49: 805
50: 851
51: 869
52: 889
53: 889
54: 889
55: 914
56: 930
57: 939
58: 950
59: 1002
60: 1014
61: 1014
62: 1077
63: 1077
64: 1097
65: 1105
66: 1105
67: 1131
68: 1131
69: 1154
70: 1178
71: 1201
72: 1201
73: 1201
74: 1211
75: 1213
76: 1222
77: 1257
78: 1262
79: 1265
80: 1295
81: 1295
82: 6282
83: 1314
84: 1323
85: 1336
86: 1343
87: 1342
88: 1385
89: 2274
90: 4980
91: 1409
92: 1416
93: 1435
94: 1452
95: 1466
96: 1466
97: 1479
98: 1484
99: 1502
100: 1510
101: 1533
102: 1573
103: 1581
104: 1583
105: 1592
106: 1635
107: 1645
108: 1697
109: 1752
110: 1770
111: 1781
112: 1784
113: 1808
114: 1808
115: 1808
116: 1808
117: 672