In [1]:
import os
import textwrap
import google.generativeai as genai
from IPython.display import Markdown
import pandas as pd

import warnings
warnings.filterwarnings('ignore')


def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [2]:
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [5]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [6]:
model = genai.GenerativeModel('gemini-1.0-pro')

In [7]:
model.generate_content("Who is the president of the United States?")

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({'candidates': [{'content': {'parts': [{'text': 'Joe Biden'}], 'role': 'model'}, 'finish_reason': 1, 'index': 0, 'safety_ratings': [{'category': 9, 'probability': 1, 'blocked': False}, {'category': 8, 'probability': 1, 'blocked': False}, {'category': 7, 'probability': 1, 'blocked': False}, {'category': 10, 'probability': 1, 'blocked': False}], 'token_count': 0, 'grounding_attributions': []}]}),
)

In [8]:
ecl_dataset = pd.read_csv('ECL_with_ticker_2024-04-19_14-35-21.csv')
ecl_dataset.head()

Unnamed: 0.1,Unnamed: 0,cik,company,period_of_report,gvkey,datadate,filename,qualified,label,bankruptcy_prediction_split,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,filing_date,ticker,exchange,gurufocus-stockid,gurufocus-company-name,first_match,cik-equal-gurufocus-cik
0,0,1750,AAR CORP,1994-05-31,1004.0,31/05/1994,/1994/1750_10K_1994_0000912057-94-002818.json,Yes,False,train,,,,1994-08-24,AIR,NYSE,US06AR,AAR Corp,,
1,1,1750,AAR CORP,1995-05-31,1004.0,31/05/1995,/1995/1750_10K_1995_0000912057-95-006316.json,Yes,False,train,,,,1995-08-11,AIR,NYSE,US06AR,AAR Corp,,
2,2,1750,AAR CORP,1996-05-31,1004.0,31/05/1996,/1996/1750_10K_1996_0000912057-96-018355.json,Yes,False,train,,,,1996-08-20,AIR,NYSE,US06AR,AAR Corp,,
3,3,1750,AAR CORP,1997-05-31,1004.0,31/05/1997,/1997/1750_10K_1997_0000912057-97-028915.json,Yes,False,train,,,,1997-08-22,AIR,NYSE,US06AR,AAR Corp,,
4,4,1750,AAR CORP,1998-05-31,1004.0,31/05/1998,/1998/1750_10K_1998_0001047469-98-032283.json,Yes,False,train,,,,1998-08-20,AIR,NYSE,US06AR,AAR Corp,,


In [9]:
ecl_dataset_grouped = ecl_dataset.groupby('cik').agg({'bankruptcy_date_1': 'first', 'bankruptcy_date_2': 'first', 'bankruptcy_date_3': 'first','ticker': 'first', 'label': 'last', 'cik-equal-gurufocus-cik': 'last', 'filing_date': 'last'}).reset_index()
ecl_dataset_grouped['is_bankruptcy_date_filled'] = ecl_dataset_grouped['bankruptcy_date_1'].notnull()
ecl_dataset_grouped['is_bankruptcy_date_filled'].value_counts()

is_bankruptcy_date_filled
False    8252
True      891
Name: count, dtype: int64

Download for each company that is eligible - it means that ticker matches, but company might be not in range & could have not enough variables or could have not enough years of data - the financial data in the date range of the bankruptcy date. 

If there are multiple bankruptcy dates, choose the last one. If there is no bankruptcy date, choose the last filing date.

In [10]:
ecl_dataset_grouped_eligible = ecl_dataset_grouped[ecl_dataset_grouped['cik-equal-gurufocus-cik'] == True]
print(len(ecl_dataset_grouped_eligible))
ecl_dataset_grouped_eligible.head()

5417


Unnamed: 0,cik,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,ticker,label,cik-equal-gurufocus-cik,filing_date,is_bankruptcy_date_filled
1,1800,,,,ABT,False,True,2021-02-19,False
6,2488,,,,AMD,False,True,2021-01-29,False
7,2491,,,,BYI,False,True,2014-08-29,False
8,2601,,,,ARXX,False,True,2006-09-13,False
10,2969,,,,APD,False,True,2021-11-18,False


In [11]:
companies_eligible_to_financial_dataset = pd.read_csv('companies_eligible_to_financial_dataset_2024-04-19_14-47-55.csv')
len(companies_eligible_to_financial_dataset)
companies_eligible_to_financial_dataset.head()

Unnamed: 0.1,Unnamed: 0,cik,company,label,ticker,gurufocus-company-name,gurufocus-stockid,filing_date,cik-equal-gurufocus-cik,first_match
0,0,1800,ABBOTT LABORATORIES,False,ABT,Abbott Laboratories,US066X,2021-02-19,True,
1,1,2488,ADVANCED MICRO DEVICES INC,False,AMD,Advanced Micro Devices Inc,US022E,2021-01-29,True,
2,2,2491,"BALLY TECHNOLOGIES, INC.",False,BYI,Bally Technologies Inc (Delisted),US06R5,2014-08-29,True,
3,3,2601,AEROFLEX INC,False,ARXX,Aeroflex Inc (Delisted),US026H,2006-09-13,True,
4,4,2969,AIR PRODUCTS & CHEMICALS INC /DE/,False,APD,Air Products & Chemicals Inc,US06DU,2021-11-18,True,


In [12]:
len(companies_eligible_to_financial_dataset)

5417

In [13]:
final_financial_data_in_date_range_directory = 'final_financial_data_in_date_range'

In [14]:
def is_date_valid(value):
  try:
    date = pd.to_datetime(value)
    if pd.isnull(date):
      return False
    return True
  except (pd.errors.OutOfBoundsDatetime, ValueError):
    return False

report_datetime_before_decision means the last date of the financial report that is right before bankruptcy date. If there is no bankruptcy date, it means the last date of the financial report that is right before the last filing date.

In [15]:
eligible_companies_with_minimal_years_count = 0
cnt_bankruptcies = []
cnt_no_bankruptcies = 0
no_date_found = 0
exceptions_count = 0
exceptions = []

for filename in os.listdir(final_financial_data_in_date_range_directory):
    try:
        cik = int(filename.split('-')[0])
        df = pd.read_csv(f'{final_financial_data_in_date_range_directory}/{filename}')
        
        bankruptcies = ecl_dataset.loc[ecl_dataset['cik'] == cik][['bankruptcy_date_1', 'bankruptcy_date_2', 'bankruptcy_date_3']]
    
        if cik == 2601:
            print('here')
    
        bankruptcy_date = None
        report_datetime_before_decision = None
        does_company_went_bankrupt = True
        if is_date_valid(bankruptcies['bankruptcy_date_3'].values[0]):
            bankruptcy_date = bankruptcies['bankruptcy_date_3'].values[0]
            cnt_bankruptcies.append(3)
            print(f'bankruptcy_date_3: {filename}')
        elif is_date_valid(bankruptcies['bankruptcy_date_2'].values[0]):
            bankruptcy_date = bankruptcies['bankruptcy_date_2'].values[0]
            cnt_bankruptcies.append(2)
        elif is_date_valid(bankruptcies['bankruptcy_date_1'].values[0]):
            bankruptcy_date = bankruptcies['bankruptcy_date_1'].values[0]
            cnt_bankruptcies.append(1)
        else:
            # if company didn't go bankrupt, then we need to find the last report before the last filing date
            report_datetime_before_decision = ecl_dataset.loc[ecl_dataset['cik'] == cik]['filing_date'].values[-2]
            cnt_no_bankruptcies += 1
            does_company_went_bankrupt = False
            
        # if company go bankrupt, then we need to find the last report before bankruptcy date
        if does_company_went_bankrupt:
            filing_dates = ecl_dataset.loc[ecl_dataset['cik'] == cik]['filing_date'].values
            report_datetime_before_decision = [date for date in filing_dates if date < bankruptcy_date][-1]
                
        if pd.isna(report_datetime_before_decision):
            print(f'No date found for {filename}')
            no_date_found += 1
            continue
                
        report_filenames = ecl_dataset.loc[ecl_dataset['cik'] == cik][['filename', 'filing_date']]
        report_filename_before_decision = report_filenames.loc[report_filenames['filing_date'] == report_datetime_before_decision]['filename'].values[0]
        
        if report_filename_before_decision is None:
            print(f'No date found for {filename}')
        
        ecl_dataset_grouped_eligible.loc[ecl_dataset_grouped_eligible['cik'] == cik, 'report_datetime_before_decision'] = report_datetime_before_decision
        ecl_dataset_grouped_eligible.loc[ecl_dataset_grouped_eligible['cik'] == cik, 'report_filename_before_decision'] = report_filename_before_decision
        
    except Exception as ex:
        exceptions_count += 1
        exceptions.append(ex)

bankruptcy_date_3: 106618-US03OX_HNH.csv


In [16]:
print(f'No date found: {no_date_found}')

No date found: 0


In [17]:
from collections import Counter
cnt_bankruptcies = Counter(cnt_bankruptcies)
cnt_bankruptcies.most_common(10)

[(1, 310), (2, 20), (3, 1)]

In [18]:
len(ecl_dataset_grouped_eligible)

5417

In [19]:
ecl_dataset_grouped_eligible[ecl_dataset_grouped_eligible['report_filename_before_decision'].isnull()]

Unnamed: 0,cik,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,ticker,label,cik-equal-gurufocus-cik,filing_date,is_bankruptcy_date_filled,report_datetime_before_decision,report_filename_before_decision
8,2601,,,,ARXX,False,True,2006-09-13,False,,
31,3952,,,,ADGI,False,True,2004-03-15,False,,
32,3982,1987-06-29,,,ALY,False,True,2011-03-15,True,,
66,6071,2001-08-20,1990-04-25,,AMESQ,False,True,2002-05-03,True,,
70,6207,,,,AXR,False,True,2008-07-14,False,,
...,...,...,...,...,...,...,...,...,...,...,...
9137,1834376,,,,INNV,False,True,2021-09-23,False,,
9139,1835256,,,,NAPA,False,True,2021-10-04,False,,
9140,1839439,,,,PYCR,False,True,2021-09-02,False,,
9141,1857951,,,,WEBR,False,True,2021-12-14,False,,


In [20]:
ecl_dataset_grouped_eligible['report_filename_before_decision'].isnull().sum()

754

In [21]:
ecl_dataset_grouped_eligible_with_reports = ecl_dataset_grouped_eligible.dropna(subset=['report_filename_before_decision'])
print(len(ecl_dataset_grouped_eligible_with_reports))
ecl_dataset_grouped_eligible_with_reports.head()

4663


Unnamed: 0,cik,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,ticker,label,cik-equal-gurufocus-cik,filing_date,is_bankruptcy_date_filled,report_datetime_before_decision,report_filename_before_decision
1,1800,,,,ABT,False,True,2021-02-19,False,2020-02-21,/2019/1800_10K_2019_0001104659-20-023904.json
6,2488,,,,AMD,False,True,2021-01-29,False,2020-02-04,/2019/2488_10K_2019_0000002488-20-000008.json
7,2491,,,,BYI,False,True,2014-08-29,False,2013-08-28,/2013/2491_10K_2013_0001047469-13-008711.json
10,2969,,,,APD,False,True,2021-11-18,False,2020-11-19,/2020/2969_10K_2020_0000002969-20-000049.json
15,3153,,,,ALPpQ.PFD,False,True,2021-02-18,False,2020-02-20,/2019/3153_10K_2019_0000092122-20-000017.json


In [20]:
import datetime
def get_datetime_now():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ecl_dataset_grouped_eligible_with_reports.to_csv(f'ecl_dataset_grouped_eligible_with_reports_{get_datetime_now()}.csv', index=False)

In [22]:
ecl_dataset = pd.read_csv('ECL_with_ticker_2024-04-19_14-35-21.csv')
ecl_dataset.head()

Unnamed: 0.1,Unnamed: 0,cik,company,period_of_report,gvkey,datadate,filename,qualified,label,bankruptcy_prediction_split,bankruptcy_date_1,bankruptcy_date_2,bankruptcy_date_3,filing_date,ticker,exchange,gurufocus-stockid,gurufocus-company-name,first_match,cik-equal-gurufocus-cik
0,0,1750,AAR CORP,1994-05-31,1004.0,31/05/1994,/1994/1750_10K_1994_0000912057-94-002818.json,Yes,False,train,,,,1994-08-24,AIR,NYSE,US06AR,AAR Corp,,
1,1,1750,AAR CORP,1995-05-31,1004.0,31/05/1995,/1995/1750_10K_1995_0000912057-95-006316.json,Yes,False,train,,,,1995-08-11,AIR,NYSE,US06AR,AAR Corp,,
2,2,1750,AAR CORP,1996-05-31,1004.0,31/05/1996,/1996/1750_10K_1996_0000912057-96-018355.json,Yes,False,train,,,,1996-08-20,AIR,NYSE,US06AR,AAR Corp,,
3,3,1750,AAR CORP,1997-05-31,1004.0,31/05/1997,/1997/1750_10K_1997_0000912057-97-028915.json,Yes,False,train,,,,1997-08-22,AIR,NYSE,US06AR,AAR Corp,,
4,4,1750,AAR CORP,1998-05-31,1004.0,31/05/1998,/1998/1750_10K_1998_0001047469-98-032283.json,Yes,False,train,,,,1998-08-20,AIR,NYSE,US06AR,AAR Corp,,


In [23]:
text = ''

if True:
    text = 'Discussion and Analysis of Financial Condition and Results of Operations\nThe following discussion and analysis are meant to provide material information relevant to an assessment of the financial condition and results of operations of our company, including an evaluation of the amounts of cash flows from operations and outside resources, liquidity and certain other factors that may affect future results so as to allow investors to better view our company from management\u2019s perspective. The following discussion and analysis of our financial condition and results of operations should be read together with our financial statements and the related notes and other financial information included elsewhere in this annual report on Form 10-K. Some of the information contained in this discussion and analysis or set forth elsewhere in this annual report on Form 10-K, including information with respect to our plans and strategy for our business and financing, includes forward-looking statements that involve risks and uncertainties. Carefully review the \u201cForward-Looking Statements\u201d and \u201cRisk Factors\u201d sections of this annual report on Form 10-K for a discussion of important factors that could cause actual results to differ materially from the results described in or implied by the forward-looking statements contained in the following discussion and analysis.\nOverview\nWe are a multi-national enterprise that leverages its proprietary data visualization technologies to design, develop, manufacture, distribute and service a broad range of products that acquire, store, analyze and present data in multiple formats. We organize our structure around a core set of competencies, including research and\ndevelopment, manufacturing, service, marketing and distribution. We market and sell our products and services through the following two segments:\n\u2022\nProduct Identification (\u201cPI\u201d) - offers color and monochromatic digital label printers, over-printers and custom OEM printers. PI also provides software to design, manage and print labeling and packaging images locally and across networked printing systems, as well as all related printing supplies such as pressure-sensitive labels, tags, inks, toners and thermal transfer ribbons used by digital printers. PI also provides on-site and remote service, spare parts and various service contracts.\n\u2022\nTest and Measurement (\u201cT&M\u201d) - offers a suite of products and services that acquire data from local and networked data streams and sensors as well as wired and wireless networks. The T&M segment includes a line of aerospace printers used to print hard copies of data required for the safe and efficient operation of aircraft, including navigation maps, clearances, arrival and departure procedures, NOTAMS, flight itineraries, weather maps, performance data, passenger data, and various air traffic control data. Aerospace products also include aircraft networking systems for high-speed onboard data transfer. T&M also provides repairs, service and spare parts.\nOn August 4, 2022, we completed the acquisition of Astro Machine, an Illinois-based manufacturer of printing equipment, including label printers, tabbers, conveyors, and envelope feeders, for aggregate consideration of $17.1 million. Astro Machine is reported as part of our PI segment beginning with the third quarter of fiscal 2023. Refer to Note 2, \u201cAcquisition,\u201d in our consolidated financial statements included elsewhere in this report for further details.\nWe market and sell our products and services globally through a diverse distribution structure of direct sales personnel, manufacturers\u2019 representatives and authorized dealers that deliver a full complement of branded products and services to customers in our respective markets. Our growth strategy centers on organic growth through product innovation made possible by research and development initiatives, as well as strategic acquisitions that fit into or complement existing core businesses. In fiscal 2023, 2022, and 2021, revenue from customers in various geographic areas outside the United States, primarily in Western Europe, Canada and Asia, amounted to $50.6 million, $49.3 million, and $45.1 million, respectively.\nWe maintain an active program of product research and development. We spent approximately $6.8 million in both fiscal 2023 and 2022, and $6.2 million in fiscal 2021 on Company-sponsored product development. We are committed to continuous product development as essential to our organic growth and expect to continue our focus on research and development efforts in fiscal 2024 and beyond.'
else:
    pass

In [24]:
def generate_prompt(text_for_prompt):
    return f"You are a financial analyst, specialized in assessing companies' financial health and communicating with clients. I have the management discussion and analysis from a company's 10k report, and I would like to know the elements that could indicate its financial health. Provide a concise summary of the most important information from the investor perspective from the included text, focusing on information about financial performance (e.g., revenue, profitability, liquidity and capital resources), risks (industry Trends, competition, market fluctuations), or future outlook (e.g., growth strategies, acquisitions). Do not split to paragraphs, give just list of sentence that make a summary out of the text. Text: {text_for_prompt}, where text is a management's discussion and analysis section from a 10-K report, with a maximum length of around 15000 words. Try to generate answer with maximum length of 512 tokens."

In [25]:
summarized_mda_sections_for_eligible_companies_in_range_directory = 'summarized_mda_sections_for_eligible_companies_in_range'

In [26]:
reports_filepaths = ecl_dataset_grouped_eligible_with_reports[['cik', 'report_filename_before_decision']].to_numpy()
reports_filepaths[0]

array([1800, '/2019/1800_10K_2019_0001104659-20-023904.json'],
      dtype=object)

In [None]:
import json
import time
from tqdm import tqdm

reports_directory = 'data'
response_lengths = []

cnt = 0
requests_per_day_limit = 1500  # 1500
requests_per_minute = 15
requests_times_elapsed = []

processed_files_count = 1711

for row in tqdm(reports_filepaths[processed_files_count:requests_per_day_limit+processed_files_count]):
    success = False
    ex_counter = 0
    while success is False:
        try:
            time.sleep(3.8)
            start = time.time()
            
            cnt += 1
            if cnt == 10:
                break
            
            cik = row[0]
            report_filename = row[1]
            
            # read reports as json file with open
            with open(f'{reports_directory}/{report_filename}', 'r') as file:
                report = file.read()
                
            parsed_report = json.loads(report)
            item_7 = parsed_report['item_7']
            
            if len(item_7) == 0:
                print(f'Empty item 7 for {cik}')
                break
            
            has_content_proper_length = False
            while has_content_proper_length is False:
                response = model.generate_content(generate_prompt(item_7))
                total_tokens = model.count_tokens(response.text).total_tokens
                if total_tokens < 512:
                    has_content_proper_length = True
                    response_lengths.append(total_tokens)
                    with open(f'{summarized_mda_sections_for_eligible_companies_in_range_directory}/{cik}-summarized_mda-tokens_{total_tokens}.txt', 'w') as file:
                        file.write(response.text)
                    
                else:
                    print('Content too long, trying again...')
    
            end = time.time()
            requests_times_elapsed.append(round(end - start))
                
            success = True
           
        except Exception as ex:
            print(ex)
            success = False
   

  0%|          | 1/1500 [00:14<6:01:40, 14.48s/it]

Content too long, trying again...


  0%|          | 4/1500 [01:11<7:18:19, 17.58s/it]

Content too long, trying again...


In [None]:
import matplotlib.pyplot as plt
tokens_counts = [int(filename.split('_')[-1].split('.')[0]) for filename in os.listdir(summarized_mda_sections_for_eligible_companies_in_range_directory)]

plt.hist(tokens_counts, bins=50)

In [None]:
# rerun process for too short texts
# remove texts with empty item7

too_short_texts_count = 0
for filename in os.listdir(summarized_mda_sections_for_eligible_companies_in_range_directory):
    success = False
    while success is False:
        try:
            with open(f'{summarized_mda_sections_for_eligible_companies_in_range_directory}/{filename}', 'r') as file:
                text = file.read()
                
                tokens_count = filename.split('_')[-1].split('.')[0]
                
                if tokens_count < 200:
                    
                    cik = filename.split('-')[0]
                    too_short_texts_count += 1
                    
                    report_filename = ecl_dataset_grouped_eligible_with_reports.loc[ecl_dataset_grouped_eligible_with_reports['cik'] == int(cik)]['report_filename_before_decision'].values[0]
                    
                    parsed_report = json.loads(report)
                    item_7 = parsed_report['item_7']
                    
                    if len(item_7) == 0:
                        print(f'Empty item 7 for {cik}')
                        break
                    
                    has_content_proper_length = False
                    while has_content_proper_length is False:
                        response = model.generate_content(generate_prompt(item_7))
                        total_tokens = model.count_tokens(response.text).total_tokens
                        if total_tokens < 512:
                            has_content_proper_length = True
                            response_lengths.append(total_tokens)
                            with open(f'{summarized_mda_sections_for_eligible_companies_in_range_directory}/{cik}-summarized_mda-tokens_{total_tokens}.txt', 'w') as file:
                                file.write(response.text)
                                
                            success = True
                        else:
                            print('Content too long, trying again...')
        except Exception as ex:
            success = False
            print(ex)
        

In [None]:
import re

def clean_markdown(text):
  markdown_chars = ["*", "_", "#", "~", "`", ">", "=", "[", "]"]
  text = re.sub(f"[{re.escape(''.join(markdown_chars))}]", "", text)
  text = re.sub(r"\n", "", text)
  
  return text

# Example usage
text = """
*This is a **Markdown** example with* lists.
- It has multiple paragraphs.

> This is a quote.

Will it work?
Yes!
"""
cleaned_text = clean_markdown_and_sentences(text)
print(cleaned_text)

In [None]:
textual_data = pd.DataFrame(columns=['cik', 'text'])

for filename in os.listdir(summarized_mda_sections_for_eligible_companies_in_range_directory):
    with open(f'{summarized_mda_sections_for_eligible_companies_in_range_directory}/{filename}', 'r') as file:
        text = file.read()
        
        text = clean_markdown(text)
        
        cik = int(filename.split('-')[0])
        
        textual_data = textual_data.append({'cik': cik, 'text': text}, ignore_index=True)
        

textual_data.to_csv(f'textual_data_{get_datetime_now()}.csv', index=False)
textual_data.head()

In [None]:
len(textual_data)

In [None]:
# example result dataframes
import pandas as pd

data = {
    'cik': range(1, 11),
    'text': [
        "Company 1 is a leading tech innovator. Its recent product launch shows promise.",
        "Company 2 faced supply chain disruptions.  This impacted quarterly earnings.",
        "Company 3 acquired a smaller competitor, expanding market share.",
        "Company 4's CEO announced retirement plans, causing stock volatility.",
        "Company 5 is investing heavily in renewable energy projects.",
        "Company 6 experienced a cyberattack, compromising sensitive data.",
        "Company 7 reported record profits due to strong consumer demand.",
        "Company 8 is facing regulatory scrutiny over environmental practices.",
        "Company 9 launched a new marketing campaign to boost sales.",
        "Company 10 settled a class-action lawsuit related to product safety."
    ]
}

df = pd.DataFrame(data)
print(df)

df.to_csv('example_textual_data.csv', index=False)