In [245]:
import pandas as pd
import numpy as np
import requests
import html5lib
from bs4 import BeautifulSoup
from datetime import datetime
from openpyxl import workbook
import re
import os
import webbrowser
import unicodedata
import Helper

In [246]:
# HTTP headers passed to every Helper request made in this notebook.
headers = {
    'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'
}
# Company identifier handed to Helper.fetch_filing_data.
CIK = '0001572694'

# Single source of truth for the workbook path: it is written below and read
# back by Helper.get_filing_links, so the two uses can never drift apart.
file_name = "../GSBC_sec_filing_links.xlsx"

filing_data = Helper.fetch_filing_data(cik=CIK, headers=headers)
if filing_data is not None:
    # Write DataFrame to Excel file with auto-adjusting column widths
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        filing_data.to_excel(writer, index=False)
        worksheet = writer.sheets['Sheet1']
        for i, col in enumerate(filing_data.columns):
            # .max() is NaN for a column without rows; fall back to 0 so the
            # header length alone decides the width instead of passing NaN on.
            longest_cell = filing_data[col].astype(str).str.len().max()
            if pd.isna(longest_cell):
                longest_cell = 0
            column_len = max(int(longest_cell), len(str(col))) + 2
            worksheet.set_column(i, i, column_len)

    print(f"Data written to {file_name}")
else:
    # Nothing was fetched: the read below can only succeed if an earlier run
    # already left a workbook at file_name, so say so explicitly.
    print(f"No filing data returned for CIK {CIK}; reusing existing {file_name}")

filing_links = Helper.get_filing_links(file_name)
print("Filing link recived")

Data written to ../GSBC_sec_filing_links.xlsx
Filing link recived


In [247]:
# Show the column names, non-null counts and dtypes of the filing index
# returned by Helper.get_filing_links, as a schema check before it is used.
filing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   accessionNumber        36 non-null     object 
 1   filingDate             36 non-null     object 
 2   reportDate             36 non-null     object 
 3   acceptanceDateTime     36 non-null     object 
 4   act                    36 non-null     int64  
 5   form                   36 non-null     object 
 6   fileNumber             36 non-null     object 
 7   filmNumber             36 non-null     int64  
 8   items                  0 non-null      float64
 9   size                   36 non-null     int64  
 10  isXBRL                 36 non-null     int64  
 11  isInlineXBRL           36 non-null     int64  
 12  primaryDocument        36 non-null     object 
 13  primaryDocDescription  36 non-null     object 
 14  fileLink               36 non-null     object 
dtypes: float

In [248]:
# Re-render reportDate as "Month DD, YYYY" text (e.g. "December 31, 2020").
# strftime returns strings, so the column stays object dtype rather than
# datetime64; the date regexes used later in this notebook match this spelling.
filing_links['reportDate'] = pd.to_datetime(
    filing_links['reportDate']).dt.strftime("%B %d, %Y")
# The message previously ended in a dangling "and back to"; it now states
# only what the line above actually does.
print("Date was converted to '%B %d, %Y' format")
filing_links['reportDate'].info()

Date was converted to '%B %d, %Y' format 
<class 'pandas.core.series.Series'>
RangeIndex: 36 entries, 0 to 35
Series name: reportDate
Non-Null Count  Dtype 
--------------  ----- 
36 non-null     object
dtypes: object(1)
memory usage: 416.0+ bytes


In [249]:
# Rewriting the extraction function again

testing 1 response

In [None]:
# consolidated_schedule_regex = re.compile(
#     r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# for tag in content.find_all(text=consolidated_schedule_regex):
#     print("Tag:", tag)
#     print("Find_next:", tag.find_next())
#     date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
#     print("Next_date:", re.search(date_regex_pattern1, tag.find_next().text))
#     print("next:", tag.next)
#     print("Next next: ", tag.find_next().next.next.next.text)
#     print("next sib: ", tag.find_next_sibling())
#     print("\n")

In [None]:
# consolidated_schedule_regex = re.compile(
#     r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'

# for index, (url, reporting_date) in enumerate(zip(filing_links['url'], filing_links['Reporting date'])):
#     response = helper.get_response(url=url, headers=headers)
#     content = helper.get_content(response)
#     for tag in content.find_all(text=consolidated_schedule_regex):

#         print(reporting_date, "Tag:", tag)
#         # print("Find_next:", tag.find_next())
#         print(reporting_date, "Next_date:", re.search(
#             date_regex_pattern1, tag.text))
#         # print("next:", tag.next)
#         # print("Next next: ", tag.find_next().next.next.next.text)
#         # print("next sib: ", tag.find_next_sibling())
#         # print("\n")

In [None]:
# filing_links = filing_links.drop(
#     filing_links[filing_links['Reporting date'] == 'December 31, 2017'].index)

In [None]:
def clean_cell_value(x):
    """
    Clean a single scraped table cell.

    Non-string values are returned untouched. For strings, in this order:

    1. em dashes ('—') become '0',
    2. percent signs become a single space,
    3. each pair of consecutive spaces is removed,
    4. leading/trailing whitespace and zero-width spaces (U+200B) are
       stripped together, however they are interleaved,
    5. the result is NFKD-normalized (e.g. a non-breaking space becomes a
       plain space).

    Args:
        x: A cell value of any type.

    Returns:
        The cleaned string, or ``x`` itself when it is not a ``str``.
    """
    if not isinstance(x, str):
        return x

    # Order matters: '%' is turned into a space *before* space pairs are
    # removed, so "5 %" ends up as "5". (The former '' -> '' entry was a
    # no-op and has been dropped.)
    replacements = (('—', '0'), ('%', ' '), ('  ', ''))
    for old_char, new_char in replacements:
        x = x.replace(old_char, new_char)

    # str.strip() does not treat U+200B as whitespace, so chaining
    # .strip().strip('\u200b') left the inner spaces of "\u200b 12 \u200b"
    # in place. Strip both kinds of padding in one pass instead.
    x = re.sub(r'^[\s\u200b]+|[\s\u200b]+$', '', x)
    return unicodedata.normalize('NFKD', x)


def extract_tables(content, qtr_date) -> pd.DataFrame:
    """
    Collect every "Schedule(s) of Investments" table dated ``qtr_date``.

    The parsed document is scanned for text nodes matching the schedule
    heading regex. For each heading whose first "Month D, YYYY" date equals
    ``qtr_date`` (both NFKD-normalized), the next ``<table>`` in document
    order is parsed with pandas, cleaned cell by cell with
    ``clean_cell_value``, stripped of blank-only rows and added to the result.

    Args:
        content: Parsed document exposing ``find_all``; presumably the
            BeautifulSoup tree returned by Helper.get_content (TODO confirm).
        qtr_date: Reporting date text, e.g. ``'December 31, 2020'``.

    Returns:
        The matching tables stacked into one DataFrame, or ``None`` when no
        heading with a following table matched ``qtr_date``.
    """
    # Local import so the function does not rely on the notebook's import cell.
    from io import StringIO

    print("Currect file " + qtr_date)

    consolidated_schedule_regex = re.compile(
        r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    # Loop-invariant, so normalize once (headings often use U+00A0 in dates).
    qtr_date = unicodedata.normalize('NFKD', qtr_date)

    tables = []
    # `string=` is the current name of the deprecated `text=` keyword.
    for tag in content.find_all(string=consolidated_schedule_regex):
        try:
            date_match = re.search(date_regex_pattern, tag.text)
        except Exception as e:
            print(f'Could not find date on extract_tables() : {e}')
            # Skip this heading; falling through would reuse an unbound or
            # stale match from a previous iteration.
            continue
        if date_match is None:
            continue

        date_str = unicodedata.normalize('NFKD', date_match.group(1))
        if qtr_date != date_str:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Matching heading with no table after it in the document.
            continue

        # Wrap the markup in StringIO: newer pandas deprecates passing literal
        # HTML text to read_html, while file-like input works on all versions.
        new_table = pd.read_html(
            StringIO(html_table.prettify()), keep_default_na=False, skiprows=0, flavor='bs4')[0]
        new_table = new_table.applymap(clean_cell_value)
        # Blank cells and lone '$' cells carry no data; mark them missing so
        # fully empty rows can be dropped.
        new_table = new_table.replace(
            r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan, regex=True)
        new_table = new_table.dropna(how='all', axis=0)
        tables.append(new_table)

    if not tables:
        print(f'No schedule of investments table found for {qtr_date}')
        return None

    # One table is returned with its own row labels; several are stacked with
    # a fresh index. Concatenating once avoids re-copying rows per heading.
    if len(tables) == 1:
        master_table = tables[0]
    else:
        master_table = pd.concat(tables, ignore_index=True)

    try:
        master_table = master_table.applymap(
            lambda x: x.strip().strip(u'\u200b') if type(x) == str else x)
        # Also blank out cells holding only a stray ')' left by split
        # negative-number formatting.
        master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
            r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    except Exception as e:
        # Best effort: return the table as collected if the final pass fails.
        print(f'{e}')
    return master_table

In [None]:
# NOTE(review): absolute, machine-specific path kept from the original run;
# consider deriving it from a configurable output directory.
path = '/Users/fuadhassan/Desktop/BDC_RA/GSBD/Master_tables_GSBD_Investment.xlsx'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../MT_csv_files', exist_ok=True)

# The context manager saves and closes the workbook on exit, including when a
# filing raises midway, so the per-iteration writer.book.save(path) is gone.
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    # Column names follow filing_links.info(): the frame has 'fileLink' and
    # 'reportDate'; it has no 'url' or 'Reporting date' column.
    for url, reporting_date in zip(filing_links['fileLink'], filing_links['reportDate']):
        try:
            content = Helper.get_content(
                Helper.get_response(url=url, headers=headers))
        except Exception as e:
            print(f'failed to get the content: {e}')
            # Skip: continuing would parse the previous filing's content (or
            # hit an undefined name) under this reporting date.
            continue

        master_table = extract_tables(content, reporting_date)
        if master_table is None:
            # extract_tables found nothing for this date; nothing to write.
            continue

        # Commas are dropped so the date can be used as a file and sheet name.
        sheet_name = reporting_date.replace(',', '')
        master_table.to_csv('../MT_csv_files/' + sheet_name + '.csv')
        master_table.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# December 31, 2017

In [151]:
# Pull a single filing for manual inspection.
# NOTE(review): 'December 31, 2020' presumably selects the filing by its
# report date within filing_links — confirm against Helper.get_file_url.
url = Helper.get_file_url('December 31, 2020', filing_links)
# Download the document and parse it; `content` is the object the
# heading-search loop further down calls find_all on.
content = Helper.get_content(Helper.get_response(url=url, headers=headers))

In [160]:
# Debug probe: print every text node that looks like a "Schedule(s) of
# Investments" heading, the first date found in it, and its neighbours.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
# `string=` is the current name of the deprecated `text=` keyword of find_all.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag.text)
    # print("Find_next:", tag.find_next())
    print("Next_date:", re.search(date_regex_pattern, tag.text))
    # print("next:", tag.next)
    # find_next() returns None when nothing follows the node, so read .text
    # defensively instead of raising AttributeError mid-probe.
    print("Next next: ", getattr(tag.find_next(), 'text', None))
    print("next sib: ", tag.find_next_sibling())
    # print("\n")

Tag: Consolidated Schedules of Investments as of December 31, 2020 and 2019
Next_date: <re.Match object; span=(44, 61), match='December\xa031, 2020'>
Next next:  
next sib:  None
Tag: We have audited the accompanying consolidated statements of assets and liabilities, including the consolidated schedules of investments of Goldman Sachs BDC, Inc. and its subsidiaries (the “Company”) as of December 31, 2020 and 2019, and the related consolidated statements of operations, of changes in net assets and of cash flows for each of the three years in the period ended December 31, 2020, including the related notes (collectively referred to as the “consolidated financial statements”). We also have audited the Company's internal control over financial reporting as of December 31, 2020, based on criteria established in 
Next_date: <re.Match object; span=(206, 223), match='December 31, 2020'>
Next next:  Internal Control - Integrated Framework
next sib:  <font>Internal Control - Integrated Framework<

  for tag in content.find_all(text=consolidated_schedule_regex):
