In [47]:
import os
import re
import unicodedata
import webbrowser
from datetime import datetime
from html import unescape
from io import StringIO

import html5lib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import workbook

from Helper_package import Helper

In [None]:
# Headers handed to every Helper call below.
# NOTE(review): presumably sent as HTTP request headers to SEC EDGAR - confirm in Helper.
headers = {'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'}
# Identifier passed to Helper.fetch_filing_data as `cik`.
CIK = '0001572694'

filing_data = Helper.fetch_filing_data(cik=CIK, headers=headers)
if filing_data is not None:
    # Write DataFrame to Excel file with auto-adjusting column widths
    file_name = "../GSBC_sec_filing_links.xlsx"
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        filing_data.to_excel(writer, index=False)
        # to_excel was called without sheet_name, so the sheet is looked up
        # under pandas' default name.
        worksheet = writer.sheets['Sheet1']
        for position, column_name in enumerate(filing_data.columns):
            # Width = longest rendered cell or the header, whichever is
            # larger, plus two characters of padding.
            longest_cell = filing_data[column_name].astype(str).str.len().max()
            width = max(longest_cell, len(column_name)) + 2
            worksheet.set_column(position, position, width)

    print(f"Data written to {file_name}")

# This runs even when filing_data is None, in which case the workbook on disk
# is whatever an earlier run left there.
filing_links = Helper.get_filing_links('../GSBC_sec_filing_links.xlsx')
print("Filing link recived")

In [48]:
# Re-create the request headers and obtain `filing_links` from the workbook
# path without calling Helper.fetch_filing_data again.
# NOTE(review): Helper.get_filing_links presumably reads the workbook at the
# given path - confirm in Helper.
headers = {
    'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'
}
filing_links = Helper.get_filing_links('../GSBC_sec_filing_links.xlsx')
print("Filing link recived")

Filing link recived


In [49]:
# Parse reportDate and overwrite it with its "%B %d, %Y" text form
# (e.g. "September 30, 2023"). The column holds strings afterwards, not
# datetimes, which is why info() reports dtype `object`.
filing_links['reportDate'] = pd.to_datetime(
    filing_links['reportDate']).dt.strftime("%B %d, %Y")
print("Date was converted to '%B %d, %Y' format and back to")
# Print the dtype / non-null summary of the converted column.
filing_links['reportDate'].info()

Date was converted to '%B %d, %Y' format and back to
<class 'pandas.core.series.Series'>
RangeIndex: 36 entries, 0 to 35
Series name: reportDate
Non-Null Count  Dtype 
--------------  ----- 
36 non-null     object
dtypes: object(1)
memory usage: 416.0+ bytes


In [None]:
# Print the column names, non-null counts and dtypes of filing_links.
filing_links.info()

In [None]:
def check_links_validity(filing_links, request_headers=None, timeout=30):
    '''
        Checks the validity of each file link in the DataFrame.

        Sends one HTTP HEAD request per entry of ``filing_links['fileLink']``
        and prints a report: every link whose request raised (connection
        error, timeout, or an error status via ``raise_for_status``) is listed
        with its error message, and "All Valid Links" is printed when no
        link failed.

        Parameters
        ----------
        filing_links : pandas.DataFrame
            Must contain a ``fileLink`` column holding the URLs to probe.
        request_headers : dict, optional
            HTTP headers sent with every request. When omitted, the
            notebook-level ``headers`` dict is used, as before.
        timeout : float or tuple, optional
            Forwarded to ``requests.head``. Without a timeout a single
            unresponsive server would block the notebook indefinitely.

        Returns
        -------
        None
            The result is reported through ``print`` only.
    '''
    if request_headers is None:
        # Backward-compatible default: the headers defined at notebook level.
        request_headers = headers

    valid_links = []
    invalid_links = []
    # Only the fileLink column is needed, so iterate over it directly.
    for link in filing_links['fileLink']:
        try:
            response = requests.head(
                link, headers=request_headers, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: any failure marks the link invalid and is
            # reported below rather than aborting the whole check.
            invalid_links.append((link, str(e)))
        else:
            valid_links.append(link)

    if invalid_links:
        print("\nInvalid Links:")
        for link, error_message in invalid_links:
            print(f"{link}: {error_message}")

    if len(valid_links) == filing_links.shape[0]:
        print("All Valid Links")


# Probe every fileLink in filing_links and print the validity report.
check_links_validity(filing_links)

In [None]:
# Matches any text node that contains "SCHEDULE OF INVESTMENTS" or
# "SCHEDULES OF INVESTMENTS" (case-insensitive), with arbitrary text around it.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Matches dates written like "September 30, 2023".
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'


def extract_tables(content, report_Date) -> pd.DataFrame:
    """Collect the schedule-of-investments tables dated ``report_Date``.

    Every text node of ``content`` matching ``consolidated_schedule_regex`` is
    inspected. When the node's text contains exactly one date and that date
    equals ``report_Date`` (both compared after NFKD normalisation), the next
    ``<table>`` after the node is parsed with ``pandas.read_html``, rows that
    are entirely NaN are dropped, and the result is kept. Cell values are not
    cleaned in any other way.

    Parameters
    ----------
    content
        Parsed document exposing ``findAll(string=...)``; the returned nodes
        must offer ``.text`` and ``.find_next('table')``.
        NOTE(review): presumably a BeautifulSoup object - confirm in Helper.
    report_Date : str
        Reporting date in the same textual form as it appears in the
        headings, e.g. ``"September 30, 2023"``.

    Returns
    -------
    pandas.DataFrame
        All matching tables stacked with a fresh 0..n-1 index, or an empty
        DataFrame when nothing matched.
    """
    print(f"Extractiong File: {report_Date}")
    # Loop-invariant, so normalise the target date once.
    normalized_report_date = unicodedata.normalize('NFKD', report_Date)

    matched_tables = []
    for tag in content.findAll(string=consolidated_schedule_regex):
        try:
            date_matches = re.findall(date_regex_pattern, tag.text)
        except Exception as e:
            print(f'Could not find date on extract_tables() : {e}')
            # Skip this heading; otherwise date_matches would be unbound or
            # left over from the previous heading.
            continue

        # Headings with no date, or with several, are ambiguous: skip them.
        if len(date_matches) != 1:
            continue
        if unicodedata.normalize('NFKD', date_matches[0]) != normalized_report_date:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # A heading with no table after it has nothing to parse.
            continue

        # read_html expects a file-like object; passing literal HTML text is
        # deprecated in recent pandas releases.
        new_table = pd.read_html(
            StringIO(html_table.prettify()), na_values="No value")[0]
        new_table = new_table.dropna(how='all', axis=0)
        matched_tables.append(new_table)

    if not matched_tables:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame per table.
    return pd.concat(matched_tables, ignore_index=True)

In [None]:
# Build one master table per filing and store it twice: as a CSV file under
# ../MT_csv_files_2/ and as a sheet (named after the reporting date) in a
# single workbook.
# NOTE(review): absolute, user-specific output location - adjust before
# running on another machine.
path = '/Users/fuadhassan/Desktop/BDC_RA/GSBD/New_Master_tables_GSBD_Investment.xlsx'
# The context manager saves and closes the workbook once, even when the loop
# raises, so no per-iteration save or explicit close() is needed.
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    for url, reporting_date in zip(filing_links['fileLink'], filing_links['reportDate']):
        try:
            content = Helper.get_content(
                Helper.get_response(url=url, headers=headers))
        except Exception as e:
            print(f'failed to get the content: {e}')
            # Without fresh content the previous filing's data would be
            # written under this filing's date, so skip it.
            continue

        master_table = extract_tables(content, reporting_date)
        # Drop the comma from dates such as "September 30, 2023".
        sheet_label = reporting_date.replace(',', '')
        master_table.to_csv('../MT_csv_files_2/' + sheet_label + '.csv')
        master_table.to_excel(writer, sheet_name=sheet_label, index=False)

In [50]:
# Use the filing at position 1 of filing_links as a single-file test case.
test_file = filing_links.iloc[1]
print(test_file)
# NOTE: despite its name, `content` holds the object returned by
# Helper.get_response; later cells read its `.content` attribute.
content = Helper.get_response(
    url=test_file['fileLink'], headers=headers)

accessionNumber                                       0000950170-23-060336
filingDate                                                      2023-11-07
reportDate                                              September 30, 2023
acceptanceDateTime                                2023-11-07T16:40:27.000Z
act                                                                     34
form                                                                  10-Q
fileNumber                                                       814-00998
filmNumber                                                       231384576
items                                                                  NaN
size                                                              48188974
isXBRL                                                                   1
isInlineXBRL                                                             1
primaryDocument                                          gsbd-20230930.htm
primaryDocDescription    

In [11]:
# Show the URL of the filing selected for testing.
test_file['fileLink']

'https://www.sec.gov/Archives/edgar/data/0001572694/000095017023060336/gsbd-20230930.htm'

In [51]:
# Display every <table> element found in the parsed filing as one string.
str(Helper.parse_and_trim(content.content).find_all('table'))

'[<table>\n\n<tr>\n<td><p><span><ix:nonnumeric><span>Delaware</span></ix:nonnumeric></span></p></td>\n<td><p><span><ix:nonnumeric><span>46-2176593</span></ix:nonnumeric></span></p></td>\n</tr>\n<tr>\n<td><p><span>(State or Other Jurisdiction of</span></p><p><span>Incorporation or Organization)</span></p></td>\n<td><p><span>(I.R.S. Employer</span></p><p><span>Identification No.)</span></p></td>\n</tr>\n\n<tr>\n<td><p><span><ix:nonnumeric><span>200 West Street</span></ix:nonnumeric></span><span>, </span><span><ix:nonnumeric><span>New York</span></ix:nonnumeric></span><span>, </span><span><ix:nonnumeric><span>New York</span></ix:nonnumeric></span></p></td>\n<td><p><span><ix:nonnumeric><span>10282</span></ix:nonnumeric></span></p></td>\n</tr>\n<tr>\n<td><p><span>(Address of Principal Executive Offices)</span></p></td>\n<td><p><span>(Zip Code)</span></p></td>\n</tr>\n</table>, <table>\n\n<tr>\n<td><p><span>Title of each class</span><span> </span></p></td>\n<td><p><span> </span></p></td>\n<t

In [52]:
parsed_content = Helper.parse_and_trim(content.content)

# Find all tables
tables = parsed_content.find_all('table')

# Convert tables to string representation
# Only the table at index 16 is dumped; the prettify() output shown further
# down in this notebook is for this same table and starts with the
# Investment / Industry header row.
tables_str = str(tables[16])

# Specify the file path where you want to write the output
file_path = "output.txt"
# Write the string representation of tables to the file
# An explicit encoding keeps the write independent of the platform default,
# which may not be able to encode every character in the filing's HTML.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(tables_str)

print("Output written to", file_path)

Output written to output.txt


In [53]:
# Re-parse the filing and keep the table at index 16 for the cells below.
tables = Helper.parse_and_trim(content.content).find_all('table')
table = tables[16]

In [54]:
# Show the selected table's HTML as formatted by prettify().
table.prettify()

'<table>\n <tr>\n  <td>\n   <p>\n    <span>\n     Investment\n    </span>\n    <span>\n     (1)(4)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Industry\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Interest\n     Rate\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Initial\n     Acquisition\n     Date\n    </span>\n    <span>\n     (14)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Shares\n    </span>\n    <span>\n     (3)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Cost\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Fair\n     Value\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Footnotes\n    </span>\n   </p>\n  </td>\n </tr>\n <tr>\n  <td>\n   <p>\n    <span>\n     Preferred Sto

In [None]:
file_path = "output2.txt"
# Write the string representation of tables to the file
# An explicit encoding keeps the write independent of the platform default,
# which may not be able to encode every character in the filing's HTML.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(table.prettify())

In [55]:
# Parse the selected table into DataFrames. read_html expects a file-like
# object; passing literal HTML text is deprecated in recent pandas releases,
# hence the StringIO wrapper.
PD_TABLE = pd.read_html(StringIO(table.prettify()))
# read_html returns a list of frames; show the first one.
PD_TABLE[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Investment (1)(4),Industry,Interest Rate,Initial Acquisition Date (14),Shares (3),,Cost,,Fair Value,,Footnotes,,,
1,Preferred Stock - 2.80 %,,,,,,,,,,,,,
2,"Broadway Parent, LLC",Diversified Financial Services,,01/25/21,,4000000,,$,4019,,$,8000,,(6) (7) (12)
3,"CloudBees, Inc.",Software,,11/24/21,,1152957,,,12899,,,15507,,(6) (7) (12)
4,Foundation Software,Construction & Engineering,,08/31/20,,22,,,21,,,29,,(6) (7) (12)
5,"Governmentjobs.com, Inc. (dba NeoGov)",Software,,12/02/21,,10597,,,10332,,,12489,,(6) (7) (12)
6,Kawa Solar Holdings Limited,Construction & Engineering,8.00 % PIK,10/25/16,,85214,,,778,,,—,,(5) (7) (9) (11)
7,"MedeAnalytics, Inc.",Health Care Technology,,10/09/20,,—,,,—,,,—,,(6) (7) (11) (12) (15)
8,"Wine.com, LLC",Beverages,,03/03/21,,124040,,,3067,,,—,,(6) (7) (12)
9,"Wine.com, LLC",Beverages,,11/14/18,,535226,,,8225,,,—,,(6) (7) (12)


In [33]:
# Show the selected table's HTML as formatted by prettify().
table.prettify()

'<table>\n <tr>\n  <td>\n   <p>\n    <span>\n     Investment\n    </span>\n    <span>\n     (1)(4)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Industry\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Interest\n     Rate\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Initial\n     Acquisition\n     Date\n    </span>\n    <span>\n     (14)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Shares\n    </span>\n    <span>\n     (3)\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Cost\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Fair\n     Value\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     Footnotes\n    </span>\n   </p>\n  </td>\n </tr>\n <tr>\n  <td>\n   <p>\n    <span>\n     Preferred Sto

In [34]:
# Print each <tr> row of the selected table.
for tr in table.findAll('tr'):
    print(tr)

<tr>
<td><p><span>Investment </span><span>(1)(4)</span></p></td>
<td><p><span>Industry</span></p></td>
<td><p><span>InterestRate</span></p></td>
<td><p><span>InitialAcquisitionDate </span><span>(14)</span></p></td>
<td><p><span>Shares</span><span>(3)</span></p></td>
<td><p><span> </span></p></td>
<td><p><span>Cost</span></p></td>
<td><p><span> </span></p></td>
<td><p><span>FairValue</span></p></td>
<td><p><span> </span></p></td>
<td><p><span>Footnotes</span></p></td>
</tr>
<tr>
<td><p><span>Preferred Stock  - </span><span><ix:nonfraction>2.80</ix:nonfraction></span><span>%</span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
<td><p><span> </span></p></td>
</tr>
<tr>
<td><p><span>Broadway Parent, LLC</span></p></td>
<td><p><span>Diversified Financia