In [35]:
import pandas as pd
import numpy as np
import requests
import html5lib
from bs4 import BeautifulSoup
from datetime import datetime
from openpyxl import workbook
import re
import os
import webbrowser
import unicodedata
from Helper_package import Helper
from html import unescape
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Request headers handed to the Helper calls, and the CIK to look up.
headers = {'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'}
CIK = '0001572694'

filing_data = Helper.fetch_filing_data(cik=CIK, headers=headers)
if filing_data is not None:
    file_name = "../GSBC_sec_filing_links.xlsx"
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        filing_data.to_excel(writer, index=False)
        worksheet = writer.sheets['Sheet1']
        # Size every column to its longest cell (or its header), plus padding.
        for i, col in enumerate(filing_data.columns):
            longest_cell = filing_data[col].astype(str).str.len().max()
            column_len = max(longest_cell, len(col)) + 2
            worksheet.set_column(i, i, column_len)
    print(f"Data written to {file_name}")

# The workbook is read back even when nothing was fetched above.
filing_links = Helper.get_filing_links('../GSBC_sec_filing_links.xlsx')
print("Filing link recived")

In [36]:
# Same request headers as the fetch cell; lets this cell run without re-fetching.
headers = {'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'}
filing_links = Helper.get_filing_links('../GSBC_sec_filing_links.xlsx')
print("Filing link recived")

Filing link recived


In [37]:
# Re-express reportDate as text such as "September 30, 2023" so it can be
# compared with the dates found in the filing headings.
report_dates = pd.to_datetime(filing_links['reportDate'])
filing_links['reportDate'] = report_dates.dt.strftime("%B %d, %Y")
print("Date was converted to '%B %d, %Y' format and back to")
filing_links['reportDate'].info()

Date was converted to '%B %d, %Y' format and back to
<class 'pandas.core.series.Series'>
RangeIndex: 36 entries, 0 to 35
Series name: reportDate
Non-Null Count  Dtype 
--------------  ----- 
36 non-null     object
dtypes: object(1)
memory usage: 416.0+ bytes


In [None]:
# Print the pandas summary of the whole filing-links frame.
filing_links.info()

In [None]:
def check_links_validity(filing_links, request_headers=None, timeout=10):
    '''
        Checks the validity of each file link in the DataFrame.

        Sends a HEAD request to every value in the ``fileLink`` column and
        prints the links that failed together with the error message. Prints
        "All Valid Links" when every link succeeded. Nothing is returned.

        Args:
            filing_links: DataFrame with a ``fileLink`` column.
            request_headers: HTTP headers for the requests. Defaults to the
                notebook-level ``headers`` dict when omitted.
            timeout: Seconds to wait for each response. Without a timeout a
                stalled connection would block the notebook indefinitely.
    '''
    if request_headers is None:
        # Fall back to the notebook-level headers, as the original code did.
        request_headers = headers

    valid_links = []
    invalid_links = []
    for link in filing_links['fileLink']:
        try:
            response = requests.head(
                link, headers=request_headers, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            # Best-effort report: record any failure and keep checking.
            invalid_links.append((link, str(e)))
        else:
            valid_links.append(link)

    if invalid_links:
        print("\nInvalid Links:")
        for link, error_message in invalid_links:
            print(f"{link}: {error_message}")

    if len(valid_links) == filing_links.shape[0]:
        print("All Valid Links")


# Check every filing link; the outcome is printed, nothing is returned.
check_links_validity(filing_links)

In [None]:
# Matches heading text that contains "SCHEDULE OF INVESTMENTS" or
# "SCHEDULES OF INVESTMENTS", case-insensitively.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Matches dates written like "September 30, 2023".
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'


def extract_tables(content, report_Date) -> pd.DataFrame:
    """Collect every schedule-of-investments table dated ``report_Date``.

    For each string in ``content`` matching ``consolidated_schedule_regex``,
    the heading must contain exactly one date and that date must equal
    ``report_Date`` after NFKD normalisation. The first ``<table>`` after such
    a heading is parsed with ``pd.read_html``, fully empty rows are dropped,
    and all parsed tables are stacked into one frame.

    Args:
        content: Parsed document exposing ``find_all(string=...)``; each hit
            must expose ``.text`` and ``.find_next('table')``.
        report_Date: Date text, e.g. ``"September 30, 2023"``.

    Returns:
        The concatenated tables, or an empty DataFrame when nothing matched.
    """
    from io import StringIO  # read_html deprecates literal HTML strings

    print(f"Extractiong File: {report_Date}")
    wanted_date = unicodedata.normalize('NFKD', report_Date)
    collected = []

    for tag in content.find_all(string=consolidated_schedule_regex):
        # Recomputed per heading so a previous heading's dates cannot leak in.
        date_matches = re.findall(date_regex_pattern, tag.text)
        if len(date_matches) != 1:
            continue
        if unicodedata.normalize('NFKD', date_matches[0]) != wanted_date:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Heading without a following table: nothing to parse.
            continue

        new_table = pd.read_html(
            StringIO(html_table.prettify()), na_values="No value")[0]
        # NOTE(review): the original chained replace('(', '-').replace(')', None)
        # here but discarded the result, so it never altered the data. It is
        # dropped rather than "fixed" because the intended cleaning is unclear.
        new_table = new_table.dropna(how='all', axis=0)
        collected.append(new_table)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)

In [None]:
# Output locations.
# NOTE(review): `path` is an absolute, machine-specific location; consider
# moving it to a configurable directory.
path = '/Users/fuadhassan/Desktop/BDC_RA/GSBD/New_Master_tables_GSBD_Investment.xlsx'
csv_dir = '../MT_csv_files_2/'
os.makedirs(csv_dir, exist_ok=True)  # to_csv does not create missing folders

# The context manager saves and closes the workbook once, even if the loop
# raises part-way through, so no per-iteration save is needed.
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    for url, reporting_date in zip(filing_links['fileLink'], filing_links['reportDate']):
        try:
            content = Helper.get_content(
                Helper.get_response(url=url, headers=headers))
        except Exception as e:
            print(f'failed to get the content: {e}')
            # Skip this filing; otherwise the previous filing's content (or an
            # undefined name on the first pass) would be written under this date.
            continue

        master_table = extract_tables(content, reporting_date)
        sheet_name = reporting_date.replace(',', '')
        master_table.to_csv(csv_dir + sheet_name + '.csv')
        master_table.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Pick the filing at positional index 1 and request it for manual inspection.
test_file = filing_links.iloc[1]
print(test_file)
# NOTE: `content` here is whatever Helper.get_response returns (later cells read
# its `.content` attribute), not the Helper.get_content result that the
# bulk-export loop stores under the same name.
content = Helper.get_response(
    url=test_file['fileLink'], headers=headers)

In [None]:
# Display the URL of the filing under inspection.
test_file['fileLink']

In [None]:
# Display every <table> of the parsed filing as one string (can be very large).
str(Helper.parse_and_trim(content.content).find_all('table'))

In [57]:
parsed_content = Helper.parse_and_trim(content.content)

# Find all tables
tables = parsed_content.find_all('table')

# String form of the table at position 8 (hand-picked index).
tables_str = str(tables[8])

# Specify the file path where you want to write the output
file_path = "output.txt"
# The filings contain non-ASCII characters (e.g. '—', '€'), so write UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(tables_str)

print("Output written to", file_path)

Output written to output.txt


In [50]:
# Re-parse the response and keep the table at position 5 (hand-picked index)
# as `table`, which the following inspection cells work on.
tables = Helper.parse_and_trim(content.content).find_all('table')
table = tables[5]

In [51]:
# Display the indented HTML of the selected table.
table.prettify()

'<table>\n <tr>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     For the Three Months Ended\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     For the Nine Months Ended\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n </tr>\n <tr>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     September 30,\n     2023\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n     September 30,\n     2022\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\n    <span>\n    </span>\n   </p>\n  </td>\n  <td>\n   <p>\

In [52]:
file_path = "output2.txt"
# Write the prettified table to the file. UTF-8 is set explicitly because the
# filings contain non-ASCII characters (e.g. '—', '€') that a platform default
# encoding may not be able to represent.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(table.prettify())

In [None]:
# Take the first <tr> of the selected table.
first_row = table.find_all('tr')[0]

# Convert it to a prettified string
first_row_html = first_row.prettify()
# Re-parse that string with the built-in html.parser.
# NOTE(review): ExtractTable (defined later in this notebook) reads the <td>
# cells straight from the row without this round-trip — presumably equivalent;
# confirm before simplifying.
first_row_soup = BeautifulSoup(first_row_html, 'html.parser')

# Extract the text content from each cell in the first row, skipping empty cells
header = [cell.get_text(strip=True) for cell in first_row_soup.find_all(
    'td') if cell.get_text(strip=True)]

# Display the labels; a later cell assigns them as the DataFrame's columns.
header

In [None]:
# Parse the selected table, skipping its first row and keeping blanks as ''.
PD_TABLE = pd.read_html(
    table.prettify(), keep_default_na=False, skiprows=1)

# Drop the hand-picked rows, turn blanks into NaN, then discard columns that
# are empty throughout.
PD_TABLE[0] = (
    PD_TABLE[0]
    .drop([10, 33, 39, 40], axis=0)
    .replace('', np.nan)
    .dropna(axis=1, how='all')
)
PD_TABLE[0].columns = header
PD_TABLE[0]

In [None]:
# Display the selected table's HTML again (same expression as an earlier cell).
table.prettify()

In [None]:
# Print each <tr> of the selected table on its own.
for tr in table.findAll('tr'):
    print(tr)

In [None]:
# candel
# NOTE(review): the marker above presumably means "can delete"; confirm.
# For each <span> whose text is exactly '$', prefix the next ix:nonfraction
# value with '$' and blank the span. This mutates `table` in place, so
# re-running the cell is not idempotent.
for tr in table.find_all('span', string='$'):
    nonfraction_tag = tr.find_next('ix:nonfraction')
    if nonfraction_tag is None:
        # A '$' with no value after it: leave it instead of raising.
        continue
    nonfraction_tag.string = f"${nonfraction_tag.text}"
    tr.string.replace_with('')

In [None]:
# Remove commas from the text of every ix:nonfraction element, in place.
# `ix_nonfraction_elements` is reused by the example loop in the next cell.
ix_nonfraction_elements = table.find_all('ix:nonfraction')
for element in ix_nonfraction_elements:
    element.string = element.text.replace(',', '')

In [None]:
def convert_to_numeric(text):
    """
    Convert text to either integer or float if possible, else return the original text.

    A leading "(" marks a negative number, so "(123)" and "(123" both become
    -123.0. Strings made only of digits become ``int``; anything else that
    ``float`` accepts becomes ``float``.

    Args:
        text: The string to convert.

    Returns:
        ``int`` or ``float`` when the text is numeric, otherwise the input with
        surrounding whitespace stripped and parentheses left intact.
    """
    stripped = text.strip()

    candidate = stripped
    if candidate.startswith('('):
        # Drop the closing parenthesis too; previously only "(" was rewritten,
        # so "(123)" became "-123)" and never converted.
        inner = candidate[1:-1] if candidate.endswith(')') else candidate[1:]
        candidate = '-' + inner.strip()

    try:
        # isdigit() accepts characters int() rejects (e.g. "²"); the ValueError
        # from that case is handled below rather than escaping.
        return int(candidate) if candidate.isdigit() else float(candidate)
    except ValueError:
        # Not a number: return the untouched text, not the rewritten candidate.
        return stripped


# Example usage: print the converted value of every ix:nonfraction element
# collected by the previous cell.
for element in ix_nonfraction_elements:
    numeric_value = convert_to_numeric(element.text)
    print(numeric_value)

In [38]:
# Request the filing at positional index 1 again and parse it; `parsed_content`
# is the input to the ExtractTable call further down.
test_file = filing_links.iloc[1]
print(test_file)
content = Helper.get_response(
    url=test_file['fileLink'], headers=headers)
parsed_content = Helper.parse_and_trim(content.content)

accessionNumber                                       0000950170-23-060336
filingDate                                                      2023-11-07
reportDate                                              September 30, 2023
acceptanceDateTime                                2023-11-07T16:40:27.000Z
act                                                                     34
form                                                                  10-Q
fileNumber                                                       814-00998
filmNumber                                                       231384576
items                                                                  NaN
size                                                              48188974
isXBRL                                                                   1
isInlineXBRL                                                             1
primaryDocument                                          gsbd-20230930.htm
primaryDocDescription    

In [48]:

# Heading matcher: case-insensitive, hits text containing "SCHEDULE OF
# INVESTMENTS" or "SCHEDULES OF INVESTMENTS". Same pattern as the one defined
# next to extract_tables earlier in the notebook.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Dates written like "September 30, 2023".
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'


def convert_to_numeric(table):
    """
    Normalise the text of every ``ix:nonfraction`` element in ``table``.

    Commas and surrounding whitespace are removed. Text made only of digits is
    rewritten through ``int``, other text that ``float`` accepts is rewritten
    through ``float`` (so "12.50" becomes "12.5"), and everything else keeps
    its comma-stripped text. The elements are modified in place.

    NOTE: this reuses the name of the text-based ``convert_to_numeric`` defined
    in an earlier cell and replaces it once this cell has run.

    Args:
    - table: object exposing ``find_all('ix:nonfraction')`` whose results have
      a readable ``.text`` and an assignable ``.string``

    Returns:
    - The same ``table`` object, after modification
    """
    for element in table.find_all('ix:nonfraction'):
        text = element.text.replace(',', '').strip()

        try:
            # isdigit() accepts characters int() rejects (e.g. "²"); treat that
            # ValueError like any other non-numeric text instead of crashing.
            converted_value = int(text) if text.isdigit() else float(text)
        except ValueError:
            converted_value = text

        # Replace the text content of the element with the converted value
        element.string = str(converted_value)

    return table


def convert_currency_symbols(table, currency_symbols):
    """
    Move currency symbols out of their own <span> and onto the value that follows.

    For every <span> whose text is exactly one of ``currency_symbols``, the
    next ``ix:nonfraction`` element gets the symbol and a space prepended to
    its text, and the span's text is replaced with an empty string. ``table``
    is modified in place.

    Args:
    - table: BeautifulSoup object representing the HTML table
    - currency_symbols: List of currency symbols to be converted

    Returns:
    - Modified BeautifulSoup object with converted currency symbols
    """
    for currency_symbol in currency_symbols:
        for symbol_span in table.find_all('span', string=currency_symbol):
            # NOTE(review): find_next is not limited to the same row — presumably
            # a symbol without its own value could attach to a later one; verify.
            nonfraction_tag = symbol_span.find_next('ix:nonfraction')
            if nonfraction_tag is None:
                # No value after this symbol: leave the span as it is.
                continue
            nonfraction_tag.string = f"{currency_symbol} {nonfraction_tag.text}"
            symbol_span.string.replace_with('')

    return table


def drop_if_contain(pattern, df):
    """
    Drop every row of ``df`` in which any string cell matches ``pattern``.

    Matching is case-insensitive and uses regex search semantics. Non-string
    cells (numbers, NaN) never match, so frames or rows with only numeric
    values are handled instead of raising on the ``.str`` accessor.

    Args:
    - pattern: regular-expression string to search for
    - df: DataFrame to filter

    Returns:
    - DataFrame holding the non-matching rows, original index preserved
    """
    compiled = re.compile(pattern, flags=re.IGNORECASE)

    def _cell_matches(value):
        # Only text can match; this is what keeps numeric cells from raising.
        return isinstance(value, str) and compiled.search(value) is not None

    matching_rows = df.apply(
        lambda column: column.map(_cell_matches)).any(axis=1).astype(bool)
    return df[~matching_rows]


def ExtractTable(content, report_Date) -> pd.DataFrame:
    """Collect the schedule-of-investments tables dated ``report_Date``.

    For each string in ``content`` matching ``consolidated_schedule_regex``,
    the heading must contain exactly one date and that date must equal
    ``report_Date`` after NFKD normalisation. The first ``<table>`` after the
    heading is used only if its first row has a non-empty cell reading exactly
    ``Cost``. Such a table is cleaned with ``convert_to_numeric`` and
    ``convert_currency_symbols``, parsed with ``pd.read_html`` (first row
    skipped), stripped of rows where a cell starts with "total", and appended
    to the result.

    Args:
        content: Parsed document exposing ``find_all(string=...)``.
        report_Date: Date text, e.g. ``"September 30, 2023"``.

    Returns:
        The concatenated tables, or an empty DataFrame when nothing matched.
    """
    from io import StringIO  # read_html deprecates literal HTML strings

    print(f"Extractiong File: {report_Date}")
    wanted_date = unicodedata.normalize('NFKD', report_Date)
    collected = []

    for tag in content.find_all(string=consolidated_schedule_regex):
        # Recomputed per heading so a previous heading's dates cannot leak in.
        date_matches = re.findall(date_regex_pattern, tag.text)
        if len(date_matches) != 1:
            continue
        if unicodedata.normalize('NFKD', date_matches[0]) != wanted_date:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            continue
        rows = html_table.find_all('tr')
        if not rows:
            continue

        # Labels of the first row. Named `header_cells` so the notebook-level
        # `headers` (HTTP request headers) is not shadowed.
        header_cells = [
            label
            for label in (cell.get_text(strip=True)
                          for cell in rows[0].find_all('td'))
            if label
        ]
        if 'Cost' not in header_cells:
            continue

        # Both helpers modify the table in place before it is parsed.
        convert_to_numeric(html_table)
        currency_symbols = ['$', 'CAD', 'GBP', '€']
        html_table = convert_currency_symbols(html_table, currency_symbols)

        new_table = pd.read_html(
            StringIO(html_table.prettify()), skiprows=1)[0]
        new_table = drop_if_contain(r'^([Tt]otal)', new_table)
        new_table = new_table.replace('', np.nan)
        collected.append(new_table)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)


# Run the extraction on the filing under inspection and save the result to
# test.xlsx in the working directory.
data = ExtractTable(parsed_content, test_file['reportDate'])
data.to_excel('test.xlsx')

Extractiong File: September 30, 2023


In [49]:
# Display the extracted frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1st Lien/Senior Secured Debt - 196.24 %,,,,,,,,,,,,,,,,,
1,1272775 B.C. LTD. (dba Everest Clinical Research),Professional Services,11.54 %,,S + 6.0 %,11/06/26,,,$ 9171,,,$ 9092,,,$ 9056,,(5) (6) (7),
2,1272775 B.C. LTD. (dba Everest Clinical Research),Professional Services,14.5 %,,P + 6.0 %,11/06/26,,,919,,,874,,,871,,(5) (6) (7) (8),
3,1272775 B.C. LTD. (dba Everest Clinical Research),Professional Services,12.2 %,,CDN P + 4.75 %,11/06/26,,,CAD 454,,,339,,,330,,(5) (6) (7),
4,"3SI Security Systems, Inc.",Commercial Services & Supplies,12.05 %,,S + 6.5 %,12/16/26,,,13250,,,13122,,,12554,,(7),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,Warrants - 0.02 %,,,,,,,,,,,,,,,,,
393,"CloudBees, Inc.",Software,,11/24/21,,333980,,,$ 1849,,,$ 327,,(6) (7) (12),,,,
394,KDOR Holdings Inc. (dba Senneca Holdings),Building Products,,06/22/20,,59,,,—,,,—,,(6) (7) (12),,,,
395,KDOR Holdings Inc. (dba Senneca Holdings),Building Products,,05/29/20,,2812,,,—,,,—,,(6) (7) (12),,,,
