In [1]:
import io
import os
import re
import unicodedata
import webbrowser
from datetime import datetime
from html import unescape

import html5lib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import workbook

from Helper_package import Helper

In [2]:
# Request headers handed to the Helper fetch calls and to the link checks below.
headers = {
    'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'
}
CIK = '0001572694'
# Single source of truth for the workbook path: it is written here and read
# back a few lines further down.
# NOTE(review): the file name says "GSBC" while the User-Agent says "GSBD" --
# presumably a typo; left unchanged because other code may rely on this path.
file_name = "../GSBC_sec_filing_links.xlsx"

filing_data = Helper.fetch_filing_data(cik=CIK, headers=headers)
if filing_data is not None:
    # Write DataFrame to Excel file with auto-adjusting column widths
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        filing_data.to_excel(writer, index=False)
        worksheet = writer.sheets['Sheet1']
        for i, col in enumerate(filing_data.columns):
            # Longest rendered cell in the column; NaN when the column has no rows.
            longest_cell = filing_data[col].astype(str).str.len().max()
            if pd.isna(longest_cell):
                longest_cell = 0
            # Width = longest cell or header (whichever is longer) plus padding.
            column_len = max(int(longest_cell), len(str(col))) + 2
            worksheet.set_column(i, i, column_len)

    print(f"Data written to {file_name}")
else:
    # Make the fallback visible: the read below then uses whatever workbook
    # is already on disk.
    print(f"No filing data returned for CIK {CIK}; "
          f"reading existing {file_name}")

filing_links = Helper.get_filing_links(file_name)
print("Filing link recived")

Data written to ../GSBC_sec_filing_links.xlsx
Filing link recived


In [3]:
# Parse the report dates, then render them back to text in "%B %d, %Y" form.
report_dates_parsed = pd.to_datetime(filing_links['reportDate'])
report_dates_text = report_dates_parsed.dt.strftime("%B %d, %Y")
filing_links['reportDate'] = report_dates_text
print("Date was converted to '%B %d, %Y' format and back to")
filing_links['reportDate'].info()

Date was converted to '%B %d, %Y' format and back to
<class 'pandas.core.series.Series'>
RangeIndex: 36 entries, 0 to 35
Series name: reportDate
Non-Null Count  Dtype 
--------------  ----- 
36 non-null     object
dtypes: object(1)
memory usage: 416.0+ bytes


In [4]:
# Summarise the filing-links frame: column names, non-null counts and dtypes.
filing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   accessionNumber        36 non-null     object 
 1   filingDate             36 non-null     object 
 2   reportDate             36 non-null     object 
 3   acceptanceDateTime     36 non-null     object 
 4   act                    36 non-null     int64  
 5   form                   36 non-null     object 
 6   fileNumber             36 non-null     object 
 7   filmNumber             36 non-null     int64  
 8   items                  0 non-null      float64
 9   size                   36 non-null     int64  
 10  isXBRL                 36 non-null     int64  
 11  isInlineXBRL           36 non-null     int64  
 12  primaryDocument        36 non-null     object 
 13  primaryDocDescription  36 non-null     object 
 14  fileLink               36 non-null     object 
 15  txtFileL

In [5]:
def check_links_validity(filing_links, timeout=10):
    '''
        Checks the validity of each file link in the DataFrame.

        Sends a HEAD request (with the module-level ``headers``) to every URL
        in the 'fileLink' column and prints a report. Nothing is returned.

        Parameters:
            filing_links: DataFrame with a 'fileLink' column of URLs.
            timeout: seconds to wait for each request before giving up, so a
                single unresponsive server cannot stall the whole check.

        Output:
            Prints "Invalid Links:" followed by one "<link>: <error>" line per
            failing link, or "All Valid Links" when every link succeeded.
    '''
    invalid_links = []
    for link in filing_links['fileLink'].tolist():
        try:
            response = requests.head(link, headers=headers, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: any failure marks the link invalid and the
            # check moves on to the next link.
            invalid_links.append((link, str(e)))

    if invalid_links:
        print("\nInvalid Links:")
        for link, error_message in invalid_links:
            print(f"{link}: {error_message}")
    else:
        # No failures recorded means every row's link was valid.
        print("All Valid Links")


# Check every link in filing_links; the outcome is printed, not returned.
check_links_validity(filing_links)

All Valid Links


In [14]:
# Pick one filing (row 18) as the sample document for trying out extraction.
sample_position = 18
test_file = filing_links.iloc[sample_position]
print(test_file)
# Fetch the sample filing, then turn the response into a parsed document.
sample_response = Helper.get_response(
    url=test_file['fileLink'], headers=headers)
content = Helper.get_content(sample_response)

accessionNumber                                       0001193125-19-210919
filingDate                                                      2019-08-01
reportDate                                                   June 30, 2019
acceptanceDateTime                                2019-08-01T16:27:08.000Z
act                                                                     34
form                                                                  10-Q
fileNumber                                                       814-00998
filmNumber                                                        19993072
items                                                                  NaN
size                                                               1794487
isXBRL                                                                   0
isInlineXBRL                                                             0
primaryDocument                                            d590037d10q.htm
primaryDocDescription    

In [28]:
# Heading text that contains "SCHEDULE OF INVESTMENTS" / "SCHEDULES OF
# INVESTMENTS", case-insensitively.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Dates written as "<word> <1-2 digit day>, <4 digit year>", e.g. "June 30, 2019".
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'


def extract_tables(content, report_Date) -> pd.DataFrame:
    """Collect the schedule-of-investments tables dated ``report_Date``.

    Every text node in ``content`` matching ``consolidated_schedule_regex`` is
    treated as a candidate heading. A heading is used only when it contains
    exactly one date and that date equals ``report_Date`` after NFKD
    normalisation. For each such heading the next ``<table>`` element is
    parsed with pandas, fully empty rows are dropped, and the results are
    stacked into one frame.

    Parameters:
        content: parsed document exposing ``find_all(string=...)``, whose
            results expose ``.text`` and ``find_next('table')``
            (presumably a BeautifulSoup object -- confirm against Helper).
        report_Date: report date as text, e.g. "June 30, 2019".

    Returns:
        A DataFrame with all matching tables concatenated under a fresh
        integer index, or an empty DataFrame when nothing matched.

    Raises:
        Whatever ``pd.read_html`` raises for a table it cannot parse.
    """
    print(f"Extractiong File: {report_Date}")
    # Normalised once up front; NFKD folds e.g. non-breaking spaces so the
    # comparison below is not defeated by HTML whitespace variants.
    target_date = unicodedata.normalize('NFKD', report_Date)

    matched_tables = []
    for tag in content.find_all(string=consolidated_schedule_regex):
        try:
            date_matches = re.findall(date_regex_pattern, tag.text)
        except Exception as e:
            print(f'Could not find date on extract_tables() : {e}')
            # Without a fresh result there is nothing to compare; skip rather
            # than reuse a previous heading's matches.
            continue

        # Only headings carrying exactly one date are unambiguous.
        if len(date_matches) != 1:
            continue
        if unicodedata.normalize('NFKD', date_matches[0]) != target_date:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Heading with no table after it (e.g. at the end of the document).
            continue

        # Wrap the markup in StringIO: passing literal HTML text to read_html
        # is deprecated in recent pandas.
        new_table = pd.read_html(
            io.StringIO(html_table.prettify()),
            na_values="No value", skiprows=0, flavor='bs4')[0]
        # NOTE: no cell-level text cleanup happens here; values are kept as
        # parsed by pandas.
        matched_tables.append(new_table.dropna(how='all', axis=0))

    if not matched_tables:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(matched_tables, ignore_index=True)


# Extract the sample filing's tables and save them to disk for inspection.
sample_schedule = extract_tables(
    content=content, report_Date=test_file['reportDate'])
sample_schedule.to_csv('test.csv')

Extractiong File: June 30, 2019


In [11]:

# Extracted tables for each filing, keyed by the URL they were parsed from.
schedule_tables = {}
for url, reporting_date in zip(filing_links['fileLink'], filing_links['reportDate']):
    try:
        content = Helper.get_content(
            Helper.get_response(url=url, headers=headers))
    except Exception as e:
        print(f'failed to get the content: {e}')
        # Skip this filing: `content` still holds the previously fetched
        # document, which must not be parsed under this filing's report date.
        continue

    # Keep the result instead of discarding it, so later cells can use it.
    schedule_tables[url] = extract_tables(content, reporting_date)

Extractiong File: December 31, 2023
[                                           Unnamed: 0  \
0                                  Investment  (1)(5)   
1                       Debt Investments -  208.88  %   
2                                   Canada -  7.70  %   
3             1st Lien/Senior Secured Debt -  5.21  %   
4                                  Trader Corporation   
5                                  Trader Corporation   
6                                       Recochem, Inc   
7                                       Recochem, Inc   
8                                       Recochem, Inc   
9                                       Recochem, Inc   
10                                 ATX Networks Corp.   
11            Prophix Software Inc. (dba Pound Bidco)   
12            Prophix Software Inc. (dba Pound Bidco)   
13            Prophix Software Inc. (dba Pound Bidco)   
14            Prophix Software Inc. (dba Pound Bidco)   
15            Prophix Software Inc. (dba Pound Bidc

KeyboardInterrupt: 