In [142]:
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import numpy as np
import html5lib
import requests
from openpyxl import Workbook
from datetime import datetime

In [143]:
def parse_and_trim(content, content_type):
    if content_type == 'HTML':
        soup = BeautifulSoup(content, 'html.parser')
    else:
        soup = BeautifulSoup(content, 'html.parser')

    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = None
        except AttributeError:
            pass

    for linebreak in soup.find_all('br'):
        linebreak.extract()

    return soup

In [144]:
def remove_multiple_spaces(string):
    pattern = r'\s+'
    replaced_string = re.sub(pattern, ' ', string)
    return replaced_string


def find_qrt_date(content):
    qtr_date = content.find_all(text=re.compile(
        r'for\s+(the\s+)?(fiscal\s+)?year\s+ended\s+|for\s+the\s+quarter\s+ended\s+|for\s+the\s+quarterly\s+period\s+ended\s+', re.IGNORECASE))
    qtr_match = re.search(
        r'([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})', qtr_date[0].replace('\n', ''))
    if qtr_match is None:
        qtr_match = qtr_match = re.search(
            r'([A-Za-z]+) (\d{1,2}), (\d{4})', qtr_date[1])
    return remove_multiple_spaces(str(qtr_match.group()))

In [145]:
headers = {
    'User-Agent': 'ARES CAPITAL CORP'
}
filing_links = pd.read_excel(
    "/Users/fuadhassan/Desktop/BDC_RA/ARCC/ARCC__sec_filing_links.xlsx")
filing_links.head()

Unnamed: 0,Form type,Form description,Filing date,Reporting date,Filings URL
0,10-Q,Quarterly report [Sections 13 or 15(d)],2023-07-25,2023-06-30,https://www.sec.gov/Archives/edgar/data/128775...
1,10-Q,Quarterly report [Sections 13 or 15(d)],2023-04-25,2023-03-31,https://www.sec.gov/Archives/edgar/data/128775...
2,10-K/A,"Annual report [Section 13 and 15(d), not S-K I...",2023-03-31,2022-12-31,https://www.sec.gov/Archives/edgar/data/128775...
3,10-K,"Annual report [Section 13 and 15(d), not S-K I...",2023-02-07,2022-12-31,https://www.sec.gov/Archives/edgar/data/128775...
4,10-Q,Quarterly report [Sections 13 or 15(d)],2022-10-25,2022-09-30,https://www.sec.gov/Archives/edgar/data/128775...


In [146]:
# drops all the amendment filing
filing_links = filing_links.drop(filing_links[filing_links['Form description'].str.contains(
    'amendment', case=False)].index).reset_index(drop=True)
filing_links.head()

Unnamed: 0,Form type,Form description,Filing date,Reporting date,Filings URL
0,10-Q,Quarterly report [Sections 13 or 15(d)],2023-07-25,2023-06-30,https://www.sec.gov/Archives/edgar/data/128775...
1,10-Q,Quarterly report [Sections 13 or 15(d)],2023-04-25,2023-03-31,https://www.sec.gov/Archives/edgar/data/128775...
2,10-K,"Annual report [Section 13 and 15(d), not S-K I...",2023-02-07,2022-12-31,https://www.sec.gov/Archives/edgar/data/128775...
3,10-Q,Quarterly report [Sections 13 or 15(d)],2022-10-25,2022-09-30,https://www.sec.gov/Archives/edgar/data/128775...
4,10-Q,Quarterly report [Sections 13 or 15(d)],2022-07-26,2022-06-30,https://www.sec.gov/Archives/edgar/data/128775...


In [147]:
date_columns = ['Filing date', 'Reporting date']
for col in date_columns:
    filing_links[col] = pd.to_datetime(filing_links[col], format='%Y-%m-%d')
for col in date_columns:
    filing_links[col] = filing_links[col].dt.strftime("%B %d, %Y")

In [148]:
filing_links.head()

Unnamed: 0,Form type,Form description,Filing date,Reporting date,Filings URL
0,10-Q,Quarterly report [Sections 13 or 15(d)],"July 25, 2023","June 30, 2023",https://www.sec.gov/Archives/edgar/data/128775...
1,10-Q,Quarterly report [Sections 13 or 15(d)],"April 25, 2023","March 31, 2023",https://www.sec.gov/Archives/edgar/data/128775...
2,10-K,"Annual report [Section 13 and 15(d), not S-K I...","February 07, 2023","December 31, 2022",https://www.sec.gov/Archives/edgar/data/128775...
3,10-Q,Quarterly report [Sections 13 or 15(d)],"October 25, 2022","September 30, 2022",https://www.sec.gov/Archives/edgar/data/128775...
4,10-Q,Quarterly report [Sections 13 or 15(d)],"July 26, 2022","June 30, 2022",https://www.sec.gov/Archives/edgar/data/128775...


In [149]:
filing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Form type         52 non-null     object
 1   Form description  52 non-null     object
 2   Filing date       52 non-null     object
 3   Reporting date    52 non-null     object
 4   Filings URL       52 non-null     object
dtypes: object(5)
memory usage: 2.2+ KB


In [150]:
url = filing_links.iloc[0]['Filings URL']
date = filing_links.iloc[0]['Reporting date']
url, date

('https://www.sec.gov/Archives/edgar/data/1287750/000128775023000036/arcc-20230630.htm',
 'June 30, 2023')

In [151]:
response = requests.get(url, headers=headers)
content = parse_and_trim(response.content, 'HTML')

In [152]:
def extract_tables(soup_content, qtr_date):
    master_table = None
    all_tags = soup_content.find_all(True)
    print(type(all_tags))
    count = 0
    for tag in soup_content.find_all(text=re.compile('^.*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS.*$')):
        date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', tag)
        if date_str is None:
            next_line = tag.find_next(text=re.compile(
                r'([A-Za-z]+) (\d{1,2}), (\d{4})')).text
            date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', next_line)
        if date_str is None:
            next_line = tag.next.next.next.next.next.next.text
            date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', next_line)
        if date_str is not None:
            date_str = str(date_str.group())
            date_str = unicodedata.normalize('NFKD', date_str)
            if qtr_date.replace(',', '').strip().lower() in date_str.replace(',', '').strip().lower():
                count += 1
                print('Table found: ')
                print('Table #', count)
                html_table = tag.find_next('table')
                if master_table is None:
                    master_table = pd.read_html(
                        html_table.prettify(), skiprows=0, flavor='bs4')[0]
                    master_table = master_table.applymap(lambda x: unicodedata.normalize(
                        'NFKD', x.strip().strip(u'\u200b').replace('—', '-')) if type(x) == str else x)
                    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan,
                                                                                              regex=True)
                    master_table = master_table.dropna(how='all', axis=0)
                else:
                    new_table = pd.read_html(
                        html_table.prettify(), skiprows=0, flavor='bs4')[0]
                    new_table = new_table.applymap(lambda x: unicodedata.normalize(
                        'NFKD', x.strip().strip(u'\u200b').replace('—', '-')) if type(x) == str else x)
                    new_table = new_table.replace(r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan,
                                                                                        regex=True)
                    new_table = new_table.dropna(how='all', axis=0)
                    # print('head')
                    # print(new_table.head()) # text
                    master_table = master_table.append(
                        new_table.dropna(how='all', axis=0).reset_index(
                            drop=True).drop(index=0),
                        ignore_index=True)

    master_table = master_table.applymap(
        lambda x: x.strip().strip(u'\u200b') if type(x) == str else x)
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    return master_table

In [153]:
master_table = extract_tables(content, date)

<class 'bs4.element.ResultSet'>
Table found: 
Table # 1
Table found: 
Table # 2
Table found: 
Table # 3
Table found: 
Table # 4


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 5
Table found: 
Table # 6
Table found: 
Table # 7


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 8
Table found: 
Table # 9
Table found: 
Table # 10


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 11
Table found: 
Table # 12
Table found: 
Table # 13


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 14
Table found: 
Table # 15
Table found: 
Table # 16


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 17
Table found: 
Table # 18
Table found: 
Table # 19


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 20
Table found: 
Table # 21
Table found: 
Table # 22


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 23
Table found: 
Table # 24
Table found: 
Table # 25


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 26
Table found: 
Table # 27
Table found: 
Table # 28


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 29
Table found: 
Table # 30
Table found: 
Table # 31


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 32
Table found: 
Table # 33
Table found: 
Table # 34


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 35
Table found: 
Table # 36
Table found: 
Table # 37


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 38
Table found: 
Table # 39
Table found: 
Table # 40
Table found: 
Table # 41


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 42


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 43
Table found: 
Table # 44
Table found: 
Table # 45


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 46
Table found: 
Table # 47
Table found: 
Table # 48
Table found: 
Table # 49


  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 50
Table found: 
Table # 51
Table found: 
Table # 52
Table found: 
Table # 53


  master_table = master_table.append(
  master_table = master_table.append(


Table found: 
Table # 54


In [154]:
# this one is the real one

def process_table(soi_table_df, append_str):
    soi_table_df = soi_table_df.replace(r'^\s*\$\s*$', np.nan, regex=True)
    soi_table_df = soi_table_df.dropna(how='all', axis=1)
    soi_table_df = soi_table_df.dropna(
        how='all', axis=0).reset_index(drop=True)

    # Separate header and data
    soi_table_header = soi_table_df.iloc[1].dropna(how='any')
    print('header: ')
    print(soi_table_header)
    soi_table_data_df = soi_table_df.iloc[2:]
    print('1: ' + str(soi_table_data_df.shape))

    # Drop Full NnN rows
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('2: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)
    print('3: ' + str(soi_table_data_df.shape))
    # soi_table_data_df.to_csv('Code/csv_file/3_'+append_str+'.csv')

    # Drop "Control Instruments" rows
    # soi_table_data_df = soi_table_data_df.dropna(subset=[i for i in range(1, num_cols)], how='all')
    # print('4: ' + str(soi_table_data_df.shape))

    # if num_cols > 7:
    #    # Drop labeled subtotal/total rows
    #    soi_table_data_df = soi_table_data_df.dropna(subset=[1, 3, 5, 6, 7], how='all')
    #    print('5: ' + str(soi_table_data_df.shape))

    # Drop labeled subtotal rows
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(1, num_cols - 2)], how='all')
    print('5: ' + str(soi_table_data_df.shape))

    # Drop subtotal/total rows based on regex
    sub_total_filter_pattern = r'^([Ss]ubtotal)|([Tt]otal)'
    sub_total_filter = soi_table_data_df[0].str.contains(
        sub_total_filter_pattern).replace(np.NaN, False)
    print(sub_total_filter)
    soi_table_data_df = soi_table_data_df[~sub_total_filter]

    # Drop Full NnN rows/cols
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('6: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)

    # Drop totals rows
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(0, num_cols-2)], how='all')
    print('7: ' + str(soi_table_data_df.shape))

    # Forward Fill first 2 columns
    ffill_cols = [i for i in range(0, num_cols-4)]
    soi_table_data_df[ffill_cols] = soi_table_data_df[ffill_cols].fillna(
        method='ffill')
    print('8: ' + str(soi_table_data_df.shape))
    # soi_table_data_df.to_csv('Code/csv_file/8_'+append_str+'.csv')

    # Drop rows with only first 2/3 columns present
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(num_cols-4, num_cols)], how='all')
    print('9: ' + str(soi_table_data_df.shape))

    # Fill data cols NaN with 0
    soi_table_data_df = soi_table_data_df.fillna(0)
    soi_table_data_df = soi_table_data_df.replace('—', 0)
    print('10: ' + str(soi_table_data_df.shape))
    # soi_table_data_df.to_csv('Code/csv_file/10_'+append_str+'.csv', index=False)

    # Replace hyphen with 0
    soi_table_data_df = soi_table_data_df.replace('-', 0, regex=False)
    soi_table_data_df = soi_table_data_df.replace('%', "", regex=False)

    # Typecast data cols to int
    for col_index in range(num_cols-3, num_cols):
        col_name = soi_table_data_df.columns[col_index]
        soi_table_data_df[col_name] = pd.to_numeric(
            soi_table_data_df[col_name], errors='coerce')

# Multiply numeric columns by 1000
    numeric_cols = [
        col_name for col_name in soi_table_data_df.columns[num_cols-3:num_cols]]
    soi_table_data_df[numeric_cols] = soi_table_data_df[numeric_cols].apply(
        lambda x: x * 1000)

    print('11: ' + str(soi_table_data_df.shape))

    # first_str = soi_table_data_df[3].iloc[0]
    # print(first_str)
    # ord_l = [ord(c) for c in first_str]

    # print(ord_l)
    # Rename columns to table headers
    header_col_mapper = dict(
        zip(soi_table_data_df.columns.to_list(), soi_table_header))
    soi_table_data_df = soi_table_data_df.rename(columns=header_col_mapper)
    print('12: ' + str(soi_table_data_df.shape))

    soi_table_data_df = soi_table_data_df.iloc[:, :soi_table_header.shape[0]]

    return soi_table_data_df

In [155]:
master_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,Company (1),,Business Description,,Investment,,Coupon (3),Reference (7),Spread (3),,...,,,,,,,,,,
1,Software and Services,,,,,,,,,,...,,,,,,,,,,
2,"2U, Inc.",,Provider of course design and learning managem...,,First lien senior secured loan,,11.32 %,SOFR (M),6.50 %,,...,,,,,,,,,,
3,"AffiniPay Midco, LLC and AffiniPay Intermediat...",,Payment processing solution provider,,First lien senior secured loan,,10.20 %,SOFR (A),5.50 %,,...,,,,,,,,,,
4,,,,,First lien senior secured loan,,10.39 %,SOFR (A),5.50 %,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,,,,,First lien senior secured loan,,11.00 %,SOFR (Q),5.50 %,,...,,,,,,,,,,
1504,,,,,Series A preferred stock,,,,,,...,,,,,,,,,,
1505,,,,,,,,,,,...,,,,,,,,,,
1506,,,,,,,,,,,...,,,,,,,,,,


In [156]:
 master_table.to_csv('8_master_table.csv')

In [270]:
def process_table(soi_table_df, append_str):
    soi_table_df = soi_table_df.replace(r'^\s*\$\s*$', np.nan, regex=True)
    soi_table_df = soi_table_df.dropna(how='all', axis=1)
    soi_table_df = soi_table_df.dropna(
        how='all', axis=0).reset_index(drop=True)

    # Separate header and data?
    soi_table_header = soi_table_df.iloc[0].dropna(how='any')
    print('header: ')
    print(soi_table_header)
    soi_table_data_df = soi_table_df.rename(
        columns=soi_table_df.iloc[0]).drop(soi_table_df.index[0])
    print('1: ' + str(soi_table_data_df.shape))
    num_cols = soi_table_data_df.shape[1]

    # Drop Full NnN rows
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('2: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)
    print('3: ' + str(soi_table_data_df.shape))
    # soi_table_data_df.to_csv('csv_file/3_'+append_str+'.csv')

    # Drop subtotal/total rows based on regex
    sub_total_filter_pattern = r'^([Ss]ubtotal)|([Tt]otal)'
    sub_total_filter = soi_table_data_df[0].str.contains(
        sub_total_filter_pattern).replace(np.NaN, False)
    print(sub_total_filter)
    soi_table_data_df = soi_table_data_df[~sub_total_filter]

    # Drop Full NnN rows/cols
    soi_table_data_df = soi_table_data_df.dropna(how='all', axis=1)
    soi_table_data_df = soi_table_data_df.dropna(
        how='all', axis=0).reset_index(drop=True)
    print('6: ' + str(soi_table_data_df.shape))

    # Rename columns to integer range
    num_cols = soi_table_data_df.shape[1]
    data_col_mapper = dict(zip(soi_table_data_df.columns.to_list(), [
                           i for i in range(0, num_cols)]))
    soi_table_data_df = soi_table_data_df.rename(columns=data_col_mapper)

    # # Drop totals rows
    # soi_table_data_df = soi_table_data_df.dropna(
    #     subset=[i for i in range(0, num_cols-2)], how='all')
    # print('7: ' + str(soi_table_data_df.shape))

    # Forward Fill first 2 columns
    ffill_cols = [i for i in range(0, num_cols-4)]
    soi_table_data_df[ffill_cols] = soi_table_data_df[ffill_cols].fillna(
        method='ffill')
    print('8: ' + str(soi_table_data_df.shape))
    # soi_table_data_df.to_csv('csv_file/8_'+append_str+'.csv')

    # Drop rows with only first 2/3 columns present
    soi_table_data_df = soi_table_data_df.dropna(
        subset=[i for i in range(num_cols-4, num_cols)], how='all')
    print('9: ' + str(soi_table_data_df.shape))

    return soi_table_data_df

In [271]:
print(process_table_.shape)
process_table_ = process_table(master_table, "hi")
process_table_

(1506, 19)
2: (0, 0)
3: (0, 0)
An error occurred: 0
