In [4]:
import re
import unicodedata
from datetime import datetime
from io import StringIO

import html5lib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

In [5]:
def parse_and_trim(content, content_type):
    """Parse markup and strip it down to bare structure.

    Every tag loses its attributes and every ``<br>`` element is removed, so
    later text searches and table parsing are not disturbed by styling.

    Parameters
    ----------
    content : str or bytes
        Raw markup, e.g. the body of an HTTP response.
    content_type : str
        Kept for interface compatibility. The original code selected the same
        ``'html.parser'`` backend for every value, so it does not currently
        influence parsing.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document with attribute-free tags and no ``<br>`` elements.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # find_all(True) yields Tag objects only, so text nodes are left alone.
    # An empty dict (rather than None) keeps tag.get(), tag[...] and
    # attribute-filtered searches usable on the returned tree.
    for tag in soup.find_all(True):
        tag.attrs = {}

    for linebreak in soup.find_all('br'):
        linebreak.extract()

    return soup

In [6]:
def remove_multiple_spaces(string):
    """Return ``string`` with each run of whitespace collapsed to one space.

    Leading and trailing whitespace is collapsed too, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', string)


def find_qrt_date(content):
    """Return the period-end date stated in a filing, e.g. ``'September 30, 2016'``.

    Looks for text nodes containing a phrase such as "for the quarterly period
    ended", "for the quarter ended" or "for the (fiscal) year ended"
    (case-insensitive) and returns the first "Month D, YYYY" date found in one
    of them, with whitespace runs collapsed to single spaces.

    Parameters
    ----------
    content : bs4.BeautifulSoup
        Parsed document to search (anything exposing ``find_all(text=...)``).

    Returns
    -------
    str
        The matched date text.

    Raises
    ------
    ValueError
        If no candidate text node contains a recognisable date.
    """
    period_phrase = re.compile(
        r'for\s+(the\s+)?(fiscal\s+)?year\s+ended\s+|for\s+the\s+quarter\s+ended\s+|for\s+the\s+quarterly\s+period\s+ended\s+', re.IGNORECASE)
    date_pattern = re.compile(r'([A-Za-z]+)\s+(\d{1,2}),\s+(\d{4})')

    # Newlines are deliberately left in place: ``\s+`` already matches them,
    # whereas deleting them would fuse "ended\nSeptember" into one word and
    # make that the captured "month".
    for candidate in content.find_all(text=period_phrase):
        date_match = date_pattern.search(candidate)
        if date_match is not None:
            return remove_multiple_spaces(str(date_match.group()))

    raise ValueError('no period-end date found in the document')

In [7]:
# Request headers sent with the filing download.
# NOTE(review): SEC EDGAR reportedly expects the User-Agent to identify the
# requester with contact details - confirm this value is acceptable.
headers = {'User-Agent': 'ARES CAPITAL CORP'}

# Filing to analyse and the reporting date its schedules are matched against.
url = (
    'https://www.sec.gov/Archives/edgar/data/1287750/'
    '000110465916153937/a16-17135_110q.htm'
)
date = 'September 30, 2016'
url, date

('https://www.sec.gov/Archives/edgar/data/1287750/000110465916153937/a16-17135_110q.htm',
 'September 30, 2016')

In [8]:
# Download the filing. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (e.g. a 403
# block page) instead of letting an error page be parsed as the filing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, 'HTML')

In [9]:
# Preview only the start of the parsed document; printing the whole filing
# floods the notebook output.
print(str(content)[:2000])

<document>
<type>10-Q
<sequence>1
<filename>a16-17135_110q.htm
<description>10-Q
<text>
<html>
<head>
<script>bazadebezolkohpepadr="1358545002"</script><script></script></head>
<body>
<div>
<p><font><a>Table of Contents</a></font></p>
<p><font> </font></p>
<div>
<p><font> </font></p>
</div>
<p><font> </font></p>
<p><b><font>UNITED STATES</font></b></p>
<p><b><font>SECURITIES AND EXCHANGE COMMISSION</font></b></p>
<p><b><font>Washington, D.C. 20549</font></b></p>
<p><font> </font></p>
<p><b><font>FORM 10-Q</font></b></p>
<p><font> </font></p>
<p><b><font>x</font></b><font>      </font><b><font>QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934</font></b></p>
<p><font> </font></p>
<p><b><font>For the quarterly period ended September 30, 2016</font></b></p>
<p><font> </font></p>
<p><b><font>OR</font></b></p>
<p><font> </font></p>
<p><b><font>o</font></b><font>         </font><b><font>TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES E

In [None]:
# Dates written like "September 30, 2016".
_SCHEDULE_DATE_RE = re.compile(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})')
# Text nodes that contain a "CONSOLIDATED SCHEDULE(S) OF INVESTMENTS" heading.
_SCHEDULE_HEADING_RE = re.compile(
    r'^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')


def _normalize_date(text):
    """Return a date string in a form suitable for loose comparison."""
    text = unicodedata.normalize('NFKD', str(text)).replace(',', '')
    return re.sub(r'\s+', ' ', text).strip().lower()


def _clean_cell(value):
    """Trim a string cell, drop zero-width spaces and normalise dashes/unicode."""
    if isinstance(value, str):
        return unicodedata.normalize(
            'NFKD', value.strip().strip(u'\u200b').replace('—', '-'))
    return value


def _strip_cell(value):
    """Trim surrounding whitespace and zero-width spaces from a string cell."""
    if isinstance(value, str):
        return value.strip().strip(u'\u200b')
    return value


def _find_schedule_date(heading):
    """Return the date text that follows a schedule heading, or None."""
    # First choice: the element immediately after the heading.
    nearby_text = getattr(heading.next_element, 'text', None) or ''
    date_match = _SCHEDULE_DATE_RE.search(nearby_text.strip())
    if date_match is None:
        # Fallback: the next text node anywhere later that contains a date.
        following = heading.find_next(text=_SCHEDULE_DATE_RE)
        if following is not None:
            date_match = _SCHEDULE_DATE_RE.search(str(following))
    return date_match.group(1) if date_match is not None else None


def _read_schedule_table(html_table):
    """Parse one ``<table>`` element into a cleaned DataFrame."""
    # Wrapped in StringIO because newer pandas no longer accepts literal HTML.
    table = pd.read_html(
        StringIO(html_table.prettify()), skiprows=0, flavor='bs4')[0]
    # Column-wise Series.map works on every pandas version (applymap does not).
    table = table.apply(lambda column: column.map(_clean_cell))
    table = table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True)
    return table.dropna(how='all', axis=0)


def extract_tables(soup_content, qtr_date):
    """Collect every schedule-of-investments table dated ``qtr_date``.

    For each text node containing a "CONSOLIDATED SCHEDULE(S) OF INVESTMENTS"
    heading, the date that follows it is compared with ``qtr_date`` (ignoring
    case, commas and spacing). When they match, the next ``<table>`` in the
    document is parsed and cleaned. The first matching table is kept whole;
    for every later table the first remaining row is dropped before it is
    appended.

    Parameters
    ----------
    soup_content : bs4.BeautifulSoup
        Parsed filing to search.
    qtr_date : str
        Reporting date to look for, e.g. ``'September 30, 2016'``.

    Returns
    -------
    pandas.DataFrame
        All matching tables stacked together with a renumbered row index.
        Blank cells and cells holding only ``$`` or ``)`` are NaN.

    Raises
    ------
    ValueError
        If no heading with a matching date and a following table is found.
    """
    target_date = _normalize_date(qtr_date)
    tables = []

    for heading in soup_content.find_all(text=_SCHEDULE_HEADING_RE):
        date_str = _find_schedule_date(heading)
        if date_str is None or target_date not in _normalize_date(date_str):
            continue

        html_table = heading.find_next('table')
        if html_table is None:
            continue

        table = _read_schedule_table(html_table)
        if tables:
            # Continuation tables: drop their first remaining row, as before.
            # iloc avoids a KeyError when the table came back empty.
            table = table.reset_index(drop=True).iloc[1:]
        tables.append(table)
        print('Table found: ')
        print('Table #', len(tables))

    if not tables:
        raise ValueError(
            'no schedule of investments table found for date %r' % (qtr_date,))

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    master_table = pd.concat(tables, ignore_index=True)
    master_table = master_table.apply(lambda column: column.map(_strip_cell))
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    return master_table

In [None]:
# Exploratory check: list each schedule heading together with the next date
# that appears after it in the document.
regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
for tag in content.find_all(text=re.compile(r'^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')):
    print(tag)
    next_line = tag.find_next(text=re.compile(regex_pattern))
    if next_line is None:
        # find_next returns None when no later text node contains a date.
        print('no date found after this heading')
        continue
    next_line_text = next_line.text
    print(next_line_text)
    date_str = re.search(regex_pattern, next_line_text).group(1)
    print(date_str)

In [None]:
# Collect every tag in the parsed filing, but show only the count and a short
# sample: displaying the full list would dump the whole document.
all_tags = content.find_all(True)
len(all_tags), all_tags[:10]

In [None]:
# # date_str = re.search(r'([A-Za-z]+) (\d{1,2}), (\d{4})', tag)
# for tag in content.find_all(text=re.compile('^.*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS.*$')):
#     regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
#     date_str = re.search(regex_pattern, tag.next.text.strip())
#     if date_str is None:
#         next_line_text = tag.next.text.strip()  # Strip extra whitespace
#         date_str = re.search(regex_pattern, next_line_text)
#         print(date_str.group(1))

In [None]:
# Build the combined table for the configured reporting date.
master_table = extract_tables(soup_content=content, qtr_date=date)

In [None]:
# qtr_date = date
# for tag in content.find_all(text=re.compile('^.*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS.*$')):
#     next_line_text = tag.next.text.strip()
#     regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
#     date_str = re.search(regex_pattern, next_line_text).group(1)
#     print(date_str)
#     if date_str is None:
#         next_line = tag.find_next(text=re.compile(regex_pattern)).text
#         date_str = re.search(regex_pattern, next_line)
#     if date_str is None:
#         next_line = tag.next.text.strip()
#         date_str = re.search(
#             regex_pattern, next_line).group(1)
#     if date_str is not None:
#         date_str = str(date_str)
#         date_str = unicodedata.normalize('NFKD', date_str)
#         print(date_str)
#         print(qtr_date, date_str)
#         print(type(qtr_date))
#         if qtr_date.replace(',', '').strip().lower() in date_str.replace(',', '').strip().lower():
#             print(qtr_date, date_str)

In [None]:
# Display the table returned by extract_tables for a visual check.
master_table

In [None]:
def process_table(soi_table_df, append_str,
                  columns_to_fill=('Amortized Cost', 'Fair Value', '% of Net Assets')):
    """Promote the first row to column headers and realign shifted values.

    Steps:

    1. Cells holding only ``$`` become NaN; all-empty columns and rows are
       dropped.
    2. The first remaining row is used as the column labels and removed from
       the data.
    3. Rows with fewer than three non-null cells are dropped.
    4. For each label in ``columns_to_fill``, an empty cell in that column is
       filled with the first non-null value found to its right in the same
       row, and the donor cell is cleared.

    Shape information and the detected header are printed along the way.

    Parameters
    ----------
    soi_table_df : pandas.DataFrame
        Raw table whose first non-empty row holds the header labels, such as
        the frame returned by ``extract_tables``.
    append_str : str
        Currently unused; kept so existing calls keep working.
    columns_to_fill : iterable of str, optional
        Header labels of the columns to realign. Defaults to the three labels
        the original implementation hard-coded.

    Returns
    -------
    pandas.DataFrame
        The data rows with header-derived column labels.

    Raises
    ------
    ValueError
        If the table has no non-empty rows.
    KeyError
        If a label from ``columns_to_fill`` is not among the header labels.
    """
    cleaned = soi_table_df.replace(r'^\s*\$\s*$', np.nan, regex=True)
    cleaned = cleaned.dropna(how='all', axis=1)
    cleaned = cleaned.dropna(how='all', axis=0).reset_index(drop=True)
    print('1: ' + str(cleaned.shape))
    if cleaned.empty:
        raise ValueError('process_table received a table with no non-empty rows')

    # Separate header and data
    header_row = cleaned.iloc[0]
    print('header: ')
    print(header_row.dropna())
    data = cleaned.rename(columns=header_row).drop(cleaned.index[0])
    print('2: ' + str(data.shape))

    # Keep only rows with at least three non-null values. The copy makes the
    # in-place cell writes below act on an independent frame.
    data = data.dropna(thresh=3).copy()
    print('4: ' + str(data.shape))

    column_labels = list(data.columns)
    n_columns = len(column_labels)
    for col in columns_to_fill:
        # Header labels can repeat (or be NaN for spacer columns), so resolve
        # the label to a position by hand and use its first occurrence.
        positions = [pos for pos, label in enumerate(column_labels) if label == col]
        if not positions:
            raise KeyError(col)
        col_pos = positions[0]

        for row_pos in range(len(data)):
            if pd.notna(data.iat[row_pos, col_pos]):
                continue
            # NOTE(review): the search runs to the end of the row, so a value
            # under a later labelled column can be pulled left when everything
            # in between is empty - confirm this is intended.
            donor_pos = next(
                (pos for pos in range(col_pos + 1, n_columns)
                 if pd.notna(data.iat[row_pos, pos])),
                None)
            if donor_pos is not None:
                data.iat[row_pos, col_pos] = data.iat[row_pos, donor_pos]
                data.iat[row_pos, donor_pos] = np.nan

    # TODO: steps that were present but disabled in the original and are still
    # not applied: dropping subtotal/total rows, dropping all-empty columns
    # after the realignment, forward-filling the first two columns, coercing
    # the last three columns to numeric, and replacing '-' with 0.
    return data


# Clean the extracted table, write it out as both Excel and CSV, then show it.
process_table_ = process_table(soi_table_df=master_table, append_str="hi")
process_table_.to_excel("ex.xlsx")
process_table_.to_csv('ex.csv')
process_table_

In [None]:
# Forward fill the first two columns