In [7]:
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import numpy as np
import html5lib
import requests
from openpyxl import Workbook
from datetime import datetime
import webbrowser


def parse_and_trim(content, content_type):
    """Parse a filing and strip presentation-only markup.

    Parameters
    ----------
    content : str or bytes
        Raw markup, e.g. ``response.content`` from ``requests``.
    content_type : str
        Accepted for interface compatibility. Both branches of the original
        implementation used ``'html.parser'``, so the value does not change
        how the document is parsed.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed tree with every tag's attributes removed and every
        ``<br>`` element extracted.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # find_all(True) yields only Tag objects, so text nodes are left untouched.
    # An empty dict (rather than None) keeps tag.get(...) / tag[...] usable on
    # the trimmed tree.
    for tag in soup.find_all(True):
        tag.attrs = {}

    # find_all returns a list, so removing nodes while looping over it is safe.
    for linebreak in soup.find_all('br'):
        linebreak.extract()
    return soup

In [8]:
# Request headers sent with every download in this notebook.
# NOTE(review): SEC EDGAR's access guidelines ask for a User-Agent that
# includes a contact e-mail address -- confirm this value is accepted.
headers = {'User-Agent': 'GOLUB CAPITAL BDC, Inc.'}

# Spreadsheet of filing metadata; the columns used later are
# 'Filing date', 'Reporting date' and 'Filings URL'.
filing_links = pd.read_excel("../GBDC__sec_filing_links.xlsx",
                             engine='openpyxl')
filing_links.head(5)

Unnamed: 0,Form type,Form description,Filing date,Reporting date,Act,Film number,Accession number,Filings URL
0,10-Q,Quarterly report [Sections 13 or 15(d)],2013-05-03,2013-03-31,34,13810352,0001144204-13-026113,https://www.sec.gov/Archives/edgar/data/147676...
1,10-Q,Quarterly report [Sections 13 or 15(d)],2013-08-08,2013-06-30,34,131020072,0001144204-13-043799,https://www.sec.gov/Archives/edgar/data/147676...
2,10-K,"Annual report [Section 13 and 15(d), not S-K I...",2013-12-03,2013-09-30,34,131254591,0001144204-13-065322,https://www.sec.gov/Archives/edgar/data/000147...
3,10-Q,Quarterly report [Sections 13 or 15(d)],2014-02-06,2013-12-31,34,14578102,0001144204-14-006255,https://www.sec.gov/Archives/edgar/data/000147...
4,10-Q,Quarterly report [Sections 13 or 15(d)],2014-05-08,2014-03-31,34,14823110,0001144204-14-028416,https://www.sec.gov/Archives/edgar/data/000147...


In [9]:
# Rewrite both date columns as text such as "March 31, 2013", the form the
# extraction functions compare against dates found in the filings.
# NOTE: this cell is not re-runnable on its own -- once the columns hold the
# formatted text, parsing them again with '%Y-%m-%d' fails. Reload
# `filing_links` before running it a second time.
date_columns = ['Filing date', 'Reporting date']
for col in date_columns:
    filing_links[col] = pd.to_datetime(
        filing_links[col], format='%Y-%m-%d').dt.strftime("%B %d, %Y")
filing_links.head(5)

Unnamed: 0,Form type,Form description,Filing date,Reporting date,Act,Film number,Accession number,Filings URL
0,10-Q,Quarterly report [Sections 13 or 15(d)],"May 03, 2013","March 31, 2013",34,13810352,0001144204-13-026113,https://www.sec.gov/Archives/edgar/data/147676...
1,10-Q,Quarterly report [Sections 13 or 15(d)],"August 08, 2013","June 30, 2013",34,131020072,0001144204-13-043799,https://www.sec.gov/Archives/edgar/data/147676...
2,10-K,"Annual report [Section 13 and 15(d), not S-K I...","December 03, 2013","September 30, 2013",34,131254591,0001144204-13-065322,https://www.sec.gov/Archives/edgar/data/000147...
3,10-Q,Quarterly report [Sections 13 or 15(d)],"February 06, 2014","December 31, 2013",34,14578102,0001144204-14-006255,https://www.sec.gov/Archives/edgar/data/000147...
4,10-Q,Quarterly report [Sections 13 or 15(d)],"May 08, 2014","March 31, 2014",34,14823110,0001144204-14-028416,https://www.sec.gov/Archives/edgar/data/000147...


In [4]:
def extract_tables(soup_content, qtr_date):
    """Collect the "Consolidated Schedule of Investments" tables for one date.

    Every text node matching the schedule heading is inspected. When a date
    found in the element right after the heading equals ``qtr_date``, the next
    ``<table>`` is parsed with ``pandas.read_html`` and added to the result.

    Parameters
    ----------
    soup_content : bs4.BeautifulSoup
        Parsed filing, e.g. the value returned by ``parse_and_trim``.
    qtr_date : str
        Reporting date such as ``"March 31, 2013"``.

    Returns
    -------
    pandas.DataFrame
        All matching tables stacked row-wise, with blank cells, lone ``$`` and
        lone ``)`` cells replaced by NaN. An empty DataFrame is returned when
        no table matches (the original raised ``AttributeError`` on ``None``).
    """
    # Function-scope import: newer pandas expects file-like input for
    # read_html instead of a literal HTML string.
    from io import StringIO

    def map_cells(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        mapper = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        return mapper(func)

    consolidated_schedule_regex = re.compile(
        r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    # Loop-invariant: the normalised target date is computed once.
    qtr_date_cleaned = qtr_date.replace(',', '').replace(
        ' ', '').replace('\n', '').lower()

    tables = []
    # `string=` is the current name of the deprecated `text=` argument.
    for tag in soup_content.find_all(string=consolidated_schedule_regex):
        following = tag.find_next()
        date_match = None
        if following is not None:
            date_match = re.search(date_regex_pattern1, following.text)
        if date_match is None and tag.next_element is not None:
            date_match = re.search(date_regex_pattern1, tag.next_element.text)
        print(date_match)
        if date_match is None:
            continue

        # NFKD turns non-breaking spaces into plain spaces before comparing.
        date_str = unicodedata.normalize('NFKD', str(date_match.group(1)))
        date_str_cleaned = date_str.replace(',', '').replace(
            ' ', '').replace('\n', '').lower()
        print(qtr_date_cleaned, date_str_cleaned)
        if qtr_date_cleaned != date_str_cleaned:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Heading with no table after it: nothing to parse.
            continue
        new_table = pd.read_html(
            StringIO(html_table.prettify()), skiprows=0, flavor='bs4')[0]
        new_table = map_cells(new_table, lambda x: unicodedata.normalize(
            'NFKD', x.strip().strip(u'\u200b').replace('—', '-'))
            if isinstance(x, str) else x)
        new_table = new_table.replace(
            r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan, regex=True)
        new_table = new_table.dropna(how='all', axis=0)
        tables.append(new_table)

    if not tables:
        print(f"No schedule-of-investments table found for {qtr_date!r}")
        return pd.DataFrame()

    # A single table keeps its own index, as before; several are concatenated
    # once instead of pairwise inside the loop.
    master_table = tables[0] if len(tables) == 1 else pd.concat(
        tables, ignore_index=True)

    master_table = map_cells(
        master_table,
        lambda x: x.strip().strip(u'\u200b') if isinstance(x, str) else x)
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    print(master_table.shape)
    return master_table

In [5]:

# filing_links = filing_links.drop(
#     filing_links[filing_links['Reporting date'] == 'September 30, 2017'].index)
# print(filing_links[filing_links['Reporting date'] == 'September 30, 2017'])

In [None]:
# path = '/Users/fuadhassan/Desktop/BDC_RA/GBDC/GBDC_Investment.xlsx'
# writer = pd.ExcelWriter(path, engine='openpyxl')
# for qtr_date, html_link in zip(filing_links['Reporting date'], filing_links['Filings URL']):
#     print(html_link, qtr_date)
#     response = requests.get(html_link, headers=headers)
#     content = parse_and_trim(response.content, 'HTML')
#     master_table = extract_tables(content, qtr_date)
#     master_table.to_excel(
#         writer, sheet_name=qtr_date.replace(',', ''), index=False)
#     writer.book.save(path)
# writer.close()

In [None]:

# url = 'https://www.sec.gov/Archives/edgar/data/1476765/000147676517000078/gbdc201710-k.htm'
# date = 'September 30, 2017'
# url, date
# response = requests.get(url, headers=headers)
# content = parse_and_trim(response.content, 'HTML')
# master_table = extract_tables(content, date)
# # process_table_ = process_table(master_table, "")
# # process_table_.to_excel("example.xlsx")
# # process_table_.to_csv('example.csv')
# # process_table_

In [None]:
# index=-2
# url, date = filing_links.iloc[index]['Filings URL'], filing_links.iloc[index]['Reporting date']
# url='https://www.sec.gov/Archives/edgar/data/1476765/000162828016021522/gbdc201510-k.htm'
# date='September 30, 2016'
# print(url, date)
# response = requests.get(url, headers=headers)
# content = parse_and_trim(response.content, 'HTML')
# search_texts = [
#     'Consolidated Schedule of Investments',
#     'Consolidated Schedule of Investments - (continued)',
#         'Consolidated Schedule of Investments (unaudited) - (continued)',
#         'Consolidated Schedule of Investments (unaudited)']
# all_tags=content.find_all(text=search_texts)
# for tag in all_tags:
#     print(tag,tag.next)

In [51]:
# for qtr_date, html_link in zip(filing_links['Reporting date'], filing_links['Filings URL']):
#     print(html_link, qtr_date)
#     response = requests.get(html_link, headers=headers)
#     content = parse_and_trim(response.content, 'HTML')
#     consolidated_schedule_regex = re.compile(r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
#     for tag in content.find_all(text=consolidated_schedule_regex):
#         print("Tag:", tag)
#         print("Next:", tag.find_next())
#         print("next:", tag.next)
#         print("\n")

In [None]:
# for qtr_date, html_link in zip(filing_links['Reporting date'], filing_links['Filings URL']):
#     print(html_link, qtr_date)
#     response = requests.get(html_link, headers=headers)
#     content = parse_and_trim(response.content, 'HTML')
#     master_table = extract_tables(content,qtr_date)
#     # print(master_table)

In [8]:
# Exploration: download one filing, print what follows each schedule heading,
# and stack the table after every heading into `master_table`.
index = 0
data_frames = []
master_table = None
url, date = filing_links.iloc[index]['Filings URL'], filing_links.iloc[index]['Reporting date']
print(url, date)
# requests has no default timeout; fail fast instead of hanging, and stop on
# an HTTP error rather than parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, 'HTML')
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# `string=` is the current name of the deprecated `text=` argument.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag)
    print("Next:", tag.find_next())
    print("Next:", tag.next)
    table = tag.find_next("table")
    if table:
        # One list of cell texts per <tr>; header cells ('th') are included.
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        table_df = pd.DataFrame(table_data)
        data_frames.append(table_df)

    print("\n")

# Concatenate once after the loop instead of growing the frame pairwise.
master_table = pd.concat(data_frames, ignore_index=True) if data_frames else None

https://www.sec.gov/Archives/edgar/data/1476765/000114420413026113/v343181_10q.htm March 31, 2013


  for tag in content.find_all(text=consolidated_schedule_regex):


Tag: Consolidated Schedules of Investments
    as of March 31, 2013 (unaudited) and September 30, 2012
Next: <td>7</td>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited) - (Continued)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited) - (Continued)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited) - (Continued)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited) - (Continued)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments (unaudited) - (Continued)
Next: <p><font><b>March
31, 2013</b></font></p>
Next: 



Tag: Consolidated
Schedule of Investments
Next: <p><font><b>September
30, 2012</b></font></p>
Next: 



Tag: Consolidated
Schedu

In [None]:
# Preview the first ten rows of the current `master_table`.
master_table.head(10)

In [None]:
# Write the current table to test.csv for inspection and report how many
# frames have been collected in `data_frames`.
master_table.to_csv("test.csv")
print(len(data_frames))

In [5]:
# Module-level collectors; extract_tables appends every accepted table to
# `data_frames`. `data_frames_shapes` is not written by this version.
data_frames = []
data_frames_shapes = []


def extract_tables(soup_content, qtr_date):
    """Collect the "Consolidated Schedule of Investments" tables for one date.

    For every text node matching the schedule heading, a date is looked up in
    the element right after it. When that date equals ``qtr_date``, the next
    ``<table>`` is read cell by cell (``th`` and ``td``) into a DataFrame.
    Tables with 10 or fewer columns are ignored.

    Parameters
    ----------
    soup_content : bs4.BeautifulSoup
        Parsed filing, e.g. the value returned by ``parse_and_trim``.
    qtr_date : str
        Reporting date such as ``"March 31, 2013"``.

    Returns
    -------
    pandas.DataFrame
        Accepted tables stacked row-wise, with blank cells, lone ``$`` and
        lone ``)`` cells replaced by NaN. An empty DataFrame is returned when
        no table is accepted (the original raised ``AttributeError``).

    Side effects
    ------------
    Appends each accepted table to the module-level ``data_frames`` list.
    """
    def map_cells(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        mapper = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        return mapper(func)

    consolidated_schedule_regex = re.compile(
        r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    # Loop-invariant: the normalised target date is computed once.
    qtr_date_cleaned = qtr_date.replace(',', '').replace(
        ' ', '').replace('\n', '').lower()

    accepted = []
    # `string=` is the current name of the deprecated `text=` argument.
    for tag in soup_content.find_all(string=consolidated_schedule_regex):
        following = tag.find_next()
        date_match = None
        if following is not None:
            date_match = re.search(date_regex_pattern1, following.text)
        if date_match is None and tag.next_element is not None:
            date_match = re.search(date_regex_pattern1, tag.next_element.text)
        print(date_match)
        if date_match is None:
            continue

        # NFKD turns non-breaking spaces into plain spaces before comparing.
        date_str = unicodedata.normalize('NFKD', str(date_match.group(1)))
        date_str_cleaned = date_str.replace(',', '').replace(
            ' ', '').replace('\n', '').lower()
        print(qtr_date_cleaned, date_str_cleaned)
        if qtr_date_cleaned != date_str_cleaned:
            continue

        table = tag.find_next("table")
        if table is None:
            continue
        # One list of cell texts per <tr>; header cells ('th') are included.
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        table_df = pd.DataFrame(table_data)
        # Narrow tables are skipped; only frames wider than 10 columns are kept.
        if len(table_df.columns) > 10:
            data_frames.append(table_df)
            accepted.append(table_df)

    if not accepted:
        print(f"No schedule-of-investments table found for {qtr_date!r}")
        return pd.DataFrame()

    # Concatenate once instead of growing the frame pairwise inside the loop.
    master_table = accepted[0] if len(accepted) == 1 else pd.concat(
        accepted, ignore_index=True)

    master_table = map_cells(
        master_table,
        lambda x: x.strip().strip(u'\u200b') if isinstance(x, str) else x)
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    print(master_table.shape)
    return master_table

In [None]:
# Re-download the filing selected by `url` / `date` and run the extractor.
print(url, date)
# requests has no default timeout; fail fast instead of hanging, and stop on
# an HTTP error rather than parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, 'HTML')
master_table = extract_tables(content, date)
print(master_table)

In [11]:
# Save the extracted table to test.csv, then display it as the cell output.
master_table.to_csv("test.csv")
master_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,,,,,,,,,,,...,,,,,,,,,,
1,,,Investment,,Spread\n Above,,,Interest,,,...,,Fair,,,,,,,,
2,,,Type,,Index(1),,,Rate(2),,,...,,Value,,,,,,,,
3,Investments,,,,,,,,,,...,,,,,,,,,,
4,Canada,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,,,,,,,,,,,...,,,,,,,,,,
434,,,,,,,,,,,...,,,,,,,,,,
435,Total\n United States,,,,,,,,,,...,780136,,,,157.6,%,,,787520,
436,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Export one worksheet per filing, named after its reporting date.
# NOTE(review): absolute, user-specific output path -- this cell only runs
# unchanged on the original author's machine.
path = '/Users/fuadhassan/Desktop/BDC_RA/GBDC/2_Test_GBDC_Investment.xlsx'
# The context manager saves and closes the workbook on exit, including when a
# filing raises part-way through, so the explicit per-iteration
# `writer.book.save(path)` and the final `writer.close()` are not needed.
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    for qtr_date, html_link in zip(filing_links['Reporting date'], filing_links['Filings URL']):
        print(html_link, qtr_date)
        # requests has no default timeout; stop on HTTP errors instead of
        # parsing an error page.
        response = requests.get(html_link, headers=headers, timeout=30)
        response.raise_for_status()
        content = parse_and_trim(response.content, 'HTML')
        master_table = extract_tables(content, qtr_date)
        master_table.to_excel(
            writer, sheet_name=qtr_date.replace(',', ''), index=False)

# SEP 30 2017

In [46]:
# Download and parse the filing for the September 30, 2017 reporting date.
url = "https://www.sec.gov/Archives/edgar/data/1476765/000147676517000078/gbdc201710-k.htm"
date = "September 30, 2017"
# requests has no default timeout; fail fast instead of hanging, and stop on
# an HTTP error rather than parsing an error page.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, "HTML")
# NOTE: this binds a second name to the same soup object; it is not an
# independent copy.
contentDUP = content

In [None]:
# Diagnostic pass: for every schedule heading in `content`, print the heading,
# the element after it, any date found there, and the next table.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Hoisted out of the loop: the pattern never changes between iterations.
date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
# `string=` is the current name of the deprecated `text=` argument.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag)
    print("Next:", tag.find_next())
    print("Next_date:", re.search(date_regex_pattern1, tag.find_next().text))
    print("next:", tag.next)
    print(tag.find_next("table"))
    print("\n")

In [None]:
# Run whichever `extract_tables` definition is currently live in the kernel
# on the parsed filing held in `content`.
master_table = extract_tables(content, date)

In [56]:
# Save the extracted table to test.csv for inspection.
master_table.to_csv('test.csv')

### June_30_2014

In [42]:
# Module-level collectors; each successful extract_tables call appends its
# combined table to `data_frames` and that table's shape to
# `data_frames_shapes`.
data_frames = []
data_frames_shapes = []


def extract_tables(soup_content, qtr_date):
    """Collect the "Consolidated Schedule of Investments" tables for one date.

    For every text node matching the schedule heading, a date is looked up in
    the element right after the heading, then in the heading's next element,
    then three elements past the following element. When that date equals
    ``qtr_date``, the next ``<table>`` is read cell by cell (``th`` and
    ``td``) into a DataFrame. Tables with 10 or fewer columns are ignored.

    Two reporting dates ('December 31, 2015' and 'June 30, 2016') use a
    stricter heading pattern that requires a " - ... (...)" suffix.

    Parameters
    ----------
    soup_content : bs4.BeautifulSoup
        Parsed filing, e.g. the value returned by ``parse_and_trim``.
    qtr_date : str
        Reporting date such as ``"June 30, 2016"``.

    Returns
    -------
    pandas.DataFrame
        Accepted tables stacked row-wise, with blank cells, lone ``$`` and
        lone ``)`` cells replaced by NaN. An empty DataFrame is returned when
        no table is accepted (the original raised ``AttributeError``).

    Side effects
    ------------
    On success, appends the result to ``data_frames`` and its shape to
    ``data_frames_shapes``. Nothing is recorded for an empty result.
    """
    def map_cells(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        mapper = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        return mapper(func)

    print(qtr_date)
    if qtr_date in ('December 31, 2015', 'June 30, 2016'):
        consolidated_schedule_regex = re.compile(
            r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*(\(.*\)|)\s*-.*\s*\(.*\)$')
    else:
        consolidated_schedule_regex = re.compile(
            r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    # Loop-invariant: the normalised target date is computed once.
    qtr_date_cleaned = qtr_date.replace(',', '').replace(
        ' ', '').replace('\n', '').lower()

    accepted = []
    # `string=` is the current name of the deprecated `text=` argument.
    for tag in soup_content.find_all(string=consolidated_schedule_regex):
        following = tag.find_next()
        date_match = None
        if following is not None:
            date_match = re.search(date_regex_pattern, following.text)
        if date_match is None and tag.next_element is not None:
            date_match = re.search(date_regex_pattern, tag.next_element.text)
        if date_match is None and following is not None:
            # Last resort: look three elements past the following element,
            # stopping early if the document ends first.
            node = following
            for _ in range(3):
                node = node.next_element
                if node is None:
                    break
            if node is not None:
                date_match = re.search(date_regex_pattern, node.text)
        if date_match is None:
            continue

        # NFKD turns non-breaking spaces into plain spaces before comparing.
        date_str = unicodedata.normalize('NFKD', str(date_match.group(1)))
        date_str_cleaned = date_str.replace(',', '').replace(
            ' ', '').replace('\n', '').lower()
        print(date_str, qtr_date_cleaned, date_str_cleaned)
        if qtr_date_cleaned != date_str_cleaned:
            continue

        table = tag.find_next("table")
        if table is None:
            continue
        # One list of cell texts per <tr>; header cells ('th') are included.
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        table_df = pd.DataFrame(table_data)
        # Narrow tables are skipped; only frames wider than 10 columns are kept.
        if len(table_df.columns) > 10:
            accepted.append(table_df)

    if not accepted:
        print(f"No schedule-of-investments table found for {qtr_date!r}")
        return pd.DataFrame()

    # Concatenate once instead of growing the frame pairwise inside the loop.
    master_table = accepted[0] if len(accepted) == 1 else pd.concat(
        accepted, ignore_index=True)

    master_table = map_cells(
        master_table,
        lambda x: x.strip().strip(u'\u200b') if isinstance(x, str) else x)
    master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
        r'^\s*\$\s*$', np.nan, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    print(master_table.shape)
    data_frames.append(master_table)
    data_frames_shapes.append(master_table.shape)
    return master_table

In [38]:
# Look up the filing URL for one reporting date, open it in the browser for
# visual comparison, then download and parse it.
date = "June 30, 2016"
# `.values[0]` raises IndexError if no row has this reporting date.
url = filing_links[filing_links['Reporting date']
                   == date]['Filings URL'].values[0]
# NOTE: opens a browser tab on every run of this cell.
webbrowser.open(url=url)
# requests has no default timeout; fail fast instead of hanging, and stop on
# an HTTP error rather than parsing an error page.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, "HTML")
# NOTE: this binds a second name to the same soup object; it is not an
# independent copy.
contentDUP = content

In [39]:
# Run whichever `extract_tables` definition is currently live in the kernel
# on the parsed filing held in `content`.
master_table = extract_tables(content, date)

data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
data_str <re.Match object; span=(0, 13), match='June 30, 2016'>
june302016 june302016
(586, 27)


  for tag in soup_content.find_all(text=re.compile(consolidated_schedule_regex)):


In [40]:
# Save the extracted table to test.csv, then display it as the cell output.
master_table.to_csv('test.csv')
master_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,,,,,Spread,,,,,,...,Percentage,,,,,,,,,
1,,,Investment,,Above,,Interest,,Maturity,,...,,,Fair,,,,,,,
2,,,Type,,Index(1),,Rate(2),,Date,,...,,,Value,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,"Tate's Bake Shop, Inc.#",,Senior loan,,L + 5.00%,,,6.00%,,08/2019,...,,,,0.1,,,,599,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581,US Bank Money Market Account (CUSIP 8AMMF0176),,,,,,,,,,...,3027,,,,0.4,,,,3027,
582,US Bank Money Market Account (CUSIP 9AMMF05B2),,,,,,,,,,...,21441,,,,2.6,,,,21441,
583,"Total Cash, Restricted Cash and Cash Equivalents",,,,,,,,,,...,61894,,,,7.6,%,,,61894,
584,,,,,,,,,,,...,,,,,,,,,,


In [41]:
# (rows, columns) of the combined table.
master_table.shape

(586, 27)

In [35]:
# Diagnostic pass with the stricter heading pattern (requires a
# " - ... (...)" suffix): print what surrounds each matching heading.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*(\(.*\)|)\s*-.*\s*\(.*\)$')
# Hoisted out of the loop: the pattern never changes between iterations.
date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'

# `string=` is the current name of the deprecated `text=` argument.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag)
    print("Find_next:", tag.find_next())
    print("Next_date:", re.search(date_regex_pattern1, tag.find_next().text))
    print("next:", tag.next)
    print("Next next: ", tag.find_next().next.next.next.text)
    print("next sib: ", tag.find_next_sibling())
    # print(tag.find_next("table"))
    print("\n")

  for tag in content.find_all(text=consolidated_schedule_regex):


In [36]:
# Diagnostic pass with the permissive heading pattern: print each matching
# text node and the elements that follow it.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')

# Not used inside this cell's loop; kept because it is defined at notebook
# scope.
date_regex_pattern1 = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
# `string=` is the current name of the deprecated `text=` argument.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag)
    print("Next:", tag.find_next())
    print("next:", tag.next)
    print("\n")

Tag: Consolidated Schedules of Investments as of September 30, 2013 and 2012
Next: <td><!-- GUTTER --> </td>
next: 



Tag: We have audited the accompanying consolidated statements of financial condition, including the consolidated schedules of investments, of Golub Capital BDC, Inc. and Subsidiaries (collectively, the “Company”) as of September 30, 2013 and 2012, and the related consolidated statements of operations, changes in net assets, and cash flows for each of the three years in the period ended September 30, 2013. These consolidated financial statements are the responsibility of the Company's management. Our responsibility is to express an opinion on these consolidated financial statements based on our audits. 
Next: <p>We conducted our audits in accordance with the standards of the Public Company Accounting Oversight Board (United States). Those standards require that we plan and perform the audit to obtain reasonable assurance about whether the financial statements are free o

  for tag in content.find_all(text=consolidated_schedule_regex):


In [14]:
# Exploration on the already-parsed `content`: print what follows each
# schedule heading and stack the table after every heading.
# NOTE: relies on `master_table` and `data_frames` already existing in the
# kernel; new tables are appended to whatever they currently hold.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# `string=` is the current name of the deprecated `text=` argument.
for tag in content.find_all(string=consolidated_schedule_regex):
    print("Tag:", tag)
    print("Next:", tag.find_next())
    print("Next:", tag.next)
    table = tag.find_next("table")
    if table:
        # One list of cell texts per <tr>; header cells ('th') are included.
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]

        # Create a data frame from the table data and add it to the list
        table_df = pd.DataFrame(table_data)
        data_frames.append(table_df)

        if master_table is None:
            master_table = table_df
        else:
            master_table = pd.concat(
                [master_table, table_df], ignore_index=True)

    print("\n")

Tag: Consolidated Schedules of Investments as of December 31, 2015 (unaudited) and September 30, 2015
Next: <td> </td>
Next: 



Tag: Consolidated Schedule of Investments (unaudited)
Next: <p><b>December 31, 2015</b></p>
Next: 



Tag: Consolidated Schedule of Investments
Next: <p><b>September 30, 2015</b></p>
Next: 



Tag: Refer to the consolidated schedules of investments for further details.
Next: <table><tr>
<td><sup>(2)</sup></td><td></td><td>Included in cash and cash equivalents and restricted cash and cash equivalents on
the consolidated statements of financial condition.</td>
</tr></table>
Next: 





  for tag in content.find_all(text=consolidated_schedule_regex):


In [22]:
# Exact-match search: list the text nodes equal to one of these heading
# variants, together with the element that follows each.
search_texts = [
    'Consolidated Schedule of Investments',
    'Consolidated Schedule of Investments - (continued)',
    'Consolidated Schedule of Investments (unaudited) - (continued)',
    'Consolidated Schedule of Investments (unaudited)',
    'Consolidated Schedule of Investments (unaudited) -\n(continued)']
# `string=` is the current name of the deprecated `text=` argument.
all_tags = content.find_all(string=search_texts)
for tag in all_tags:
    print(tag, tag.next)

Consolidated Schedule of Investments (unaudited) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments (unaudited) -
(continued) 

Consolidated Schedule of Investments 



  all_tags = content.find_all(text=search_texts)


In [37]:
# Dump the trimmed markup to disk for manual inspection. The filings contain
# non-ASCII characters (curly quotes, zero-width spaces), so the encoding is
# set explicitly instead of relying on the platform default.
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(str(content))

### RECHECKING THE EXTRACTOR

In [89]:
# Module-level collectors; each successful extract_tables call appends its
# combined table to `data_frames` and that table's shape to
# `data_frames_shapes`.
data_frames = []
data_frames_shapes = []


def extract_tables(soup_content, qtr_date):
    """Collect the "Consolidated Schedule of Investments" tables for one date.

    For every text node matching the schedule heading, a date is looked up in
    the element right after the heading, then in the heading's next element,
    then three elements past the following element. When that date equals
    ``qtr_date``, the next ``<table>`` is parsed with ``pandas.read_html``,
    its text cells are normalised, and all-empty rows are dropped.

    Two reporting dates ('December 31, 2015' and 'June 30, 2016') use a
    stricter heading pattern that requires a " - ... (...)" suffix.

    Parameters
    ----------
    soup_content : bs4.BeautifulSoup
        Parsed filing, e.g. the value returned by ``parse_and_trim``.
    qtr_date : str
        Reporting date such as ``"December 31, 2013"``.

    Returns
    -------
    pandas.DataFrame
        Matching tables stacked row-wise. An empty DataFrame is returned when
        no table matches (the original raised ``AttributeError``).

    Side effects
    ------------
    On success, appends the result to ``data_frames`` and its shape to
    ``data_frames_shapes``. Nothing is recorded for an empty result.
    """
    # Function-scope import: newer pandas expects file-like input for
    # read_html instead of a literal HTML string.
    from io import StringIO

    def map_cells(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        mapper = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        return mapper(func)

    print(qtr_date)
    if qtr_date in ('December 31, 2015', 'June 30, 2016'):
        consolidated_schedule_regex = re.compile(
            r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*(\(.*\)|)\s*-.*\s*\(.*\)$')
    else:
        consolidated_schedule_regex = re.compile(
            r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    # Loop-invariant: the normalised target date is computed once.
    qtr_date_cleaned = qtr_date.replace(',', '').replace(
        ' ', '').replace('\n', '').lower()

    tables = []
    # `string=` is the current name of the deprecated `text=` argument.
    for tag in soup_content.find_all(string=consolidated_schedule_regex):
        following = tag.find_next()
        date_match = None
        if following is not None:
            date_match = re.search(date_regex_pattern, following.text)
        if date_match is None and tag.next_element is not None:
            date_match = re.search(date_regex_pattern, tag.next_element.text)
        if date_match is None and following is not None:
            # Last resort: look three elements past the following element,
            # stopping early if the document ends first.
            node = following
            for _ in range(3):
                node = node.next_element
                if node is None:
                    break
            if node is not None:
                date_match = re.search(date_regex_pattern, node.text)
        if date_match is None:
            continue

        # NFKD turns non-breaking spaces into plain spaces before comparing.
        date_str = unicodedata.normalize('NFKD', str(date_match.group(1)))
        date_str_cleaned = date_str.replace(',', '').replace(
            ' ', '').replace('\n', '').lower()
        print(date_str, qtr_date_cleaned, date_str_cleaned)
        if qtr_date_cleaned != date_str_cleaned:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Heading with no table after it: nothing to parse.
            continue
        new_table = pd.read_html(
            StringIO(html_table.prettify()), skiprows=0, flavor='bs4')[0]
        new_table = map_cells(new_table, lambda x: unicodedata.normalize(
            'NFKD', x.strip().strip(u'\u200b').replace('—', '-'))
            if isinstance(x, str) else x)
        new_table = new_table.replace(
            r'^\s*$', np.nan, regex=True).replace(r'^\s*\$\s*$', np.nan, regex=True)
        new_table = new_table.dropna(how='all', axis=0)
        tables.append(new_table)

    if not tables:
        print(f"No schedule-of-investments table found for {qtr_date!r}")
        return pd.DataFrame()

    # A single table keeps its own index, as before; several are concatenated
    # once instead of pairwise inside the loop.
    master_table = tables[0] if len(tables) == 1 else pd.concat(
        tables, ignore_index=True)

    # Post-concatenation cleanup is disabled in this version:
    # master_table = master_table.applymap(
    #     lambda x: x.strip().strip(u'\u200b') if type(x) == str else x)
    # master_table = master_table.replace(r'^\s*$', np.nan, regex=True).replace(
    #     r'^\s*\$\s*$', 0, regex=True).replace(r'^\s*\)\s*$', np.nan, regex=True)
    # master_table = master_table.apply(lambda x: x.str.strip().replace(
    #     '', np.nan) if x.dtype == "object" else x)

    print(master_table.shape)
    data_frames.append(master_table)
    data_frames_shapes.append(master_table.shape)
    return master_table


# Re-run the extractor on the filing for one reporting date.
date = "December 31, 2013"
# `.values[0]` raises IndexError if no row has this reporting date.
url = filing_links[filing_links['Reporting date']
                   == date]['Filings URL'].values[0]
# webbrowser.open(url=url)
# requests has no default timeout; fail fast instead of hanging, and stop on
# an HTTP error rather than parsing an error page.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
content = parse_and_trim(response.content, "HTML")
master_table = extract_tables(content, date)
# NOTE: both *DUP names are aliases of the same objects, not independent
# copies.
contentDUP = content
master_tableDUP = master_table

December 31, 2013
December 31, 2013 december312013 december312013


  for tag in soup_content.find_all(text=re.compile(consolidated_schedule_regex)):


December 31, 2013 december312013 december312013
December 31, 2013 december312013 december312013
December 31, 2013 december312013 december312013
December 31, 2013 december312013 december312013
December 31, 2013 december312013 december312013
December 31, 2013 december312013 december312013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
September 30, 2013 december312013 september302013
December 31, 2013 december312013 december312013
(443, 31)


In [90]:
# Save the re-checked table to test_2.csv, then display it as the cell output.
master_table.to_csv('test_2.csv')
master_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,,,,,Spread,,,,,,...,,,,,,,,,,
1,,,Investment,,Above,,,Interest,,,...,,Fair,,,,,,,,
2,,,Type,,Index (1),,,Rate (2),,,...,,Value,,,,,,,,
3,Investments,,,,,,,,,,...,,,,,,,,,,
4,United States,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,"ITC Global, Inc.",,Preferred stock,,,,,,,,...,,,0.1,,,,311.0,,,
439,Total equity investments United States,,,,,,,,,,...,,,6.5,%,,,42605.0,,,
440,Total United States,,,,,,,,,,...,,,178.8,%,,,1179919.0,,,
441,Total Investments,,,,,,,,,,...,,,178.8,%,,,1179919.0,,,


In [91]:
data_frames = []
data_frames_shapes = []

# Matches dates written like "December 31, 2013".
_DATE_PATTERN = re.compile(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})')
# Cells that are empty or hold only a currency sign carry no data.
_BLANK_CELL_PATTERNS = (r'^\s*$', r'^\s*\$\s*$')


def _map_cells(frame, func):
    """Apply ``func`` to every cell of ``frame`` and return a new frame.

    Implemented with ``Series.map`` per column so it does not depend on
    ``DataFrame.applymap``, which is deprecated in recent pandas releases.
    """
    if frame.empty:
        return frame.copy()
    return frame.apply(lambda column: column.map(func))


def _blank_to_nan(frame, patterns):
    """Replace every cell fully matching one of ``patterns`` with NaN."""
    for pattern in patterns:
        frame = frame.replace(pattern, np.nan, regex=True)
    return frame


def _date_key(text):
    """Reduce a date string to a comparison key (no whitespace/commas, lower-case)."""
    return re.sub(r'[\s,]', '', unicodedata.normalize('NFKD', text)).lower()


def _find_heading_date(tag):
    """Return the first date match found near ``tag``, or None.

    Looks at the text of the next tag first and then at the text of the
    immediately following element, skipping candidates that are missing.
    """
    for candidate in (tag.find_next(), getattr(tag, 'next_element', None)):
        text = getattr(candidate, 'text', None)
        if text:
            match = _DATE_PATTERN.search(text)
            if match is not None:
                return match
    return None


def extract_tables(soup_content, qtr_date):
    """Collect the Schedule of Investments tables dated ``qtr_date``.

    Every text node matching "CONSOLIDATED SCHEDULE(S) OF INVESTMENTS" is
    inspected; when the date found right after it equals ``qtr_date`` the next
    ``<table>`` is converted to a DataFrame, cleaned, and (if it has more than
    10 columns) stacked onto the result.

    Parameters
    ----------
    soup_content : parsed document exposing ``find_all`` (BeautifulSoup-like).
    qtr_date : str
        Reporting date such as ``"December 31, 2013"``.

    Returns
    -------
    pandas.DataFrame
        All matching tables stacked vertically; an empty DataFrame when no
        table matches (previously this raised ``AttributeError``).

    Side effects
    ------------
    Appends each accepted table to the module-level ``data_frames`` list and
    prints progress information.
    """
    consolidated_schedule_regex = re.compile(
        r'(?i)^\s*.*\s*CONSOLIDATED\s+SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
    # Loop-invariant: normalise the requested date once.
    qtr_date_cleaned = _date_key(qtr_date)
    matched_tables = []

    # `string=` is the supported spelling; `text=` triggers a DeprecationWarning.
    for tag in soup_content.find_all(string=consolidated_schedule_regex):
        date_str = _find_heading_date(tag)
        print(date_str)
        if date_str is None:
            continue

        date_str_cleaned = _date_key(str(date_str.group(1)))
        print(qtr_date_cleaned, date_str_cleaned)
        if qtr_date_cleaned != date_str_cleaned:
            continue

        table = tag.find_next("table")
        if table is None:
            continue

        # One list per <tr>; header cells ('th') are kept alongside 'td'.
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        table_df = _map_cells(
            pd.DataFrame(table_data),
            lambda x: unicodedata.normalize(
                'NFKD', x.strip().strip(u'\u200b').replace('—', '-'))
            if isinstance(x, str) else x)
        table_df = _blank_to_nan(
            table_df, _BLANK_CELL_PATTERNS).dropna(how='all', axis=0)

        # Narrow tables are not schedule pages; keep only wide ones.
        if len(table_df.columns) > 10:
            data_frames.append(table_df)
            matched_tables.append(table_df)

    if not matched_tables:
        master_table = pd.DataFrame()
        print(master_table.shape)
        return master_table

    # A single table is returned as-is; several are stacked in one concat
    # (same result as concatenating pairwise, without the repeated copying).
    if len(matched_tables) == 1:
        master_table = matched_tables[0]
    else:
        master_table = pd.concat(matched_tables, ignore_index=True)

    master_table = _map_cells(
        master_table,
        lambda x: x.strip().strip(u'\u200b') if isinstance(x, str) else x)
    # A lone ")" is also treated as an empty cell at this stage.
    master_table = _blank_to_nan(
        master_table, _BLANK_CELL_PATTERNS + (r'^\s*\)\s*$',))
    print(master_table.shape)
    return master_table


# Download the filing whose reporting date equals `date` and extract its
# Consolidated Schedule of Investments tables.
date = "December 31, 2013"
matching_urls = filing_links.loc[
    filing_links['Reporting date'] == date, 'Filings URL']
if matching_urls.empty:
    # Fail with a readable message instead of a bare IndexError on `.values[0]`.
    raise ValueError(f"No filing found for reporting date {date!r}")
url = matching_urls.values[0]
# webbrowser.open(url=url)
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=30)
# Stop here on 4xx/5xx so an error page is never parsed as if it were a filing.
response.raise_for_status()
content = parse_and_trim(response.content, "HTML")
master_table = extract_tables(content, date)
# NOTE: these are additional names for the same objects, not independent copies.
contentDUP = content
master_tableDUP = master_table

  for tag in soup_content.find_all(text=re.compile(consolidated_schedule_regex)):


<re.Match object; span=(0, 17), match='December 31, 2013'>
december312013 december312013
[['', '', '', '', 'Spread', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Percentage', '', '', '', ''], ['', '', 'Investment', '', 'Above', '', '', 'Interest', '', '', 'Maturity', '', '', 'Principal', '', '', '', '', '', 'of', '', '', 'Fair', ''], ['', '', 'Type', '', 'Index(1)', '', '', 'Rate(2)', '', '', 'Date', '', '', 'Amount', '', '', 'Cost', '', '', 'Net\n    Assets', '', '', 'Value', ''], ['Investments', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['United States', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['Debt investments', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], ['Aerospace and Defense', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '

In [92]:
# Write the extracted table to disk, then show it as the cell output.
master_table.to_csv(path_or_buf='test.csv')
master_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,,,,,Spread,,,,,,...,,,,,,,,,,
1,,,Investment,,Above,,,Interest,,,...,,Fair,,,,,,,,
2,,,Type,,Index(1),,,Rate(2),,,...,,Value,,,,,,,,
3,Investments,,,,,,,,,,...,,,,,,,,,,
4,United States,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Telecommunications,,,,,,,,,,...,,,,,,,,,,
438,"ITC Global, Inc.",,Preferred stock,,,,,,,,...,,,0.1,,,,311,,,
439,Total\n equity investments United States,,,,,,,,,,...,,,6.5,%,,,42605,,,
440,Total\n United States,,,,,,,,,,...,,,178.8,%,,,1179919,,,


In [98]:
def _rows_matching(frame, pattern):
    """Return a boolean Series flagging rows where any string cell matches ``pattern``.

    Matching is case-insensitive ``re.search``; non-string cells (NaN, numbers)
    never match, so rows without any text cannot raise.
    """
    if frame.empty:
        return pd.Series(False, index=frame.index, dtype=bool)
    compiled = re.compile(pattern, flags=re.IGNORECASE)

    def cell_matches(value):
        return isinstance(value, str) and compiled.search(value) is not None

    return frame.apply(
        lambda column: column.map(cell_matches)).any(axis=1).astype(bool)


def process_table_fun(soi_table_df, process_tables_shapes, min_total_row=20):
    """Clean a raw Schedule of Investments table into a compact frame.

    Steps, in order: blank out lone "$" cells, remove newlines, turn dash
    placeholders into 0, cut the table after the "Total Investments" row,
    drop everything up to a known preamble line, drop column-header rows,
    drop empty rows/columns and rows without a first-column label, shift
    values left over gaps, and finally drop sparse columns.

    Parameters
    ----------
    soi_table_df : pandas.DataFrame
        Raw table; it is not modified.
    process_tables_shapes : list
        The shape of the processed frame is appended to this list.
    min_total_row : int, default 20
        A "Total Investments" match above this row position is treated as not
        being the end of the schedule, and the second match is used instead.

    Returns
    -------
    pandas.DataFrame
        The processed table with integer column labels.

    Notes
    -----
    When no usable "Total Investments" row exists the table is left
    un-truncated (previously this raised ``IndexError``).
    """
    print(1, 'shape:', soi_table_df.shape)
    soi_table_df = soi_table_df.replace(r'^\s*\$\s*$', np.nan, regex=True)
    soi_table_df = soi_table_df.replace(r'\n', '', regex=True)
    print(2, 'shape:', soi_table_df.shape)
    print(6, 'shape:', soi_table_df.shape)
    soi_table_df = soi_table_df.replace('—', 0)
    soi_table_df = soi_table_df.replace('-', 0)
    print(7, 'shape:', soi_table_df.shape)
    soi_table_df = soi_table_df.reset_index(drop=True)

    if not soi_table_df.empty:
        # Per-column Series.map avoids the deprecated DataFrame.applymap.
        soi_table_df = soi_table_df.apply(lambda column: column.map(
            lambda x: x.strip() if isinstance(x, str) else x))

    # Keep rows up to and including the closing "Total Investments" line.
    total_rows = np.flatnonzero(
        _rows_matching(soi_table_df, r'Total\s+Investments').to_numpy())
    if total_rows.size and total_rows[0] >= min_total_row:
        last_row = int(total_rows[0])
    elif total_rows.size > 1:
        # First hit is too early to be the end of the schedule; use the second.
        last_row = int(total_rows[1])
    else:
        last_row = None
    if last_row is not None:
        soi_table_df = soi_table_df.iloc[:last_row + 1].reset_index(drop=True)

    # Remove leading content up to and including the first preamble line.
    preamble_rows = _rows_matching(
        soi_table_df,
        r'Net asset value per common share|How We Addressed the Matter in Our Audit')
    if preamble_rows.any():
        first_hit = int(np.argmax(preamble_rows.to_numpy()))
        soi_table_df = soi_table_df.iloc[first_hit + 1:].reset_index(drop=True)

    # Drop repeated column-header rows.
    header_rows = _rows_matching(
        soi_table_df, r'(?:Spread\s*Above|cost|Percentage|Above)')
    soi_table_df = soi_table_df[~header_rows]

    print(0, 'shape:', soi_table_df.shape)
    soi_table_df = soi_table_df.dropna(how='all', axis=1).reset_index(
        drop=True)
    print(3, 'shape:', soi_table_df.shape)
    soi_table_df = soi_table_df.dropna(how='all', axis=0).reset_index(
        drop=True)
    print(4, 'shape:', soi_table_df.shape)
    # The original called dropna() here and discarded the result; that no-op
    # is removed, the trace line is kept so the printed sequence is unchanged.
    print(5, 'shape:', soi_table_df.shape)

    # Rows without a label in the first column (sub-totals) are dropped.
    soi_table_df = soi_table_df.dropna(subset=[soi_table_df.columns[0]])

    # Relabel columns 0..n-1 so that label and position coincide below.
    num_cols = soi_table_df.shape[1]
    soi_table_df = soi_table_df.rename(columns={
        label: position
        for position, label in enumerate(soi_table_df.columns.to_list())})
    last_col = num_cols - 1

    # Pass 1: wherever the row (as it was at the start of the pass) has a gap,
    # pull the value from the column to its right and blank that column.
    for index, row in soi_table_df.iterrows():
        for position in range(last_col):
            if pd.isna(row[position]):
                soi_table_df.at[index, position] = row[position + 1]
                soi_table_df.at[index, position + 1] = np.nan

    # Pass 2: from the first remaining gap onward, move every value one column
    # to the left. Values are read from the row snapshot, so the shift is by
    # exactly one column; repeating it for later gaps (as the original loop
    # did) writes the same values again, so it is done once per row.
    for index, row in soi_table_df.iterrows():
        gap_positions = np.flatnonzero(row.isna().to_numpy())
        if gap_positions.size == 0:
            continue
        for position in range(int(gap_positions[0]), last_col):
            soi_table_df.at[index, position] = row[position + 1]
            soi_table_df.at[index, position + 1] = np.nan

    # Keep only columns holding at least 10 values.
    soi_table_df = soi_table_df.dropna(axis=1, thresh=10)
    soi_table_df = soi_table_df.dropna(how='all', axis=1).reset_index(
        drop=True)

    process_tables_shapes.append(soi_table_df.shape)
    # info() prints its report itself; wrapping it in print() added a "None".
    soi_table_df.info()

    return soi_table_df


process_tables = {}
process_tables_shape = []

In [100]:
# Clean the extracted schedule; the resulting shape is recorded in process_tables_shape.
process_table = process_table_fun(
    soi_table_df=master_table,
    process_tables_shapes=process_tables_shape,
)

1 shape: (442, 31)
2 shape: (442, 31)
6 shape: (442, 31)
7 shape: (442, 31)
0 shape: (420, 31)
3 shape: (420, 21)
4 shape: (420, 21)
5 shape: (420, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382 entries, 0 to 381
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       382 non-null    object
 1   1       329 non-null    object
 2   2       329 non-null    object
 3   3       322 non-null    object
 4   4       43 non-null     object
 5   5       326 non-null    object
 6   6       293 non-null    object
 7   7       333 non-null    object
 8   8       333 non-null    object
 9   9       258 non-null    object
 10  10      111 non-null    object
 11  11      222 non-null    object
 12  12      44 non-null     object
 13  13      221 non-null    object
dtypes: object(14)
memory usage: 41.9+ KB
None


In [103]:
# Export the extracted table to an Excel workbook.
master_table.to_excel(excel_writer='test.xlsx')