In [None]:
import pandas as pd
import numpy as np
import requests
import html5lib
from bs4 import BeautifulSoup
from datetime import datetime
from openpyxl import workbook
import re
import os
import webbrowser
import unicodedata
from Helper_package import Helper
from html import unescape

In [None]:
# Request headers sent with every HTTP call made from this notebook.
headers = {
    'User-Agent': 'Goldman Sachs BDC, Inc. GSBD on NYSE'
}
# Company identifier handed to Helper.fetch_filing_data.
CIK = '0001572694'

# Single source of truth for the workbook that is written and then re-read
# below, so the two locations cannot drift apart.
# NOTE(review): the file name says "GSBC" while the User-Agent says "GSBD" —
# confirm which spelling is intended before renaming anything on disk.
file_name = "../GSBC_sec_filing_links.xlsx"

filing_data = Helper.fetch_filing_data(cik=CIK, headers=headers)
if filing_data is not None:
    # Write DataFrame to Excel file with auto-adjusting column widths
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        filing_data.to_excel(writer, index=False)
        worksheet = writer.sheets['Sheet1']
        for i, col in enumerate(filing_data.columns):
            longest_cell = filing_data[col].astype(str).str.len().max()
            # An empty column yields NaN here; treat it as zero width so the
            # header length alone decides the column width.
            if pd.isna(longest_cell):
                longest_cell = 0
            # str(col) keeps this working for non-string column labels.
            column_len = max(int(longest_cell), len(str(col))) + 2
            worksheet.set_column(i, i, column_len)

    print(f"Data written to {file_name}")

# The workbook is read back even when nothing was written in this run, so a
# file saved by an earlier run is reused if the fetch returned None.
filing_links = Helper.get_filing_links(file_name)
print("Filing link recived")

In [None]:
# Parse the report dates, then store them back as text such as
# "March 31, 2021" so they can be compared with the dates that
# extract_tables() pulls out of the filing headings.
parsed_report_dates = pd.to_datetime(filing_links['reportDate'])
filing_links['reportDate'] = parsed_report_dates.dt.strftime("%B %d, %Y")
print("Date was converted to '%B %d, %Y' format and back to")
filing_links['reportDate'].info()

In [None]:
# Print a summary of the filing_links frame as a sanity check after the
# reportDate column was rewritten as formatted text.
filing_links.info()

In [None]:
def check_links_validity(filing_links, timeout=10):
    '''
        Checks the validity of each file link in the DataFrame.

        A HEAD request is sent to every value of the 'fileLink' column using
        the notebook-level `headers`. A link counts as invalid when the
        request raises or when raise_for_status() reports an error status.

        Parameters:
            filing_links: DataFrame with a 'fileLink' column of URLs.
            timeout: seconds to wait for each request before giving up, so a
                single stalled connection cannot hang the whole cell.

        Returns:
            None. Invalid links are printed together with their error
            message; "All Valid Links" is printed when every link passed.
    '''
    valid_links = []
    invalid_links = []
    for link in filing_links['fileLink']:
        try:
            response = requests.head(link, headers=headers, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            # Best-effort report: any failure (network error, bad status,
            # malformed URL) is recorded instead of aborting the scan.
            invalid_links.append((link, str(e)))
        else:
            valid_links.append(link)

    if invalid_links:
        print("\nInvalid Links:")
        for link, error_message in invalid_links:
            print(f"{link}: {error_message}")

    if len(valid_links) == filing_links.shape[0]:
        print("All Valid Links")


# Probe every filing URL up front so broken links are reported before the
# (much slower) download-and-parse steps further down.
check_links_validity(filing_links)

In [None]:
# Use one filing (positional row 18) as a sample for developing the
# table-extraction logic before running it over every filing.
test_file = filing_links.iloc[18]
print(test_file)
sample_response = Helper.get_response(
    url=test_file['fileLink'], headers=headers)
content = Helper.get_content(sample_response)

In [None]:
# Case-insensitive pattern for a text node that contains "SCHEDULE OF
# INVESTMENTS" or "SCHEDULES OF INVESTMENTS", with any other text allowed
# before and after it. extract_tables() uses it to locate table headings.
consolidated_schedule_regex = re.compile(
    r'(?i)^\s*.*\s*SCHEDULE(S|)\s+OF\s+INVESTMENTS\s*.*\s*$')
# Captures dates written as "<letters> <1-2 digits>, <4 digits>", for example
# "March 31, 2021". extract_tables() applies it to each matched heading.
date_regex_pattern = r'([A-Za-z]+\s+\d{1,2},\s+\d{4})'


def extract_tables(content, report_Date) -> pd.DataFrame:
    '''
        Collects every "Schedule of Investments" table dated `report_Date`.

        Each text node of `content` matching `consolidated_schedule_regex` is
        inspected. When the node contains exactly one date and that date
        equals `report_Date` (both compared after NFKD normalisation), the
        next <table> element after the node is parsed and kept.

        Parameters:
            content: parsed document exposing findAll(string=...); the nodes
                it returns must provide .text and .find_next('table').
            report_Date: date text to match, e.g. "March 31, 2021".

        Returns:
            One DataFrame with all matching tables stacked (rows that are
            entirely empty are dropped, index renumbered), or an empty
            DataFrame when nothing matched.
    '''
    # Local import keeps this cell self-contained; pandas deprecates passing
    # literal HTML to read_html, so the markup is wrapped in StringIO below.
    from io import StringIO

    print(f"Extractiong File: {report_Date}")

    # Normalise once: filings often contain non-breaking spaces, which NFKD
    # folds into plain spaces. str() keeps a missing date from raising here;
    # it simply never matches any heading.
    target_date = unicodedata.normalize('NFKD', str(report_Date))

    tables = []
    for tag in content.findAll(string=consolidated_schedule_regex):
        try:
            date_matches = re.findall(date_regex_pattern, tag.text)
        except Exception as e:
            print(f'Could not find date on extract_tables() : {e}')
            # Skip this heading; falling through would read an unbound or
            # stale `date_matches` from a previous iteration.
            continue

        # Headings with no date, or with several dates, are ambiguous.
        if len(date_matches) != 1:
            continue
        if unicodedata.normalize('NFKD', date_matches[0]) != target_date:
            continue

        html_table = tag.find_next('table')
        if html_table is None:
            # Heading at the end of the document with no table after it.
            continue

        new_table = pd.read_html(
            StringIO(html_table.prettify()), na_values="No value", skiprows=0, flavor='bs4')[0]
        tables.append(new_table.dropna(how='all', axis=0))

    if not tables:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(tables, ignore_index=True)


# Run the extraction on the sample filing and dump the result to disk so the
# combined table can be inspected by hand.
sample_master_table = extract_tables(
    content=content, report_Date=test_file['reportDate'])
sample_master_table.to_csv('test.csv')

In [None]:
# Export the combined "Schedule of Investments" table of every filing: one CSV
# per report date plus one sheet per report date in a single workbook.
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable location before sharing the notebook.
path = '/Users/fuadhassan/Desktop/BDC_RA/GSBD/New_Master_tables_GSBD_Investment.xlsx'
csv_dir = '../MT_csv_files_2/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(csv_dir, exist_ok=True)

# The context manager saves and closes the workbook even if a filing raises
# part-way through, so no manual save per iteration is needed.
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    for url, reporting_date in zip(filing_links['fileLink'], filing_links['reportDate']):
        try:
            content = Helper.get_content(
                Helper.get_response(url=url, headers=headers))
        except Exception as e:
            print(f'failed to get the content: {e}')
            # Skip this filing: continuing would reuse `content` from the
            # previous filing and save it under the wrong report date.
            continue

        master_table = extract_tables(content, reporting_date)
        # The comma is stripped so the date can serve as file and sheet name.
        sheet_name = reporting_date.replace(',', '')
        master_table.to_csv(csv_dir + sheet_name + '.csv')
        master_table.to_excel(writer, sheet_name=sheet_name, index=False)