In [106]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import unicodedata
import os
import webbrowser
import html5lib
from openpyxl import workbook
from datetime import datetime
import requests

In [107]:
path = '../Master_tables_GBDC_Investment.xlsx'
dataframes = pd.read_excel(path, sheet_name=None)

In [142]:
process_tables = {}
process_tables_shape = {}
if not os.path.exists('../PT_csv_file'):
    os.makedirs('../PT_csv_file')


def run_process_function(dataframes, process_tables, process_tables_shape):
    path = '../process_tables_GBDC_Investment.xlsx'
    writer = pd.ExcelWriter(path=path, engine='openpyxl')
    for dataframe in dataframes:
        print(dataframe)
        processed_table = process_table_function(dataframes[dataframe])
        process_tables[dataframe] = processed_table
        process_tables_shape[dataframe] = processed_table.shape
        processed_table.to_excel(
            writer, sheet_name=dataframe.replace(',', ''), index=False)
        processed_table.to_csv(
            '../PT_csv_file/'+dataframe.replace(',', '')+'.csv')
        writer.book.save(path)
    writer.close()

In [143]:
def shape(count, df):
    print(f"{count} : shape : {df.shape}")
    count += 1
    return count


def dropna_col_row(df):
    df = df.dropna(how='all', axis=0).reset_index(drop=True)
    df = df.dropna(how='all', axis=1).reset_index(drop=True)
    return df


def drop_if_contain(pattern, df):
    matching_rows = df.apply(
        lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)
    df = df[~matching_rows]
    return df


def rename_columns(df):
    num_cols = df.shape[1]
    data_col_mapper = dict(
        zip(df.columns.to_list(), [i for i in range(0, num_cols)]))
    df = df.rename(columns=data_col_mapper)
    return df

In [146]:
def process_table_function(soi_table_df):
    count = 1
    count = shape(count, soi_table_df)
    soi_table_df = soi_table_df.replace(
        r'^\s*\$\s*$', '', regex=True).replace(r'\n', '', regex=True)
    soi_table_df = soi_table_df.replace('-', '0')
    soi_table_df = dropna_col_row(soi_table_df)
    soi_table_df = soi_table_df.apply(
        lambda x: x.strip() if isinstance(x, str) else x)
    count = shape(count, soi_table_df)

    # drops all the extra top row
    pattern = r'Net asset value per common share|How We Addressed the Matter in Our Audit'
    matching_rows = soi_table_df.apply(
        lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)
    # Check if the pattern exists in the DataFrame
    if matching_rows.any():
        # Extract rows from the first occurrence onwards
        soi_table_df = soi_table_df.iloc[matching_rows.idxmax(
        )+1:].reset_index(drop=True)
    count = shape(count, soi_table_df)

    # drops all the extra bottom row
    pattern = r'Total\s+Investments'
    # Use the apply function to check if the pattern is in any column for each row
    matching_rows = soi_table_df.apply(
        lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)
    # Find the index of the first row that matches the pattern
    # Slice the DataFrame to keep only the rows up to and including the first matching row
    if soi_table_df[matching_rows].index[0] < 20:
        soi_table_df = soi_table_df.loc[:soi_table_df[matching_rows].index[1]].reset_index(
            drop=True)
    else:
        soi_table_df = soi_table_df.loc[:soi_table_df[matching_rows].index[0]].reset_index(
            drop=True)
    count = shape(count, soi_table_df)

    # drop all the col name
    pattern = r'(?:Spread\s*Above|cost|Percentage|Above)'
    soi_table_df = drop_if_contain(pattern, soi_table_df)
    pattern = r'^([Tt]otal)'
    soi_table_df = drop_if_contain(pattern, soi_table_df)
    count = shape(count, soi_table_df)

# drop nan col row
    soi_table_df = dropna_col_row(soi_table_df)
    count = shape(count, soi_table_df)
# drops the sub total
    soi_table_df = soi_table_df.dropna(subset=[soi_table_df.columns[0]])
    count = shape(count, soi_table_df)


# rename col
    soi_table_df = rename_columns(soi_table_df)

    for index, row in soi_table_df.iterrows():
        for column in soi_table_df.columns:
            pattern = re.compile(r'\([a-zA-Z]\)')
            if str(row[column])[-1] == "+" or '+' in str(row[column]):
                next_column_index = soi_table_df.columns.get_loc(
                    column) + 1
                if str(row[column])[-1] == "+":
                    if (next_column_index < len(soi_table_df.columns)
                        and not pd.isna(row[soi_table_df.columns[next_column_index]])
                        ):
                        soi_table_df.at[index, column] = str(
                            soi_table_df.at[index, column])+str(row[soi_table_df.columns[next_column_index]])
                        soi_table_df.at[index,
                                        soi_table_df.columns[next_column_index]] = np.nan

                    next_column_index = soi_table_df.columns.get_loc(
                        next_column_index) + 1
                if (
                    next_column_index < len(soi_table_df.columns)
                    and pattern.search(str(row[soi_table_df.columns[next_column_index]]))
                    and not pd.isna(row[soi_table_df.columns[next_column_index]])
                ):
                    soi_table_df.at[index, column] = str(
                        soi_table_df.at[index, column])+str(row[soi_table_df.columns[next_column_index]])
                    soi_table_df.at[index,
                                    soi_table_df.columns[next_column_index]] = np.nan

            if row[column] == "No Value":
                pattern = re.compile(r'\([0-9]\)')
                next_column_index = soi_table_df.columns.get_loc(column) + 1
                if (
                    next_column_index < len(soi_table_df.columns)
                    and pattern.search(str(row[soi_table_df.columns[next_column_index]]))
                    and not pd.isna(row[soi_table_df.columns[next_column_index]])
                ):
                    soi_table_df.at[index,
                                    column] = row[soi_table_df.columns[next_column_index]]
                    soi_table_df.at[index,
                                    soi_table_df.columns[next_column_index]] = np.nan

    soi_table_df.insert(0, 'Industy', '')

    for index, row in soi_table_df.iterrows():
        if row.nunique() == 2:
            soi_table_df.at[index, 'Industy'] = row.loc[0]
    soi_table_df['Industy'] = soi_table_df['Industy'].replace('', np.nan)

    col_indices = [0, 1, 2]
    soi_table_df.iloc[:, col_indices] = soi_table_df.iloc[:, col_indices].fillna(
        method='ffill')
    col_indices = [0]
    soi_table_df.iloc[:, col_indices] = soi_table_df.iloc[:,
                                                          col_indices].fillna('No value')

    for index, row in soi_table_df.iterrows():
        cleanedList = [x for x in list(row) if str(x) != 'nan']
        row = pd.Series(cleanedList)
        soi_table_df.loc[index] = row


# drop nan col row
    # soi_table_df = soi_table_df.dropna(axis=0, thresh=4)
    soi_table_df = dropna_col_row(soi_table_df)
    count = shape(count, soi_table_df)
# rename col
    soi_table_df = rename_columns(soi_table_df)

    return soi_table_df


run_process_function(dataframes=dataframes, process_tables=process_tables,
                     process_tables_shape=process_tables_shape)

March 31 2013
1 : shape : (424, 31)
2 : shape : (424, 15)
3 : shape : (424, 15)
4 : shape : (424, 15)
5 : shape : (399, 15)
6 : shape : (399, 10)
7 : shape : (365, 10)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (365, 10)
June 30 2013
1 : shape : (432, 31)
2 : shape : (432, 23)
3 : shape : (432, 23)
4 : shape : (431, 23)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (403, 23)
6 : shape : (403, 18)
7 : shape : (365, 18)
8 : shape : (365, 10)
September 30 2013
1 : shape : (486, 41)
2 : shape : (486, 17)
3 : shape : (454, 17)
4 : shape : (440, 17)
5 : shape : (420, 17)
6 : shape : (420, 10)
7 : shape : (381, 10)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (381, 10)
December 31 2013
1 : shape : (443, 31)
2 : shape : (443, 23)
3 : shape : (443, 23)
4 : shape : (442, 23)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (416, 23)
6 : shape : (416, 18)
7 : shape : (378, 18)
8 : shape : (378, 10)
March 31 2014
1 : shape : (467, 31)
2 : shape : (467, 22)
3 : shape : (467, 22)
4 : shape : (462, 22)
5 : shape : (436, 22)
6 : shape : (436, 17)
7 : shape : (399, 17)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (399, 10)
June 30 2014
1 : shape : (487, 27)
2 : shape : (487, 21)
3 : shape : (487, 21)
4 : shape : (482, 21)
5 : shape : (451, 21)
6 : shape : (451, 15)
7 : shape : (412, 15)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (412, 10)
September 30 2014
1 : shape : (527, 33)
2 : shape : (527, 18)
3 : shape : (493, 18)
4 : shape : (487, 18)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (468, 18)
6 : shape : (468, 10)
7 : shape : (432, 10)
8 : shape : (432, 10)
December 31 2014
1 : shape : (506, 31)
2 : shape : (506, 22)
3 : shape : (506, 22)
4 : shape : (498, 22)
5 : shape : (466, 22)
6 : shape : (466, 18)
7 : shape : (430, 18)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (430, 10)
March 31 2015
1 : shape : (524, 31)
2 : shape : (524, 20)
3 : shape : (524, 20)
4 : shape : (517, 20)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (485, 20)
6 : shape : (485, 13)
7 : shape : (445, 13)
8 : shape : (445, 10)
June 30 2015
1 : shape : (571, 31)
2 : shape : (571, 20)
3 : shape : (571, 20)
4 : shape : (560, 20)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (528, 20)
6 : shape : (528, 13)
7 : shape : (473, 13)
8 : shape : (473, 10)
September 30 2015
1 : shape : (584, 33)
2 : shape : (584, 18)
3 : shape : (551, 18)
4 : shape : (542, 18)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (525, 18)
6 : shape : (525, 10)
7 : shape : (483, 10)
8 : shape : (483, 10)
December 31 2015
1 : shape : (505, 27)
2 : shape : (505, 21)
3 : shape : (505, 21)
4 : shape : (497, 21)
5 : shape : (461, 21)
6 : shape : (461, 20)
7 : shape : (425, 20)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (425, 10)
March 31 2016
1 : shape : (589, 27)
2 : shape : (589, 19)
3 : shape : (589, 19)
4 : shape : (581, 19)
5 : shape : (547, 19)
6 : shape : (547, 16)
7 : shape : (505, 16)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (505, 10)
June 30 2016
1 : shape : (564, 27)
2 : shape : (564, 19)
3 : shape : (564, 19)
4 : shape : (540, 19)
5 : shape : (509, 19)
6 : shape : (509, 14)
7 : shape : (469, 14)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


8 : shape : (469, 10)
September 30 2016
1 : shape : (700, 23)
2 : shape : (700, 22)
3 : shape : (654, 22)
4 : shape : (648, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (624, 22)
6 : shape : (624, 20)
7 : shape : (582, 20)
8 : shape : (582, 10)
December 31 2016
1 : shape : (646, 23)
2 : shape : (646, 21)
3 : shape : (646, 21)
4 : shape : (640, 21)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (619, 21)
6 : shape : (619, 20)
7 : shape : (579, 20)
8 : shape : (579, 10)
March 31 2017
1 : shape : (677, 23)
2 : shape : (677, 22)
3 : shape : (677, 22)
4 : shape : (669, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (648, 22)
6 : shape : (648, 22)
7 : shape : (606, 22)
8 : shape : (606, 10)
June 30 2017
1 : shape : (699, 23)
2 : shape : (699, 22)
3 : shape : (699, 22)
4 : shape : (694, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (675, 22)
6 : shape : (675, 22)
7 : shape : (635, 22)
8 : shape : (635, 10)
December 31 2017
1 : shape : (734, 41)
2 : shape : (734, 28)
3 : shape : (734, 28)
4 : shape : (725, 28)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (703, 28)
6 : shape : (703, 22)
7 : shape : (663, 22)
8 : shape : (663, 10)
March 31 2018
1 : shape : (766, 25)
2 : shape : (766, 22)
3 : shape : (766, 22)
4 : shape : (760, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (737, 22)
6 : shape : (737, 22)
7 : shape : (699, 22)
8 : shape : (699, 10)
June 30 2018
1 : shape : (817, 25)
2 : shape : (817, 23)
3 : shape : (817, 23)
4 : shape : (798, 23)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (774, 23)
6 : shape : (774, 20)
7 : shape : (737, 20)
8 : shape : (737, 10)
September 30 2018
1 : shape : (883, 24)
2 : shape : (883, 22)
3 : shape : (851, 22)
4 : shape : (845, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (819, 22)
6 : shape : (819, 18)
7 : shape : (778, 18)
8 : shape : (778, 10)
December 31 2018
1 : shape : (916, 41)
2 : shape : (916, 30)
3 : shape : (916, 30)
4 : shape : (906, 30)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (879, 30)
6 : shape : (879, 19)
7 : shape : (841, 19)
8 : shape : (841, 10)
March 31 2019
1 : shape : (951, 24)
2 : shape : (951, 22)
3 : shape : (951, 22)
4 : shape : (945, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (915, 22)
6 : shape : (915, 18)
7 : shape : (875, 18)
8 : shape : (875, 10)
June 30 2019
1 : shape : (994, 24)
2 : shape : (994, 22)
3 : shape : (994, 22)
4 : shape : (988, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (957, 22)
6 : shape : (957, 18)
7 : shape : (917, 18)
8 : shape : (917, 10)
September 30 2019
1 : shape : (1096, 25)
2 : shape : (1096, 23)
3 : shape : (1094, 23)
4 : shape : (1087, 23)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1055, 23)
6 : shape : (1055, 19)
7 : shape : (1013, 19)
8 : shape : (1013, 10)
December 31 2019
1 : shape : (1149, 25)
2 : shape : (1149, 22)
3 : shape : (1149, 22)
4 : shape : (1143, 22)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1108, 22)
6 : shape : (1108, 19)
7 : shape : (1063, 19)
8 : shape : (1063, 10)
March 31 2020
1 : shape : (1211, 65)
2 : shape : (1211, 32)
3 : shape : (1211, 32)
4 : shape : (1192, 32)
5 : shape : (1155, 32)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


6 : shape : (1155, 21)
7 : shape : (1106, 21)
8 : shape : (1106, 11)
June 30 2020
1 : shape : (1205, 54)
2 : shape : (1205, 27)
3 : shape : (1205, 27)
4 : shape : (1192, 27)
5 : shape : (1155, 27)
6 : shape : (1155, 18)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


7 : shape : (1105, 18)
8 : shape : (1105, 10)
September 30 2020
1 : shape : (1275, 64)
2 : shape : (1275, 26)
3 : shape : (1273, 26)
4 : shape : (1254, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1208, 26)
6 : shape : (1208, 19)
7 : shape : (1138, 19)
8 : shape : (1138, 10)
December 31 2020
1 : shape : (1318, 64)
2 : shape : (1318, 26)
3 : shape : (1318, 26)
4 : shape : (1299, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1249, 26)
6 : shape : (1249, 19)
7 : shape : (1175, 19)
8 : shape : (1175, 10)
March 31 2021
1 : shape : (1371, 64)
2 : shape : (1371, 26)
3 : shape : (1371, 26)
4 : shape : (1352, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1301, 26)
6 : shape : (1301, 19)
7 : shape : (1227, 19)
8 : shape : (1227, 10)
June 30 2021
1 : shape : (1456, 64)
2 : shape : (1456, 27)
3 : shape : (1456, 27)
4 : shape : (1436, 27)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1385, 27)
6 : shape : (1385, 19)
7 : shape : (1308, 19)
8 : shape : (1308, 10)
September 30 2021
1 : shape : (1572, 64)
2 : shape : (1572, 26)
3 : shape : (1569, 26)
4 : shape : (1550, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1498, 26)
6 : shape : (1498, 19)
7 : shape : (1419, 19)
8 : shape : (1419, 10)
December 31 2021
1 : shape : (1621, 64)
2 : shape : (1621, 26)
3 : shape : (1621, 26)
4 : shape : (1563, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1509, 26)
6 : shape : (1509, 19)
7 : shape : (1433, 19)
8 : shape : (1433, 10)
March 31 2022
1 : shape : (1684, 64)
2 : shape : (1684, 26)
3 : shape : (1684, 26)
4 : shape : (1666, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1611, 26)
6 : shape : (1611, 19)
7 : shape : (1531, 19)
8 : shape : (1531, 10)
June 30 2022
1 : shape : (1777, 64)
2 : shape : (1777, 26)
3 : shape : (1777, 26)
4 : shape : (1759, 26)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1702, 26)
6 : shape : (1702, 19)
7 : shape : (1624, 19)
8 : shape : (1624, 10)
September 30 2022
1 : shape : (1773, 76)
2 : shape : (1773, 28)
3 : shape : (1770, 28)
4 : shape : (1753, 28)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1694, 28)
6 : shape : (1694, 23)
7 : shape : (1617, 23)
8 : shape : (1617, 13)
December 31 2022
1 : shape : (1769, 76)
2 : shape : (1769, 28)
3 : shape : (1769, 28)
4 : shape : (1753, 28)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1696, 28)
6 : shape : (1696, 24)
7 : shape : (1620, 24)
8 : shape : (1620, 13)
March 31 2023
1 : shape : (1605, 76)
2 : shape : (1605, 28)
3 : shape : (1605, 28)
4 : shape : (1588, 28)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1534, 28)
6 : shape : (1534, 23)
7 : shape : (1460, 23)
8 : shape : (1460, 13)
June 30 2023
1 : shape : (1523, 76)
2 : shape : (1523, 28)
3 : shape : (1523, 28)
4 : shape : (1504, 28)


  lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)


5 : shape : (1454, 28)
6 : shape : (1454, 23)
7 : shape : (1383, 23)
8 : shape : (1383, 13)


In [148]:
process_tables_shape

{'March 31 2013': (365, 10),
 'June 30 2013': (365, 10),
 'September 30 2013': (381, 10),
 'December 31 2013': (378, 10),
 'March 31 2014': (399, 10),
 'June 30 2014': (412, 10),
 'September 30 2014': (432, 10),
 'December 31 2014': (430, 10),
 'March 31 2015': (445, 10),
 'June 30 2015': (473, 10),
 'September 30 2015': (483, 10),
 'December 31 2015': (425, 10),
 'March 31 2016': (505, 10),
 'June 30 2016': (469, 10),
 'September 30 2016': (582, 10),
 'December 31 2016': (579, 10),
 'March 31 2017': (606, 10),
 'June 30 2017': (635, 10),
 'December 31 2017': (663, 10),
 'March 31 2018': (699, 10),
 'June 30 2018': (737, 10),
 'September 30 2018': (778, 10),
 'December 31 2018': (841, 10),
 'March 31 2019': (875, 10),
 'June 30 2019': (917, 10),
 'September 30 2019': (1013, 10),
 'December 31 2019': (1063, 10),
 'March 31 2020': (1106, 11),
 'June 30 2020': (1105, 10),
 'September 30 2020': (1138, 10),
 'December 31 2020': (1175, 10),
 'March 31 2021': (1227, 10),
 'June 30 2021': (130