In [109]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import unicodedata
import os
import webbrowser
import html5lib
from openpyxl import workbook
from datetime import datetime
import requests
from fuzzywuzzy import process
import warnings
warnings.filterwarnings('ignore')

In [110]:
# Source workbook; every sheet in it is loaded below.
path = '../FeedBack_Master_tables_MFIC_Investment.xlsx'

# Open the workbook once, parse every sheet from that single handle, and close
# it again. keep_default_na=False is passed so pandas does not apply its default
# NaN markers while parsing.
with pd.ExcelFile(path) as xls:
    all_sheets = pd.read_excel(xls, keep_default_na=False, sheet_name=None)

# Sheet name with spaces replaced by underscores -> DataFrame for that sheet.
dataframes = {name.replace(' ', '_'): frame for name, frame in all_sheets.items()}
for sheet_name, sheet_df in all_sheets.items():
    print(f"DataFrame name: {sheet_name} : {sheet_df.shape}")


# Filled by run_process_function: cleaned frame and its shape, keyed like `dataframes`.
process_tables = {}
process_tables_shape = {}

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs('../FB_PT_csv_file', exist_ok=True)

# Filled by run_process_function: column labels of each cleaned frame.
headers = {}

DataFrame name: September 30 2023 : (983, 45)
DataFrame name: June 30 2023 : (1043, 45)
DataFrame name: March 31 2023 : (986, 45)
DataFrame name: December 31 2022 : (890, 45)
DataFrame name: September 30 2022 : (950, 45)
DataFrame name: June 30 2022 : (961, 50)
DataFrame name: March 31 2022 : (915, 50)
DataFrame name: December 31 2021 : (935, 50)
DataFrame name: September 30 2021 : (982, 50)
DataFrame name: June 30 2021 : (1050, 50)
DataFrame name: March 31 2021 : (1006, 50)
DataFrame name: December 31 2020 : (1023, 50)
DataFrame name: September 30 2020 : (1059, 50)
DataFrame name: June 30 2020 : (1109, 50)
DataFrame name: March 31 2020 : (1114, 50)
DataFrame name: December 31 2019 : (872, 27)
DataFrame name: September 30 2019 : (639, 21)
DataFrame name: June 30 2019 : (582, 21)
DataFrame name: March 31 2019 : (492, 21)
DataFrame name: December 31 2018 : (422, 21)
DataFrame name: September 30 2018 : (403, 26)
DataFrame name: June 30 2018 : (369, 21)
DataFrame name: March 31 2018 : (367

In [111]:
# Second element returned by process_table_function for each sheet, keyed by
# the sheet's key in `dataframes`.
Total_investments_per_filing = {}


def run_process_function(dataframes, process_tables, process_tables_shape):
    """Clean every sheet and write the results to one workbook plus one CSV per sheet.

    Args:
        dataframes: mapping of sheet key -> raw DataFrame.
        process_tables: dict updated in place with sheet key -> cleaned DataFrame.
        process_tables_shape: dict updated in place with sheet key -> cleaned shape.

    Side effects:
        Also fills the module-level ``headers`` and
        ``Total_investments_per_filing`` dicts, writes
        ``../FeedBack_process_tables_MFIC_Investment.xlsx`` (one sheet per key)
        and one CSV per key under ``../FB_PT_csv_file``.

    Returns:
        None.
    """
    path = '../FeedBack_process_tables_MFIC_Investment.xlsx'
    # The CSVs used to go to '../PT_csv_file/', a directory this notebook never
    # creates (it creates '../FB_PT_csv_file'). Write to the directory that
    # matches the rest of the "FeedBack" outputs and make sure it exists.
    csv_dir = '../FB_PT_csv_file'
    os.makedirs(csv_dir, exist_ok=True)

    # The context manager saves and closes the workbook exactly once, including
    # when a sheet fails part-way through, so no per-iteration save is needed.
    with pd.ExcelWriter(path=path, engine='openpyxl') as writer:
        for dataframe in dataframes:
            print(dataframe)
            cleaned, total_rows = process_table_function(dataframes[dataframe])
            process_tables[dataframe] = cleaned
            process_tables_shape[dataframe] = cleaned.shape
            headers[dataframe] = cleaned.columns.values
            Total_investments_per_filing[dataframe] = total_rows

            # Commas are removed from the key before it is used as a sheet/file name.
            safe_name = dataframe.replace(',', '')
            cleaned.to_excel(writer, sheet_name=safe_name, index=False)
            cleaned.to_csv(os.path.join(csv_dir, safe_name + '.csv'))


def shape(count, df):
    """Print ``df.shape`` tagged with the step number ``count``.

    Returns:
        ``count + 1``, so the caller can thread the step counter through.
    """
    dimensions = df.shape
    print(f"{count} : shape : {dimensions}")
    return count + 1


def dropna_col_row(df):
    """Return ``df`` without rows and columns that are entirely NaN.

    All-NaN rows are removed first, then all-NaN columns; the row index is
    renumbered from 0 after each step. The input frame is not modified.
    """
    return (
        df.dropna(axis=0, how='all')
        .reset_index(drop=True)
        .dropna(axis=1, how='all')
        .reset_index(drop=True)
    )


def drop_if_contain(pattern, df):
    """Return ``df`` without the rows in which any string cell matches ``pattern``.

    Args:
        pattern: regular-expression source string; searched case-insensitively
            anywhere in a cell (``re.search`` semantics).
        df: frame to filter; it is not modified and its index is not reset.

    Returns:
        A new DataFrame holding only the non-matching rows.

    Non-string cells (numbers, NaN, None) never match. The test is done per
    cell rather than through ``row.str`` because the ``.str`` accessor raises
    AttributeError on a row that holds no string values at all.
    """
    regex = re.compile(pattern, flags=re.IGNORECASE)
    keep = np.fromiter(
        (
            not any(isinstance(cell, str) and regex.search(cell) for cell in row)
            for row in df.itertuples(index=False, name=None)
        ),
        dtype=bool,
        count=len(df),
    )
    return df[keep]


def add_missing_beakets(value):
    """Strip a string and close an unclosed opening bracket.

    Args:
        value: any cell value.

    Returns:
        Non-strings unchanged. Strings with surrounding whitespace removed and,
        when they contain ``(`` but no ``)``, a ``)`` appended.
    """
    if not isinstance(value, str):
        return value

    # Strip before appending so trailing whitespace does not end up inside the
    # bracket: '(123 ' becomes '(123)', not '(123 )'.
    value = value.strip()
    if '(' in value and ')' not in value:
        value += ')'
    return value


def sentence_cosine_similarity(sentence1, sentence2):
    """Return the cosine similarity between the spaCy document vectors of two sentences.

    The "en_core_web_md" model is loaded on every call.

    NOTE(review): neither ``spacy`` nor ``cosine_similarity`` is imported in the
    visible import cell, so calling this raises NameError unless they are
    brought into scope elsewhere. ``cosine_similarity`` is presumably
    sklearn.metrics.pairwise.cosine_similarity; confirm before relying on it.
    """
    model = spacy.load("en_core_web_md")

    # One (1, dim) row vector per sentence, which is the 2-D input shape the
    # similarity call is given.
    first_vec, second_vec = (
        model(text).vector.reshape(1, -1) for text in (sentence1, sentence2)
    )

    # The result is indexed as a 1x1 matrix; return its single entry.
    return cosine_similarity(first_vec, second_vec)[0][0]


# Regex -> replacement table consumed by rename_columns_with_pattern through
# Series.replace(..., regex=True). Every key is anchored with ^...$ so only a
# whole header matches; "(\d+)" tolerates a numeric marker such as "(12)".
#
# NOTE(review): several keys are redundant because a later or earlier key with
# \s* already covers the \s+ spelling and maps to the same value (for example
# the two "Industry / Company" keys, the "Fair Value (n)" keys and the two
# "Par (n)" keys).
# NOTE(review): 'Par Amount' is the only replacement written with a space; the
# other multi-word replacements use underscores.
column_pattern = {
    r'^Industry\s*\((\d+)\)$': 'Industry',
    r'^Industry\s+/\s+Company$': 'Company',
    r'^Industry\s*/\s*Company$': 'Company',
    r'^Investment\s+Type$': 'Investment_Type',
    r'^Interest\s+Rate\s*\((\d+)\)$': 'Interest_Rate',
    r'^Maturity\s+Date$': 'Maturity_Date',
    r'^Par[\s/]+Shares\s*\((\d+)\)$': 'Par_Shares',
    r'^Cost\s*\((\d+)\)$': 'Cost',
    r'^Fair\s+Value\s*\((\d+)\)$': 'Fair_Value',
    r'^Fair\s+Value\s+\((\d+)\)$': 'Fair_Value',
    r'^Fair\s+Value\s*\(\d+\)\(\d+\)$': 'Fair_Value',
    r'^ASC\s+820\s+Level\s*\((\d+)\)$': 'ASC_820_Level',
    r'^Par\s+Amount\*?$': 'Par Amount',
    r'^Par\s+Amount\s*\((\d+)\)$': 'Par Amount',
    r'^Par\s+Amount\*\s*\((\d+)\)$': 'Par Amount',
    r'^Par\s+Amount\(\d+\)$': 'Par Amount',
    r'^Par\s+Amount\s*\(\d+\)$': 'Par Amount',
    r'^INVESTMENTS\s+IN\s+NON-CONTROLLED/NON-AFFILIATED\s+INVESTMENTS\d+(\.\d+)?\(\d+\)$': 'Company',
    r'^Fair\s+Value\s*\(\d+\)\s*\(\d+\)$': 'Fair_Value',
    r'^Fair\s+Value\s+\(\d+\)\s+\(\d+\)$': 'Fair_Value',
    r'^Fair\s+Value\s+\(\d+\)$': 'Fair_Value',
    r'^Fair\s+Value\s*\(\d+\)$': 'Fair_Value',
    r'^Par\s*\(\d+\)$': 'Par',
    r'^Par\s*\(\d+\)\s*$': 'Par'
}


def rename_columns_with_pattern(df):
    """Rename the columns of ``df`` in place using the ``column_pattern`` regex table.

    Each label is passed through ``Series.replace(..., regex=True)`` with the
    module-level ``column_pattern`` mapping. The frame is mutated and nothing
    is returned.
    """
    current_labels = df.columns.to_series()
    df.columns = current_labels.replace(to_replace=column_pattern, regex=True)


# Known header spellings (keys) mapped to the column name to use instead
# (values). Despite the name this is a dict; find_most_similar fuzzy-matches a
# raw header against the keys and returns the corresponding value.
# NOTE(review): 'ASC 820 Level' maps to a name with spaces, whereas
# column_pattern uses 'ASC_820_Level' for the same header.
header_list = {
    'Company': 'Company',
    'INVESTMENTS IN NON-CONTROLLED/NON-AFFILIATED INVESTMENTS': 'Industry/Company',
    'Industry/Company': 'Industry/Company',
    'Industry': 'Industry',
    'Investment Type': 'Investment_Type',
    'Interest Rate': 'Interest_Rate',
    'Maturity Date': 'Maturity_Date',
    'Par Shares': 'Par_Shares',
    'Par Amount(12)': 'Par_Shares',
    'Cost': 'Cost',
    'Fair Value': 'Fair_Value',
    'Investment': 'Investment',
    'ASC 820 Level': 'ASC 820 Level'
}

# Function to find the most similar header


def find_most_similar(header, header_list):
    """Map a raw header to the value stored under its closest key in ``header_list``.

    Args:
        header: raw column label; NaN/None is returned unchanged.
        header_list: mapping of known header spelling -> replacement name.

    Returns:
        ``header_list[best_key]`` where ``best_key`` is the key picked by
        ``process.extractOne`` for ``str(header)``. The header, the chosen key
        and its score are printed. No minimum score is applied here.
    """
    if not pd.isna(header):
        text = str(header)
        # extractOne yields (choice, score) for the candidate keys.
        best_key, similarity = process.extractOne(text, header_list.keys())
        print(text, best_key, similarity)
        return header_list[best_key]
    return header

In [119]:
def process_table_function(soi_table_df):
    """Clean one raw sheet and return ``(cleaned_frame, matched_rows)``.

    Steps, in order (the frame's shape is printed after most of them via
    ``shape``):

    1. Blank out cells that hold only a ``$``, remove newline characters, and
       turn cells equal to '€', 'C$', '£' or 'CAD' into NaN.
    2. Drop all-NaN rows/columns.
    3. Discard everything above the first row containing "Industry"
       (case-insensitive).
    4. Truncate after a row matching ``Net\\s+Assets`` (see the notes inline).
    5. Drop rows whose first column is NaN, turn '' into NaN and forward-fill
       the first column.
    6. Drop rows with exactly three non-NaN cells and left-pack the rest.
    7. Promote the first row to the header, drop repeated header rows and rows
       matching the "Spread Above|cost|Percentage|Above" / "^total" patterns.
    8. Map header names through ``find_most_similar`` and run
       ``add_missing_beakets`` on every cell.

    Args:
        soi_table_df: raw sheet as a DataFrame.

    Returns:
        Tuple ``(soi_table_df, Total_investments_per_filing_row)``: the cleaned
        frame, and the rows that matched ``Net\\s+Assets`` before truncation.

    Side effects:
        Writes the intermediate frame to ``test.csv`` in the working directory.
    """
    # Step counter used only to label the printed shapes.
    count = 1
    count = shape(count, soi_table_df)
    soi_table_df = soi_table_df.replace(
        r'^\s*\$\s*$', '', regex=True).replace(r'\n', '', regex=True)
    # Non-regex replace: only cells equal to the symbol itself become NaN.
    soi_table_df = soi_table_df.replace('€', np.nan)
    soi_table_df = soi_table_df.replace('C$', np.nan)
    soi_table_df = soi_table_df.replace('£', np.nan)
    soi_table_df = soi_table_df.replace('CAD', np.nan)

    soi_table_df = dropna_col_row(soi_table_df)
    # NOTE(review): DataFrame.apply hands the lambda whole columns (Series),
    # never str, so the isinstance check is always False and nothing is
    # stripped here. Cell-level stripping only happens at the end through
    # add_missing_beakets.
    soi_table_df = soi_table_df.apply(
        lambda x: x.strip() if isinstance(x, str) else x)
    count = shape(count, soi_table_df)

    # Drop the rows above the first one that mentions "Industry".
    pattern = r'Industry'
    matching_rows = soi_table_df.apply(
        lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)
    # Check if the pattern exists in the DataFrame
    if matching_rows.any():
        # idxmax() gives the label of the first True; labels equal positions
        # here because dropna_col_row reset the index.
        soi_table_df = soi_table_df.iloc[matching_rows.idxmax():].reset_index(
            drop=True)
    count = shape(count, soi_table_df)

    # Drop the rows below the end-of-table marker.
    # NOTE(review): the first pattern is overwritten immediately, so only
    # 'Net\s+Assets' is ever searched for.
    pattern = r'Total\s+Investments'
    pattern = r'Net\s+Assets'
    # Use the apply function to check if the pattern is in any column for each row
    matching_rows = soi_table_df.apply(
        lambda row: row.str.contains(pattern, flags=re.IGNORECASE, regex=True).any(), axis=1)

    # Kept for the caller: every row that matched, captured before truncation.
    Total_investments_per_filing_row = soi_table_df[matching_rows]

    # If the first match is within the first 20 rows, cut at the SECOND match;
    # otherwise cut at the first. .loc slicing is label-based and inclusive, so
    # the matching row itself is kept.
    # NOTE(review): index[0] raises IndexError when nothing matches, and
    # index[1] raises when the only match is within the first 20 rows.
    if soi_table_df[matching_rows].index[0] < 20:
        soi_table_df = soi_table_df.loc[:soi_table_df[matching_rows].index[1]].reset_index(
            drop=True)
    else:
        soi_table_df = soi_table_df.loc[:soi_table_df[matching_rows].index[0]].reset_index(
            drop=True)
    count = shape(count, soi_table_df)

    # drop nan col row
    soi_table_df = dropna_col_row(soi_table_df)
    count = shape(count, soi_table_df)

    # Drop rows whose first column is NaN (the original comment describes
    # these as sub-total rows). The index is not reset afterwards.
    soi_table_df = soi_table_df.dropna(subset=[soi_table_df.columns[0]])
    count = shape(count, soi_table_df)

    # Empty strings become NaN, then the first column is forward-filled.
    soi_table_df = soi_table_df.replace('', np.nan)
    col_indices = [0]
    soi_table_df.iloc[:, col_indices] = soi_table_df.iloc[:, col_indices].fillna(
        method='ffill')
    count = shape(count, soi_table_df)

    # Rows with exactly three non-NaN cells are dropped; every other row is
    # rewritten from its non-NaN cells only.
    # NOTE(review): str(x) != 'nan' also discards a cell holding the literal
    # text 'nan'.
    # NOTE(review): the replacement Series is indexed 0..k-1 and .loc assignment
    # aligns on column labels, so this presumably relies on the columns being
    # labelled with those integers - confirm against the input sheets.
    for index, row in soi_table_df.iterrows():
        cleanedList = [x for x in list(row) if str(x) != 'nan']
        if len(cleanedList) != 3:
            row = pd.Series(cleanedList)
            soi_table_df.loc[index] = row
        else:
            soi_table_df.drop(index, inplace=True)

    # NOTE(review): debug dump; another cell in this notebook writes to the
    # same "test.csv" path.
    soi_table_df.to_csv("test.csv")

    # Separate header and data
    soi_table_df = soi_table_df.rename(
        columns=soi_table_df.iloc[0]).drop(soi_table_df.index[0])
    # Drop rows whose first cell repeats the first column's header text.
    soi_table_df = soi_table_df[soi_table_df[soi_table_df.columns[0]]
                                != soi_table_df.columns[0]]

    soi_table_df = dropna_col_row(soi_table_df)
    count = shape(count, soi_table_df)

    # Remove rows containing any of these words, then rows with a cell that
    # starts with "Total"/"total" (drop_if_contain searches case-insensitively).
    pattern = r'(?:Spread\s*Above|cost|Percentage|Above)'
    soi_table_df = drop_if_contain(pattern, soi_table_df)
    pattern = r'^([Tt]otal)'
    soi_table_df = drop_if_contain(pattern, soi_table_df)
    count = shape(count, soi_table_df)

    # if soi_table_df.columns[0].replace(" ", "") == 'Industry/Company':
    #     try:
    #         soi_table_df.insert(0, 'Industry', '')

    #         for index, row in soi_table_df.iterrows():
    #             if row.nunique() == 2:
    #                 soi_table_df.at[index, 'Industry'] = row.iloc[1]
    #         soi_table_df['Industry'] = soi_table_df['Industry'].replace(
    #             '', np.nan)
    #         col_indices = [0]
    #         soi_table_df.iloc[:, col_indices] = soi_table_df.iloc[:, col_indices].fillna(
    #             method='ffill')

    #     except Exception as e:
    #         print(f'Industry/Company can\'t be found: {e}')

    # soi_table_df = soi_table_df.dropna(thresh=5)
    count = shape(count, soi_table_df)
    # rename_columns_with_pattern(soi_table_df)
    # Replace each header with the value stored under its closest key in
    # header_list (NaN headers are left as they are).
    soi_table_df.columns = [find_most_similar(
        header, header_list) for header in soi_table_df.columns]

    # Apply the function to each value in the DataFrame
    soi_table_df = soi_table_df.applymap(add_missing_beakets)

    # Local name only: this does not update the module-level `headers` dict.
    headers = soi_table_df.columns
    print(headers)

    return soi_table_df, Total_investments_per_filing_row


# run_process_function(dataframes=dataframes, process_tables=process_tables,
#                      process_tables_shape=process_tables_shape)

In [142]:
# Pick one sheet by its key (sheet name with spaces replaced by underscores).
df = dataframes['December_31_2013']

In [143]:
# NOTE(review): process_table_function returns a 2-tuple, so `df` stops being a
# DataFrame here; re-running this cell on its own would pass the tuple back in.
df = process_table_function(df)

1 : shape : (265, 15)
2 : shape : (265, 15)
3 : shape : (265, 15)
4 : shape : (226, 15)
5 : shape : (226, 15)
6 : shape : (226, 15)
7 : shape : (226, 15)
8 : shape : (192, 5)
9 : shape : (190, 5)
10 : shape : (190, 5)
INVESTMENTS IN NON-CONTROLLED/NON-AFFILIATED INVESTMENTS -137.3% INVESTMENTS IN NON-CONTROLLED/NON-AFFILIATED INVESTMENTS 95
Industry Industry 100
Par Amount* Par Amount(12) 95
Cost Cost 100
Fair  Value (1) Fair Value 95
Index(['Industry/Company', 'Industry', 'Par_Shares', 'Cost', 'Fair_Value'], dtype='object')


In [144]:
# Assumes `df` is still the tuple returned by process_table_function; element 0
# is the cleaned frame.
df[0].to_excel('test.xlsx')

In [None]:
# Dump one sheet from `dataframes` for inspection.
# NOTE(review): process_table_function also writes to "test.csv", so each
# overwrites the other's output.
dataframes['September_30_2023'].to_csv("test.csv")
# Bare last expression: displays the dict as the cell output.
Total_investments_per_filing

In [None]:
# Persist the collected rows: each frame is serialised with DataFrame.to_json()
# and the resulting strings are stored under their sheet key in one JSON object.
with open('Total_investments_per_filing.json', 'w') as json_file:
    json_file.write(json.dumps(
        {name: rows.to_json() for name, rows in Total_investments_per_filing.items()}
    ))

In [None]:
# Write each collected frame to its own CSV (no header row), named after its key.
# exist_ok=True replaces the check-then-create pair and is safe on re-run.
os.makedirs('../Total_investmensts', exist_ok=True)
# The loop variable is deliberately not called `df`, so the notebook-level `df`
# is not overwritten by this cell.
for total_invest, total_rows in Total_investments_per_filing.items():
    total_rows.to_csv(f'../Total_investmensts/{total_invest}.csv', header=False)

In [90]:
# Re-load the processed workbook. The file is opened once, all sheets are
# parsed from that handle, and it is closed again afterwards.
# Rebinds `path`, `xls` and `all_sheets` from the earlier loading cell.
path = '../FeedBack_process_tables_MFIC_Investment.xlsx'
with pd.ExcelFile(path) as xls:
    all_sheets = pd.read_excel(xls, keep_default_na=False, sheet_name=None)

In [91]:
# Rebuild `dataframes` from the sheets just loaded: key = sheet name with
# spaces replaced by underscores, value = that sheet's DataFrame.
dataframes = {name.replace(' ', '_'): frame for name, frame in all_sheets.items()}
# Report every sheet under its original name together with its shape.
for sheet_name, sheet_df in all_sheets.items():
    print(f"DataFrame name: {sheet_name} : {sheet_df.shape}")

DataFrame name: Total investments : (65, 7)
DataFrame name: September_30_2023 : (486, 8)
DataFrame name: June_30_2023 : (512, 8)
DataFrame name: March_31_2023 : (473, 8)
DataFrame name: December_31_2022 : (418, 8)
DataFrame name: September_30_2022 : (447, 8)
DataFrame name: June_30_2022 : (454, 9)
DataFrame name: March_31_2022 : (450, 9)
DataFrame name: December_31_2021 : (453, 9)
DataFrame name: September_30_2021 : (492, 9)
DataFrame name: June_30_2021 : (544, 9)
DataFrame name: March_31_2021 : (525, 9)
DataFrame name: December_31_2020 : (538, 9)
DataFrame name: September_30_2020 : (567, 9)
DataFrame name: June_30_2020 : (584, 9)
DataFrame name: March_31_2020 : (637, 9)
DataFrame name: December_31_2019 : (569, 9)
DataFrame name: September_30_2019 : (391, 9)
DataFrame name: June_30_2019 : (349, 9)
DataFrame name: March_31_2019 : (293, 9)
DataFrame name: December_31_2018 : (258, 9)
DataFrame name: September_30_2018 : (230, 9)
DataFrame name: June_30_2018 : (212, 8)
DataFrame name: March

In [92]:
def add_industy(soi_table_df):
    """Add an 'Industy' column and propagate each label down to the rows below it.

    A row with fewer than three distinct non-NaN values is treated as a label
    row: its second cell is copied into 'Industy'. Empty strings in that column
    are then turned into NaN and the column is forward-filled, so every row
    carries the most recent label above it.

    Args:
        soi_table_df: frame to annotate. It is modified in place.

    Returns:
        The same frame, with the 'Industy' column added/updated. (The column
        name is spelled this way on purpose: it is what the output files use.)
    """
    # Create the column up front so a sheet without any label row does not fail
    # with KeyError below. object dtype lets string labels be stored directly.
    if 'Industy' not in soi_table_df.columns:
        soi_table_df['Industy'] = pd.Series(
            np.nan, index=soi_table_df.index, dtype=object)

    for index, row in soi_table_df.iterrows():
        # nunique() ignores NaN, so the still-empty 'Industy' cell is not counted.
        if row.nunique() < 3:
            soi_table_df.at[index, 'Industy'] = row.iloc[1]

    # Forward-fill the label column itself. The fill used to be applied to
    # column 0, which left 'Industy' populated on the label rows only.
    soi_table_df['Industy'] = soi_table_df['Industy'].replace('', np.nan).ffill()
    return soi_table_df

In [93]:
def run_process_function(dataframes):
    """Run ``add_industy`` on every frame and write the results to one workbook.

    NOTE(review): this reuses the name of the three-argument
    ``run_process_function`` defined in an earlier cell and replaces it once
    this cell has run.

    Args:
        dataframes: mapping of sheet key -> DataFrame. The frames are modified
            in place by ``add_industy``.

    Side effects:
        Prints each key and writes
        ``../Industy_FeedBack_process_tables_MFIC_Investment.xlsx`` with one
        sheet per key (commas removed from the sheet name).

    Returns:
        None.
    """
    path = '../Industy_FeedBack_process_tables_MFIC_Investment.xlsx'
    # The context manager saves and closes the workbook once, also when a sheet
    # fails part-way through, so no explicit save per iteration is needed.
    with pd.ExcelWriter(path=path, engine='openpyxl') as writer:
        for dataframe in dataframes:
            print(dataframe)
            processed_table = add_industy(dataframes[dataframe])
            processed_table.to_excel(
                writer, sheet_name=dataframe.replace(',', ''), index=False)

# Runs over every entry of `dataframes`, which at this point holds all sheets of
# the processed workbook (the printed keys below include 'Total_investments').
run_process_function(dataframes)

Total_investments
September_30_2023
June_30_2023
March_31_2023
December_31_2022
September_30_2022
June_30_2022
March_31_2022
December_31_2021
September_30_2021
June_30_2021
March_31_2021
December_31_2020
September_30_2020
June_30_2020
March_31_2020
December_31_2019
September_30_2019
June_30_2019
March_31_2019
December_31_2018
September_30_2018
June_30_2018
March_31_2018
December_31_2017
September_30_2017
June_30_2017
March_31_2017
December_31_2016
September_30_2016
June_30_2016
March_31_2016
December_31_2015
September_30_2015
June_30_2015
March_31_2015
December_31_2014
September_30_2014
June_30_2014
March_31_2014
December_31_2013
September_30_2013
March_31_2013
June_30_2013


In [97]:
# Display the rows collected for one filing (bare expression -> cell output).
Total_investments_per_filing['June_30_2017']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
187,Total Investments before Cash Equivalents-156....,,,,,,,,,2605423.0,...,,2316708.0,,,,,,,,
189,Total Investments after Cash Equivalents-157.0...,,,,,,,,,2615206.0,...,,2326491.0,,,,,,,,
223,Industry Classification,Percentage of Total Investments (at Fair Value...,,,,,,,,,...,,,,,,,,,,
249,Total Investments,100.0%,,,,,,,,,...,,,,,,,,,,


In [103]:
import pandas as pd

# Build a DataFrame from hand-entered rows and save it to "investment_data.xlsx".
# Each inner list is one row, in the order given by `columns` below; None marks
# an empty cell.
# NOTE(review): this rebinds the notebook-level name `df`.
data = [
    ["Warrants Gryphon Colleges Corp., Common Stock",
        "Education", "N/A", "N/A", 9820, 98, None],
    ["Warrants Invuity, Inc.", "Healthcare & Pharmaceuticals",
        "N/A", "N/A", 16873, 80, 94],
    [None, None, None, None, 1681, 94, None],
    ["Total Equity—0.4%", None, None, None, 89529, 6555, None],
    ["Total Non-Controlled/Non-Affiliated Investments—94.6%",
        None, None, None, 1510980, 1402409, None],
    [None, None, None, None, None, None, None],
    ["Corporate Debt", None, None, None, None, None, None],
    ["Pelican Energy, LLC", "Energy – Oil & Gas",
        "10.00% PIK Toggle (10.00% Cash)", "12/31/18", 31141, 26665, 15417],
    ["Solarplicity Group Limited", "Energy – Electricity",
        "8.00% PIK Toggle (8.00% Cash)", "11/30/22", 125468, 146598, 119426],
    [None, None, None, None, 173263, 134843, None],
    [None, None, None, None, 173263, 134843, None],
    [None, None, None, None, None, None, None],
    ["Unsecured Debt", None, None, None, None, None, None],
    ["Solarplicity UK Holdings Limited", "Energy - Electricity",
        "8.00% PIK Toggle (8.00% Cash)", "2/24/22", 2000, 2499, 2501],
    ["Venoco, Inc.", "Energy - Oil & Gas",
        "10.00% PIK", "7/25/17", 338, 337, None],
    [None, None, None, None, 2836, 2501, None],
    [None, None, None, None, 176099, 137344, None],
    [None, None, None, None, None, None, None],
    ["Structured Products and Other", None, None, None, None, None, None],
    ["Golden Bear 2016-R, LLC, Membership Interests",
        "Diversified Investment Vehicles, Banking, Finance, Real Estate", "N/A", "9/20/42", None, 16459, 17066],
    ["Ivy Hill Middle Market Credit Fund IX, Ltd., Subordinated Notes",
        "Diversified Investment Vehicles, Banking, Finance, Real Estate", "13.34%", "10/18/25", 12500, 9158, 9537],
    ["Ivy Hill Middle Market Credit Fund X, Ltd., Subordinated Notes",
        "Diversified Investment Vehicles, Banking, Finance, Real Estate", "11.25%", "7/18/27", 14000, 11078, 10841],
    [None, None, None, None, 36695, 37444, None],
    [None, None, None, None, None, None, None],
    ["Equity", None, None, None, None, None, None],
    ["Preferred Equity", None, None, None, None, None, None],
    ["Renew Financial LLC (f/k/a Renewable Funding, LLC), Series B Preferred Stock",
     "Energy - Electricity", "N/A", "N/A", 1506, 8343, 19383],
    ["Renew Financial LLC (f/k/a Renewable Funding, LLC), Series D Preferred Stock",
     "Energy - Electricity", "N/A", "N/A", 437, 5568, 6254],
    [None, None, None, None, 13911, 25637, None],
    [None, None, None, None, None, None, None],
    ["Common Equity/Interests", None, None, None, None, None, None],
    ["AIC SPV Holdings I, LLC, Membership Interests",
        "Diversified Investment Vehicles, Banking, Finance, Real Estate", "N/A", "N/A", None, 69040, 24285]
]

# Column labels, one per position in each row of `data`.
columns = ["Investment", "Industry", "Interest Rate",
           "Maturity Date", "Par Value", "Cost", "Fair Value"]
df = pd.DataFrame(data, columns=columns)

# Save the DataFrame to an Excel file
df.to_excel("investment_data.xlsx", index=False)