In [22]:
# Folder that is scanned for the quarterly-report PDF files processed by this notebook.
Quarterly_Reports_Dir = '/home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2'
# Label embedded in the name of the output pickle file.
Simple_Dir_Name = "2024_Q2_Buyout"

# Folder the pickled list of loaded documents is written to (created if missing).
Save_Pickle_Directory="/home/jovyan/shared/projects/Sep_Iter/Data/Simple_Dir_Data"
# Path of the metadata CSV file (a file, despite the "_Dir" name); it is read with
# pandas and matched against the report file names on its 'Document Name' column.
Metadata_File_Dir = '/home/jovyan/shared/projects/Sep_Iter/Metadata/Data/Metadata_Final/metadata_final.csv'


In [23]:
import os
import re
import hashlib
import logging
import pickle

# Configure root logging: timestamp, severity, then the message
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Folder the clean-up functions in this cell operate on
directory = Quarterly_Reports_Dir

# Function to compute file hash to detect duplicates
def compute_file_hash(file_path):
    """Return the SHA-256 hex digest of a file's contents, or None on failure.

    The file is opened in binary mode and consumed in 8192-byte chunks.
    Any exception raised while opening or reading is logged at ERROR level
    and swallowed, so callers must be prepared for a ``None`` result.
    """
    digest = hashlib.sha256()
    try:
        with open(file_path, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (end of file)
            for block in iter(lambda: stream.read(8192), b''):
                digest.update(block)
        return digest.hexdigest()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        return None

# Remove duplicate files with preference logic
def remove_duplicate_files(directory):
    """Group the PDFs in ``directory`` by content hash and prune duplicate groups.

    Only entries whose name ends with ".pdf" are considered. Files whose hash
    could not be computed are left out of the grouping. Every group holding
    more than one path is handed to ``remove_files_with_suffix``.
    """
    paths_by_hash = {}
    pdf_names = (name for name in os.listdir(directory) if name.endswith(".pdf"))
    for name in pdf_names:
        full_path = os.path.join(directory, name)
        digest = compute_file_hash(full_path)
        if not digest:
            continue
        paths_by_hash.setdefault(digest, []).append(full_path)

    # Resolve duplicates
    for group in paths_by_hash.values():
        if len(group) > 1:
            remove_files_with_suffix(group)

def remove_files_with_suffix(file_paths):
    """Delete "(N)"-suffixed copies from a group of duplicate files.

    Args:
        file_paths: list of paths that are treated as duplicates of each other.

    If at least one path does NOT end in "(<digits>).pdf", every path that
    does end that way is deleted. If all paths carry the suffix, nothing is
    removed.

    A deletion that fails (file already gone, permission denied, ...) is
    logged at ERROR level and skipped, so one bad file does not abort the
    rest of the clean-up.
    """
    pattern = re.compile(r'\(\d+\)\.pdf$')
    suffixed_paths = [path for path in file_paths if pattern.search(path)]

    # Equal lengths mean no suffix-free copy exists, so there is nothing to prefer.
    if len(suffixed_paths) == len(file_paths):
        return

    for file_path in suffixed_paths:
        logging.info(f"Removing file with suffix as duplicate exists without suffix: {os.path.basename(file_path)}")
        try:
            os.remove(file_path)
        except OSError as e:
            logging.error(f"Failed to remove {os.path.basename(file_path)}: {e}")

# Rename files based on the provided regex and convert filenames to lowercase
def rename_files(directory):
    """Normalise the names of the PDF files in ``directory``.

    For every entry ending in ".pdf", a run of two or more dots directly
    before "pdf" is collapsed to a single dot and the whole name is
    lower-cased. Entries whose name is already normalised are left alone.

    A rename is skipped with a WARNING when a different file already holds
    the target name: ``os.rename`` would otherwise silently replace that file
    on POSIX systems. Any other failure is logged at ERROR level and the loop
    continues with the next entry.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".pdf"):
            continue

        new_filename = re.sub(r"\.{2,}(?=pdf)", ".", filename).lower()  # Convert to lowercase as well
        if new_filename == filename:
            continue

        old_path = os.path.join(directory, filename)
        new_path = os.path.join(directory, new_filename)
        try:
            # samefile() keeps case-only renames working on case-insensitive
            # filesystems, where the target "exists" but is the source itself.
            if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
                logging.warning(f"Skipped rename, target already exists: {filename} -> {new_filename}")
                continue
            os.rename(old_path, new_path)
            logging.info(f"Renamed: {filename} -> {new_filename}")
        except Exception as e:
            logging.error(f"Failed to rename {filename}: {e}")

# Execute the functions
# isdir rather than exists: a path that points at a regular file would pass
# an existence check and then fail inside os.listdir.
if os.path.isdir(directory):
    # Duplicates are pruned first, then the remaining names are normalised.
    remove_duplicate_files(directory)
    rename_files(directory)
else:
    logging.error(f"The directory {directory} does not exist.")


2024-09-04 00:35:09,382 - INFO - Renamed: 2024.06.30.Rpt.Hg 8 D..pdf -> 2024.06.30.rpt.hg 8 d.pdf
2024-09-04 00:35:09,392 - INFO - Renamed: 2024.06.30.2.Ltr.Green Equity VII Side..pdf -> 2024.06.30.2.ltr.green equity vii side.pdf
2024-09-04 00:35:09,397 - INFO - Renamed: 2024.06.30.Rpt.Innovation Alpha..pdf -> 2024.06.30.rpt.innovation alpha.pdf
2024-09-04 00:35:09,402 - INFO - Renamed: 2024.06.30.Rpt.Hg Mercury 4 A..pdf -> 2024.06.30.rpt.hg mercury 4 a.pdf
2024-09-04 00:35:09,408 - INFO - Renamed: 2024.06.30.Ltr.Trident VII Parallel..pdf -> 2024.06.30.ltr.trident vii parallel.pdf
2024-09-04 00:35:09,413 - INFO - Renamed: 2024.06.30.Rpt.WP Nexus CoInv 2..pdf -> 2024.06.30.rpt.wp nexus coinv 2.pdf
2024-09-04 00:35:09,418 - INFO - Renamed: 2024.06.30.Rpt.GCP XI..pdf -> 2024.06.30.rpt.gcp xi.pdf
2024-09-04 00:35:09,431 - INFO - Renamed: 2024.06.30.Rpt.PAI VIII..pdf -> 2024.06.30.rpt.pai viii.pdf
2024-09-04 00:35:09,435 - INFO - Renamed: 2024.06.30.Ltr.KKR Asian III..pdf -> 2024.06.30.ltr.

In [24]:
import pandas as pd
import os
import re

# Location of the metadata CSV (defined in the configuration cell)
file_path = Metadata_File_Dir

# Load the CSV, discard the '_merge' and 'Date' columns, then parse
# 'quarter_end_date' as pandas datetimes and cast 'Vintage' to int.
metadata_df = (
    pd.read_csv(file_path)
    .drop(columns=['_merge', 'Date'])
    .assign(
        quarter_end_date=lambda frame: pd.to_datetime(frame['quarter_end_date']),
        Vintage=lambda frame: frame['Vintage'].astype(int),
    )
)
metadata_df


Unnamed: 0,GPID,GP_Name,FundID,Fund_Name,omni_fund_id,omni_fund_name,Vintage,Fund_Size_USD,Asset_Class,Sector,Sub_Sector,Region,Country,primary_country,OMNI Fund ID,Document Name,quarter_end_date
0,5720,Wavemaker Partners (fka Siemer Ventures),42475,Wavemaker SEA Fund IV,10252,"Wavemaker Pacific IV, L.P.",2021,136.069625,Private Equity,Venture Capital,Early Stage,North America,United States,1,10252,2024.03.31.rpt.wavemaker pacific iv.pdf,2024-03-31
1,13712,The Venture City,39063,The Venture City Fund II,12341,"The Venture City Fund II, L.P.",2020,120.000000,Private Equity,Venture Capital,Early Stage,North America,United States,1,12341,2024.03.31.rpt.venture city ii.pdf,2024-03-31
2,13712,The Venture City,33345,The Venture City Fund I,12340,"The Venture City Fund I, L.P.",2017,50.000000,Private Equity,Venture Capital,Early Stage,North America,United States,1,12340,2024.03.31.rpt.venture city i.pdf,2024-03-31
3,3263,Valor Equity Partners,42247,Valor Equity Partners VI,10706,"Valor Equity Partners VI, L.P.",2022,2350.000000,Private Equity,Venture Capital,Expansion,North America,United States,1,10706,2024.03.31.rpt.valor vi.pdf,2024-03-31
4,3263,Valor Equity Partners,35468,Valor Equity Partners Fund V,6354,"Valor Equity Partners V, L.P.",2020,1700.000000,Private Equity,Venture Capital,Balanced,North America,United States,1,6354,2024.03.31.rpt.valor v.pdf,2024-03-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15579,3493,TPG Capital,8266,Texas Pacific Group Partners V,1535,"TPG Partners V, L.P.",2006,15372.000000,Private Equity,Buyout,Global Buyout,North America,United States,1,1535,2022.06.30.rpt.tpg v.pdf,2022-06-30
15580,3493,TPG Capital,9946,TPG Partners VI,1536,"TPG Partners VI, L.P.",2008,18873.000000,Private Equity,Buyout,Global Buyout,North America,United States,1,1536,2022.06.30.rpt.tpg vi.pdf,2022-06-30
15581,3493,TPG Capital,19584,TPG Partners VII,2111,"TPG Partners VII, L.P.",2015,10495.060000,Private Equity,Buyout,Global Buyout,North America,United States,1,2111,2022.06.30.rpt.tpg vii.pdf,2022-06-30
15582,3493,TPG Capital,29534,TPG Partners VIII,4652,"TPG Partners VIII, L.P.",2019,11200.000000,Private Equity,Buyout,Global Buyout,North America,United States,1,4652,2022.06.30.rpt.tpg viii.pdf,2022-06-30


In [25]:
# Directory listing with any run of dots before the final "pdf" collapsed to one dot
trailing_dots = re.compile(r"\.{2,}(?=pdf$)")
file_names = [trailing_dots.sub(".", entry) for entry in os.listdir(Quarterly_Reports_Dir)]
len(file_names)

366

In [26]:
# Keep only the metadata rows whose 'Document Name' is one of the listed files
has_listed_file = metadata_df['Document Name'].isin(file_names)
matching_df = metadata_df.loc[has_listed_file]
matching_df

Unnamed: 0,GPID,GP_Name,FundID,Fund_Name,omni_fund_id,omni_fund_name,Vintage,Fund_Size_USD,Asset_Class,Sector,Sub_Sector,Region,Country,primary_country,OMNI Fund ID,Document Name,quarter_end_date
2510,10319,New State Capital Partners,40419,New State Capital Partners III,7337,"New State Capital Partners III, L.P.",2021,450.0,Private Equity,Buyout,Small Buyout,North America,United States,1,7337,2024.06.30.2.rpt.new state iii.pdf,2024-06-30
2511,7411,Reverence Capital Partners,29050,Reverence Capital Partners Opportunities Fund II,5054,Reverence Capital Partners Opportunities Fund ...,2018,1200.0,Private Equity,Buyout,Small Buyout,North America,United States,1,5054,2024.06.30.2.rpt.reverence ii.pdf,2024-06-30
2512,1494,Halyard Capital,4061,Halyard Capital Fund II,734,"Halyard Capital Fund II, L.P.",2007,270.0,Private Equity,Buyout,Small Buyout,North America,United States,1,734,2024.06.30.cab.rpt.halyard ii.pdf,2024-06-30
2514,6068,Grain Management,29683,Grain Communications Opportunity Fund II,4912,"Grain Communications Opportunity Fund II, L.P.",2018,1138.0,Private Equity,Buyout,Small Buyout,North America,United States,1,4912,2024.06.30.rpt.grain comms ii.pdf,2024-06-30
2516,1615,ICV Partners,12182,ICV Partners III,767,"ICV Partners III, L.P.",2013,400.0,Private Equity,Buyout,Small Buyout,North America,United States,1,767,2024.06.30.rpt.icv iii.pdf,2024-06-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13886,3203,TSG Consumer Partners,25178,TSG Consumer Partners 7B,2589,TSG7 B L.P.,2016,510.0,Private Equity,Buyout,Small Buyout,North America,United States,1,2589,2024.06.30.ltr.tsg7 b.pdf,2024-06-30
13887,3361,Waud Capital Partners,11170,Waud Capital Partners III,1600,Waud Capital Partners III,2011,443.0,Private Equity,Buyout,Small Buyout,North America,United States,1,1600,2024.06.30.ltr.waud iii.pdf,2024-06-30
13888,3361,Waud Capital Partners,19878,Waud Capital Partners IV,2629,"Waud Capital Partners IV, L.P.",2017,909.0,Private Equity,Buyout,Small Buyout,North America,United States,1,2629,2024.06.30.ltr.waud iv.pdf,2024-06-30
13889,3384,WestView Capital Partners,23747,WestView Capital Partners III,10261,"WestView Capital Partners III, L.P.",2014,439.0,Private Equity,Buyout,Small Buyout,North America,United States,1,10261,2024.06.30.ltr.westview iii.pdf,2024-06-30


In [27]:
# Listed file names that have no row in the metadata table, as a list
known_document_names = set(metadata_df['Document Name'])
non_matching_file_names = list(set(file_names) - known_document_names)

non_matching_file_names

['2024.06.30.2.ltr.green equity vii side.pdf',
 '2024.06.30.ltr.ohcp vi te 892.pdf']

In [28]:
import os
import datetime
import numpy as np
import pandas as pd
from typing import Dict  # Import Dict from typing

# Assuming `matching_df` is your DataFrame
# matching_df = pd.read_csv('path_to_your_csv.csv') # Load the DataFrame if not already loaded

def make_json_serializable(metadata):
    """Convert NumPy scalars and date/datetime values in ``metadata`` in place.

    NumPy scalar values (``np.generic``) are replaced by the result of their
    ``.item()`` call; ``datetime.date`` / ``datetime.datetime`` values are
    replaced by their ISO-format string. All other values are left as they
    are. The same dictionary object that was passed in is returned.
    """
    converted = {}
    for key in metadata:
        value = metadata[key]
        if isinstance(value, np.generic):
            converted[key] = value.item()
        elif isinstance(value, (datetime.date, datetime.datetime)):
            converted[key] = value.isoformat()
    # Only existing keys are overwritten, so key order is unchanged.
    metadata.update(converted)
    return metadata

# Function to extract metadata from DataFrame for a given document name
def add_fund_metadata(document_name: str, matching_df: pd.DataFrame) -> Dict:
    """Build the metadata dictionary for one document from ``matching_df``.

    Args:
        document_name: value to look up in the 'Document Name' column.
        matching_df: frame holding the 'Document Name' column plus the fund
            columns read below.

    Returns:
        A dictionary of fund fields taken from the first matching row and
        passed through ``make_json_serializable``; a column that is absent
        yields ``None`` for its key. An empty dictionary is returned when no
        row matches.
    """
    rows = matching_df[matching_df['Document Name'] == document_name]
    if rows.empty:
        return {}  # Return an empty dictionary if no metadata is found

    record = rows.iloc[0].to_dict()
    # (metadata key, source column) pairs, listed in output order
    field_map = (
        ('fund_name', 'omni_fund_name'),
        ('gp_id', 'GPID'),
        ('gp_name', 'GP_Name'),
        ('omni_fund_id', 'omni_fund_id'),
        ('vintage_year', 'Vintage'),
        ('fund_size', 'Fund_Size_USD'),
        ('asset_class', 'Asset_Class'),
        ('sector', 'Sector'),
        ('sub_sector', 'Sub_Sector'),
        ('report_date', 'quarter_end_date'),
        ('primary_region', 'Region'),
        ('primary_country', 'Country'),
    )
    metadata = {key: record.get(column) for key, column in field_map}
    return make_json_serializable(metadata)

# Load every entry of `file_names` that has a metadata row and attach that metadata
from llama_index.core import SimpleDirectoryReader
import time

directory = Quarterly_Reports_Dir # Adjust as needed

start_time = time.time()/60
documents_pages = []
success_count = 0

for pdf in file_names:
    try:
        doc_path = os.path.join(directory, pdf)
        fund_name_metadata = add_fund_metadata(pdf, matching_df)  # Get metadata from matching_df
        if not fund_name_metadata:
            # Nothing to attach: the file is not loaded at all
            print(f"No metadata found for {pdf}")
            continue

        print(f"Processing: {doc_path}")
        reader = SimpleDirectoryReader(
            input_files=[doc_path],
            filename_as_id=False,
            # The callback ignores its argument and returns this file's metadata
            file_metadata=lambda x: fund_name_metadata
        )
        doc = reader.load_data()

        # Collect everything the reader returned for this file
        documents_pages.extend(doc)

        print(f"Metadata attached for {pdf}")
        success_count += 1
    except Exception as e:
        print(f"Error processing {pdf}: {e}")


end_time = time.time()/60
elapsed_minutes = round(end_time - start_time, 2)
print('------------------------')
print(f'Time taken (mins): {elapsed_minutes}')
print(f'Total files processed: {len(file_names)}')
print(f'Successfully attached metadata to {success_count} files')


Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.pine brook.pdf
Metadata attached for 2024.06.30.rpt.pine brook.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.bain xiii.pdf
Metadata attached for 2024.06.30.ltr.bain xiii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.trident vi.pdf
Metadata attached for 2024.06.30.rpt.trident vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.wellspring v.pdf
Metadata attached for 2024.06.30.ltr.wellspring v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.carlyle asia iv.pdf
Metadata attached for 2024.06.30.rpt.carlyle asia iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.llr iv.pdf
Metadata attached for 2024.06.30.rpt.llr iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.industrial growth vi.pdf
Metadata attached for 2



Metadata attached for 2024.06.30.rpt.monomoy iii aiv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.adelis iii.pdf
Metadata attached for 2024.06.30.ltr.adelis iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.trilantic iv.pdf
Metadata attached for 2024.06.30.rpt.trilantic iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.river viii.pdf
Metadata attached for 2024.06.30.rpt.river viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.trive iii.pdf
Metadata attached for 2024.06.30.ltr.trive iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.sterling group iv parallel.pdf
Metadata attached for 2024.06.30.rpt.sterling group iv parallel.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.rlj i.pdf
Metadata attached for 2024.06.30.rpt.rlj i.pdf
No metadata found for 2024.06.30.ltr.ohcp vi te 8



Metadata attached for 2024.06.30.rpt.hg saturn a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.bridgepoint v.pdf




Metadata attached for 2024.06.30.rpt.bridgepoint v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.f3 presto hvd cv.pdf
Metadata attached for 2024.06.30.ltr.f3 presto hvd cv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.monomoy iv.pdf
Metadata attached for 2024.06.30.rpt.monomoy iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.american viii.pdf
Metadata attached for 2024.06.30.ltr.american viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.platte river iii.pdf
Metadata attached for 2024.06.30.rpt.platte river iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.align ii.pdf
Metadata attached for 2024.06.30.rpt.align ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.draft.rpt.ta ap vii.pdf
Metadata attached for 2024.06.30.draft.rpt.ta ap vii.pdf
Processing: /home/jovyan/shared/project



Metadata attached for 2024.06.30.rpt.llr iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.actis global 4.pdf
Metadata attached for 2024.06.30.ltr.actis global 4.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.clp coinv 2018.pdf
Metadata attached for 2024.06.30.ltr.clp coinv 2018.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.axcel vii.pdf
Metadata attached for 2024.06.30.rpt.axcel vii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.genstar viii.pdf
Metadata attached for 2024.06.30.rpt.genstar viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.atlas capital ii.pdf
Metadata attached for 2024.06.30.rpt.atlas capital ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.lindsay gold iv.pdf
Metadata attached for 2024.06.30.ltr.lindsay gold iv.pdf
Processing: /home/jovyan/shared/projects/



Metadata attached for 2024.06.30.rpt.carlyle dash coinv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.flash.rpt.acof v.pdf




Metadata attached for 2024.06.30.flash.rpt.acof v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.grain comms iii.pdf
Metadata attached for 2024.06.30.rpt.grain comms iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.jfl vi.pdf
Metadata attached for 2024.06.30.rpt.jfl vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.excellere ii.pdf
Metadata attached for 2024.06.30.rpt.excellere ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.gcp ix.pdf
Metadata attached for 2024.06.30.rpt.gcp ix.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.cab.rpt.halyard ii.pdf
Metadata attached for 2024.06.30.cab.rpt.halyard ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.sterling group v.pdf
Metadata attached for 2024.06.30.rpt.sterling group v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/202



Metadata attached for 2024.06.30.rpt.hg 8 d.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.pai viii.pdf




Metadata attached for 2024.06.30.rpt.pai viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.sbj.pdf
Metadata attached for 2024.06.30.ltr.sbj.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.innovation alpha.pdf
Metadata attached for 2024.06.30.rpt.innovation alpha.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.wp nexus coinv 2.pdf
Metadata attached for 2024.06.30.rpt.wp nexus coinv 2.pdf
No metadata found for 2024.06.30.2.ltr.green equity vii side.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.avista healthcare ii.pdf
Metadata attached for 2024.06.30.rpt.avista healthcare ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.gcp xi.pdf
Metadata attached for 2024.06.30.rpt.gcp xi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.trident vii parallel.pdf
Metadata attached for 2024.06.30.lt



Metadata attached for 2024.06.30.ltr.amulet iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.bridgepoint vii.pdf




Metadata attached for 2024.06.30.rpt.bridgepoint vii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.draft.rpt.ta xii a.pdf
Metadata attached for 2024.06.30.draft.rpt.ta xii a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hg genesis 10 a.pdf
Metadata attached for 2024.06.30.rpt.hg genesis 10 a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.apollo x.mda.pdf
Metadata attached for 2024.06.30.rpt.apollo x.mda.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.permira iv.pdf
Metadata attached for 2024.06.30.rpt.permira iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.aea vii.pdf
Metadata attached for 2024.06.30.rpt.aea vii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hcpe iv.pdf
Metadata attached for 2024.06.30.rpt.hcpe iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/



Metadata attached for 2024.06.30.ltr.trive iii a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.bkf coinv.pdf




Metadata attached for 2024.06.30.rpt.bkf coinv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.reverence v parallel.pdf
Metadata attached for 2024.06.30.rpt.reverence v parallel.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.2.ltr.green equity cf iii-c.pdf
Metadata attached for 2024.06.30.2.ltr.green equity cf iii-c.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.grain comms ii.pdf
Metadata attached for 2024.06.30.rpt.grain comms ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.jfl v.pdf
Metadata attached for 2024.06.30.rpt.jfl v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.pritzker private cap iii a.pdf
Metadata attached for 2024.06.30.ltr.pritzker private cap iii a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.gcp x.pdf
Metadata attached for 2024.06.30.rpt.gcp x.pdf
Processing: /



Metadata attached for 2024.06.30.ltr.amulet ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.bridgepoint vi.pdf




Metadata attached for 2024.06.30.rpt.bridgepoint vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.aea vi.pdf
Metadata attached for 2024.06.30.rpt.aea vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hoc lp.pdf
Metadata attached for 2024.06.30.rpt.hoc lp.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.apollo vi.mda.pdf
Metadata attached for 2024.06.30.rpt.apollo vi.mda.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.tdr iii a.pdf
Metadata attached for 2024.06.30.rpt.tdr iii a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.american vii b.pdf
Metadata attached for 2024.06.30.ltr.american vii b.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hahn iv supp.pdf
Metadata attached for 2024.06.30.rpt.hahn iv supp.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.



Metadata attached for 2024.06.30.rpt.trident vi parallel.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.bridgepoint iv.pdf




Metadata attached for 2024.06.30.rpt.bridgepoint iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.hig europe ii.pdf
Metadata attached for 2024.06.30.ltr.hig europe ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hcpe v.pdf
Metadata attached for 2024.06.30.rpt.hcpe v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.waud iv.pdf
Metadata attached for 2024.06.30.ltr.waud iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.sterling investment iii.pdf
Metadata attached for 2024.06.30.ltr.sterling investment iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.francisco vii.pdf
Metadata attached for 2024.06.30.ltr.francisco vii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.2.ltr.jade equity off.pdf




Metadata attached for 2024.06.30.2.ltr.jade equity off.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.ssid cv.pdf
Metadata attached for 2024.06.30.rpt.ssid cv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.sterling group foundation.pdf
Metadata attached for 2024.06.30.rpt.sterling group foundation.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.american industrial viii.pdf
Metadata attached for 2024.06.30.ltr.american industrial viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.t ix boost coinv.beeline coinv.pdf
Metadata attached for 2024.06.30.rpt.t ix boost coinv.beeline coinv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.comvest iv.pdf
Metadata attached for 2024.06.30.ltr.comvest iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.warren elido.pdf
Metadata attached for 2024.06



Metadata attached for 2024.06.30.ltr.unigestion direct ii asia.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.breck i.pdf
Metadata attached for 2024.06.30.ltr.breck i.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.flash.rpt.acof vi.pdf




Metadata attached for 2024.06.30.flash.rpt.acof vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.trident ix parallel.pdf
Metadata attached for 2024.06.30.ltr.trident ix parallel.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.trive structured i.pdf
Metadata attached for 2024.06.30.ltr.trive structured i.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.ta xv a.pdf
Metadata attached for 2024.06.30.rpt.ta xv a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.2.ltr.green equity v.pdf
Metadata attached for 2024.06.30.2.ltr.green equity v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.carlyle vii.pdf
Metadata attached for 2024.06.30.rpt.carlyle vii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.krg iv pa.pdf
Metadata attached for 2024.06.30.rpt.krg iv pa.pdf
Processing: /home/jovyan/shared/pro



Metadata attached for 2024.06.30.ltr.luminate iii-a.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.flash.rpt.acof iv.pdf




Metadata attached for 2024.06.30.flash.rpt.acof iv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.trident viii parallel.pdf
Metadata attached for 2024.06.30.ltr.trident viii parallel.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.fulcrum v.pdf
Metadata attached for 2024.06.30.rpt.fulcrum v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.rhone union coinv.pdf
Metadata attached for 2024.06.30.rpt.rhone union coinv.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.ltr.kkr north america xiii.pdf
Metadata attached for 2024.06.30.ltr.kkr north america xiii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.industrial growth v.pdf
Metadata attached for 2024.06.30.rpt.industrial growth v.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.august equity v.pdf
Metadata attached for 2024.06.30.rpt.august eq



Metadata attached for 2024.06.30.rpt.am ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.flash.rpt.acof iii.pdf




Metadata attached for 2024.06.30.flash.rpt.acof iii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.baring asia vi.pdf
Metadata attached for 2024.06.30.rpt.baring asia vi.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.hahn i.pdf
Metadata attached for 2024.06.30.rpt.hahn i.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.trilantic vi na.pdf
Metadata attached for 2024.06.30.rpt.trilantic vi na.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.apollo viii.pdf
Metadata attached for 2024.06.30.rpt.apollo viii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.carlyle us opps ii.pdf
Metadata attached for 2024.06.30.rpt.carlyle us opps ii.pdf
Processing: /home/jovyan/shared/projects/Sep_Iter/Data/2024_Q2/2024.06.30.rpt.excellere iv.pdf
Metadata attached for 2024.06.30.rpt.excellere iv.pdf
Processing: /home/jovyan/shared/projects/S

In [29]:
# Count the distinct "file_name" metadata values among the loaded documents
total_length = [page.metadata.get("file_name") for page in documents_pages]
distinct_file_count = len(set(total_length))
print(f"total pdf processed: {distinct_file_count}")

total pdf processed: 363


In [30]:
# Display the first loaded document to inspect its text and attached metadata
documents_pages[0]

Document(id_='e0700d0f-5dcc-44ce-9c8d-15e8f669f598', embedding=None, metadata={'page_label': '1', 'file_name': '2024.06.30.rpt.pine brook.pdf', 'fund_name': 'Pine Brook Capital Partners, L.P.', 'gp_id': 2507, 'gp_name': 'Pine Brook Partners', 'omni_fund_id': 1030, 'vintage_year': 2008, 'fund_size': 1434.0, 'asset_class': 'Private Equity', 'sector': 'Buyout', 'sub_sector': 'Middle Buyout', 'report_date': '2024-06-30T00:00:00', 'primary_region': 'North America', 'primary_country': 'United States'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text=' \nPine Brook Capital Partners, L .P. \n \nSecond Quarter Report  \nAugust 2 2, 2024 \n ', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_templa

In [31]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ensure the save directory exists, create if it does not.
# exist_ok=True keeps this safe if the directory appears between the check and the call.
if not os.path.exists(Save_Pickle_Directory):
    os.makedirs(Save_Pickle_Directory, exist_ok=True)
    logger.info(f"Created directory {Save_Pickle_Directory}")

# Full path for saving the file
file_path = os.path.join(Save_Pickle_Directory, f'simple_dir_data_{Simple_Dir_Name}.pickle')
# The dump goes to a sibling temp file first so that a failure part-way through
# never leaves a truncated pickle (or destroys an earlier good one) at file_path.
temp_path = file_path + '.tmp'

# Saving the file
try:
    with open(temp_path, 'wb') as file:
        pickle.dump(documents_pages, file)
    # Swap the finished file into place in a single step
    os.replace(temp_path, file_path)
    logger.info(f'Data has been successfully pickled and saved as {file_path}')
except Exception as e:
    logger.error(f'An error occurred while pickling data: {e}')
    # Best-effort removal of the incomplete temp file
    if os.path.exists(temp_path):
        try:
            os.remove(temp_path)
        except OSError as cleanup_error:
            logger.warning(f'Could not remove temporary file {temp_path}: {cleanup_error}')

2024-09-04 00:43:51,094 - INFO - Data has been successfully pickled and saved as /home/jovyan/shared/projects/Sep_Iter/Data/Simple_Dir_Data/simple_dir_data_2024_Q2_Buyout.pickle
