In [11]:
# understand statistics (average number of words per page) + establish nuber of paragraphs per page

import PyPDF2

# Open the PDF file
with open('data/sample1.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    
    # Extract text from all pages
    tot_n_words = 0
    for page_num in range(len(reader.pages)):
        print(f'page number: {str(page_num)}')
        page = reader.pages[page_num]
        text = page.extract_text()
        words = text.split(' ')
        number_of_words = len(words)
        print(f'\t number of words: {number_of_words}')
        tot_n_words += number_of_words
    mean_w_per_page = tot_n_words/len(reader.pages)
    print(f'\naverage number of words per page: {mean_w_per_page}')
    print(' '.join(words[0:100]))

page number: 0
	 number of words: 557
page number: 1
	 number of words: 110
page number: 2
	 number of words: 453
page number: 3
	 number of words: 526
page number: 4
	 number of words: 309
page number: 5
	 number of words: 419
page number: 6
	 number of words: 613
page number: 7
	 number of words: 277
page number: 8
	 number of words: 757
page number: 9
	 number of words: 1064
page number: 10
	 number of words: 1711
page number: 11
	 number of words: 1502
page number: 12
	 number of words: 387

average number of words per page: 668.0769230769231
will be supplied upon request.  
Differing Levels of Service provided by Global Investment Research:  The level and types of services provided to you by the Global Investment  
Research division of GS may vary as compared to that provided to internal and other external clients of GS, depending on variou s factors including your  
individual preferences as to the frequency and manner of receiving communication, your risk proﬁle and investment f

In [25]:
text = """Differing Levels of Service provided by Global Investment Research:  The level and types of services provided to you by the Global Investment  
Research division of GS may vary as compared to that provided to internal and other external clients of GS, depending on variou s factors"""
words = text.split()
words[30:60]

['to',
 'that',
 'provided',
 'to',
 'internal',
 'and',
 'other',
 'external',
 'clients',
 'of',
 'GS,',
 'depending',
 'on',
 'variou',
 's',
 'factors']

In [37]:
# divide text every 100 token

def split_text_into_paragraphs(text, paragraph_size=100):
    # Split the text into words
    words = text.split(' ')
    print('total len: ' + str(len(words)))
    paragraphs = []
    # Create paragraphs with the specified number of words
    for i in range(0, len(words), paragraph_size):
        print(i,i+paragraph_size)
        paragraph = ' '.join(words[i:i + paragraph_size])
        paragraphs.append(paragraph)
    # merge the last paragraph if too short
    merge_threshold = paragraph_size//2
    if len(paragraphs) > 1 and len(paragraphs[-1].split()) < merge_threshold:
        additional_part = paragraphs.pop()
        paragraphs[-1] += ' ' + additional_part
    return paragraphs



# Example usage:
text = """Differing Levels of Service provided by Global Investment Research:  The level and types of services provided to you by the Global Investment  
Research division of """
paragraphs = split_text_into_paragraphs(text)

# Print paragraphs
for i, paragraph in enumerate(paragraphs):
    print(f"Paragraph {i+1}: ({len(paragraph.split())} words)\n{paragraph}\n")


total len: 28
0 100
Paragraph 1: (25 words)
Differing Levels of Service provided by Global Investment Research:  The level and types of services provided to you by the Global Investment  
Research division of 



In [58]:
import os
import glob
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

def split_text_into_paragraphs(text, paragraph_size=100):
    # Split the text into words
    words = text.split(' ')
    paragraphs = []
    # Create paragraphs with the specified number of words
    for i in range(0, len(words), paragraph_size):
        paragraph = ' '.join(words[i:i + paragraph_size])
        paragraphs.append(paragraph)
    # merge the last paragraph if too short
    merge_threshold = paragraph_size//2
    if len(paragraphs) > 1 and len(paragraphs[-1].split()) < merge_threshold:
        additional_part = paragraphs.pop()
        paragraphs[-1] += ' ' + additional_part
    return paragraphs


model = SentenceTransformer('all-MiniLM-L6-v2')
df = pd.DataFrame(columns=['file_name', 'page', 'text', 'embedding'])
# list all .pdf in the directory (relative paths)
files = glob.glob('data/*.pdf')
for file in files:
    file_name = os.path.basename(file)
    print('\t'+file_name)
    # read pdf and populate the df
    with open(file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text()
            paragraphs = split_text_into_paragraphs(text)
            for par in paragraphs:
                emb_text = model.encode(par)
                df.loc[len(df)] = [file_name, page_num+1, par, emb_text]

#pd.options.display.max_rows = None
#display(df)

# user query
query = 'How americans are going to manage aluminium resources if russia attack ucraine'
query_embedding = model.encode(query)

# Compute the cosine similarity
embeddings_matrix = np.vstack(df['embedding'].values) # convertion to a NumPy array for better efficiency
similarities = model.similarity(query_embedding, embeddings_matrix)[0]

# nicely print the search results
df['similarity_score'] = similarities

# row number over ordered partition
df = df.sort_values(['file_name', 'page', 'similarity_score'], ascending=[True, True, False])  # Sort by partition column and then by value within each partition
df['row_number'] = df.groupby(['file_name', 'page']).cumcount()

# Keep only rows where row_number <= 2
df = df[df['row_number'] <= 2]

# Calculate the average 'value' for each category
df_agg = df.groupby(['file_name', 'page'])['similarity_score'].mean().reset_index()
df_agg = df_agg.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
df_agg['rank'] = range(1, len(df_agg)+1)

df_agg



	sample0.pdf
	sample1.pdf


Unnamed: 0,file_name,page,similarity_score,rank
0,sample1.pdf,5,0.522988,1
1,sample1.pdf,9,0.363853,2
2,sample1.pdf,6,0.34538,3
3,sample1.pdf,3,0.342688,4
4,sample1.pdf,1,0.336091,5
5,sample1.pdf,7,0.311732,6
6,sample1.pdf,2,0.310567,7
7,sample1.pdf,8,0.303113,8
8,sample1.pdf,11,0.270189,9
9,sample1.pdf,12,0.248495,10
