# In-context learning for Citation Prediction

In [2]:
import dspy
import pandas as pd
import ast
import numpy as np
import os
from numpy.linalg import norm
from tqdm import tqdm
from pathlib import Path
# from operator import add
from PyPDF2 import PdfReader
from openai import OpenAI
from dspy.evaluate import Evaluate
from dotenv import load_dotenv

## Get the test data

In [3]:
query_candidate_data = pd.read_csv('~/test.qrel.cid', sep=' ', header=None, names=['query', 'candidate', 'bool'])

In [4]:

with open('/Users/jamie/qpaper_to_emb', 'r') as f:
    query_papers = [line.strip() for line in f]

with open('/Users/jamie/cpaper_to_emb', 'r') as f:
    candidate_papers = [line.strip() for line in f]

print(f'len(query_papers): {len(query_papers)}')
print(f'len(candidate_papers): {len(candidate_papers)}')


len(query_papers): 115
len(candidate_papers): 637


In [5]:
counter_4 = 0
valid_rows = pd.DataFrame()
query_dir = '/Users/jamie/s2-folks/examples/python/get_open_access_pdf/query_papers'
candidate_dir = '/Users/jamie/s2-folks/examples/python/get_open_access_pdf/cand_papers_combined'
# Iterate over the rows of the data
for _, row in query_candidate_data.iterrows():
    query_file = os.path.join(query_dir, str(row['query']) + '.pdf')
    candidate_file = os.path.join(candidate_dir, str(row['candidate']) + '.pdf')

    # Check if both files exist
    if os.path.isfile(query_file) and os.path.isfile(candidate_file):
        # If both files exist, append the row to valid_rows
        valid_rows = valid_rows._append(row)
        
# Reset the index of valid_rows
valid_rows.reset_index(drop=True, inplace=True)
print(valid_rows.head())
print(f'Number of query candidate pairs with valid files: {len(valid_rows)}')

     query candidate  bool
0  3498240   1824499     1
1  3498240  53645322     0
2  3498240   1915951     0
3  3498240   3048298     0
4  3498240   3627503     0
Number of query candidate pairs with valid files: 651


In [6]:
data = [{"query_file": query_file, "candidate_file": candidate_file, "cites": bool(bool_)} for query_file, candidate_file, bool_ in zip(valid_rows['query'], valid_rows['candidate'], valid_rows['bool'])]
data = [dspy.Example(**x).with_inputs('query_file', 'candidate_file') for x in data]

def split_data(data, split_ratio, seed=42):
    np.random.seed(seed)
    indices = np.random.permutation(len(data))
    split_index = int(split_ratio * len(data))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
    trainset = [data[i] for i in train_indices]
    testset = [data[i] for i in test_indices]
    return trainset, testset

# trainset, testset = split_data(data, 0)
trainset = data


## Chunker

In [7]:
load_dotenv()
llm = dspy.OpenAI(model="gpt-3.5-turbo")
dspy.settings.configure(lm=llm, rm=None)

client = OpenAI(
    # this is also the default, it can be omitted
    api_key=os.environ['OPENAI_API_KEY'],
)

In [8]:
class Chunker:
    def __init__(self, context_window=3000, max_windows=5):
        self.context_window = context_window
        self.max_windows = max_windows
        self.window_overlap = 0.02

    def __call__(self, paper):
        snippet_idx = 0

        while snippet_idx < self.max_windows and paper:
            endpos = int(self.context_window * (1.0 + self.window_overlap))
            snippet, paper = paper[:endpos], paper[endpos:]

            next_newline_pos = snippet.rfind('\n')
            if paper and next_newline_pos != -1 and next_newline_pos >= self.context_window // 2:
                paper = snippet[next_newline_pos+1:] + paper
                snippet = snippet[:next_newline_pos]

            yield snippet_idx, snippet.strip()
            snippet_idx += 1

## DSPy Module

In [9]:
def get_embeddings(texts, model="text-embedding-3-small", save_file=None):
    if save_file and Path(save_file).exists():
        with open(save_file, 'r') as f:
            # print(f"Loading embeddings from {save_file}")
            embeddings = [ast.literal_eval(line.strip()) for line in f]
        return embeddings
        
    try:
        response = client.embeddings.create(input=texts, model=model)
        embeddings = [embedding.embedding for embedding in response.data]
        if save_file: # Save the embeddings to a file
            with open(save_file, 'w') as f:
                # print(f"Saving embeddings to {save_file}")
                for embedding in embeddings:
                    f.write(str(embedding) + '\n')
        return embeddings
    except Exception as e:
        print("Error during API call:", e)
        return []
    
def get_most_similar_chunk(query_embedding, candidate_embeddings, candidate_chunks):
    similarities = np.dot(candidate_embeddings, query_embedding) / (norm(candidate_embeddings, axis=1) * norm(query_embedding))
    most_similar_idx = np.argmax(similarities)
    return candidate_chunks[most_similar_idx]
    
    
class PredictCitation(dspy.Signature):
    __doc__ = """Predict if the two chunks are related by a citation. Consider all possible ways in which a citation could occur, such as direct quotes, paraphrasing, or referring to the same ideas or data. Don't be afraid to predict that the chunks are related by a citation. If you're not sure, it's better to predict that they are related."""   
    query_chunk: str = dspy.InputField(desc='Query chunk to compare to the candidate chunk.')
    candidate_chunk: str = dspy.InputField(desc='Candidate chunk to compare to the query chunk.')
    answer: bool = dspy.OutputField(desc="either True or False", prefix="Answer:")


class PredictCitationAndResolve(dspy.Module):
    def __init__(self, context_window=3000, max_windows=5, resolve_function=any,
                 candidate_folder='/Users/jamie/s2-folks/examples/python/get_open_access_pdf/cand_papers_combined', 
                 query_folder='/Users/jamie/s2-folks/examples/python/get_open_access_pdf/query_papers',
                 reset_embedding=False):
        super().__init__()
        
        self.chunk = Chunker(context_window=context_window, max_windows=max_windows)
        # self.predict = dspy.TypedPredictor(PredictCitation)
        # self.predict = dspy.TypedChainOfThought(PredictCitation)
        self.predict = dspy.ChainOfThought(PredictCitation)
        self.resolve_function = resolve_function
        self.query_folder = query_folder
        self.candidate_folder = candidate_folder
        os.makedirs('embeddings', exist_ok=True)
        if reset_embedding:
            for emb_file in os.listdir('embeddings'):
                os.remove(f'embeddings/{emb_file}')

    def forward(self, query_file, candidate_file):
        predictions = []
        
        # Get the text from the pdfs
        query_pdf = PdfReader(f'{self.query_folder}/{query_file}.pdf')
        query_text = ""
        for page in query_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                query_text += page_text + " "  # Adding space to separate text between pages
        query_text = query_text.replace("\n", " ")
        
        candidate_pdf = PdfReader(f'{self.candidate_folder}/{candidate_file}.pdf')
        candidate_text = ""
        for page in candidate_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                candidate_text += page_text + " "
        candidate_text = candidate_text.replace("\n", " ")
        
        # for each chunk in the paper
        query_chunks = [snippet for _, snippet in self.chunk(query_text)]
        candidate_chunks = [snippet for _, snippet in self.chunk(candidate_text)]
        
        # Create embeddings for the chunks
        candidate_embeddings = get_embeddings(candidate_chunks, save_file=f'embeddings/candidate_{candidate_file}.emb')
        query_embeddings = get_embeddings(query_chunks, save_file=f'embeddings/query_{query_file}.emb')
        
        for snippet, query_embedding in zip(query_chunks, query_embeddings):
            # Get the candidate chunk that is most similar to the snippet
            candidate_chunk = get_most_similar_chunk(query_embedding, candidate_embeddings, candidate_chunks)
            prediction = self.predict(query_chunk=snippet, candidate_chunk=candidate_chunk)
            # print(prediction)
            predictions.append(prediction.answer=='True')

        return dspy.Prediction(predictions=predictions, resolved=self.resolve_function(predictions))

In [10]:
pipeline_chunking = PredictCitationAndResolve(max_windows=15, context_window=1000, reset_embedding=False)

## Example

In [11]:
# chunker = Chunker(context_window=1000, max_windows=15)
# query_pdf = PdfReader(f'/Users/jamie/s2-folks/examples/python/get_open_access_pdf/query_papers/1323414.pdf')
# query_text = ""
# for page in query_pdf.pages:
#     page_text = page.extract_text()
#     if page_text:
#         query_text += page_text + " "  # Adding space to separate text between pages
# query_text = query_text.replace("\n", " ")
# query_chunks = [snippet for _, snippet in chunker(query_text)]
# print(query_chunks)

In [12]:
# print(len(query_chunks[0]))
# print(len(query_chunks))

In [13]:
# # get an example
# example = trainset[-2]
# example_x = example.inputs()
# example_y = example.labels()
# print(example_x)
# print(example_y)

# prediction = pipeline_chunking(**example_x)
# print(prediction)
# print(example_y.cites)

In [14]:
llm.inspect_history(n=5)

## Evaluate

In [15]:
def metric(example, result):
    '''Match metric'''
    return 1 if example.cites == result.resolved else 0

In [16]:
# evaluate = Evaluate(devset=trainset, metric=metric, num_threads=8, display_progress=True, display_table=0, max_errors=100, return_outputs=True)
# outputs = evaluate(pipeline_chunking)

In [17]:
# all_predictions = []
# for x in outputs[1]:
#     if type(x[1])==dspy.Prediction:
#         all_predictions.append(x[1].resolved)
#     else:
#         all_predictions.append(np.nan)
    

# all_labels = [x[0].cites for x in outputs[1]]
# print(len(all_predictions))

# with open('darwin/eval/predictions_COT_large_prompt_1000.txt', 'w') as f:
#     for pred in all_predictions:
#         f.write(str(pred) + '\n')

In [18]:
# # Compute the accuracy of the final predictions
# correct_predictions = [prediction == label for prediction, label in zip(all_predictions, all_labels)]
# accuracy = sum(correct_predictions) / len(correct_predictions)
# print(f'Accuracy: {accuracy:.2f}')

# # Compute the recall of the final predictions
# true_positives = sum([prediction and label for prediction, label in zip(all_predictions, all_labels)])
# false_negatives = sum([not prediction and label for prediction, label in zip(all_predictions, all_labels)])
# recall = true_positives / (true_positives + false_negatives)
# print(f'Recall: {recall: .2f}')

# # Compute the precision of the final predictions
# true_positives = sum([prediction and label for prediction, label in zip(all_predictions, all_labels)])
# false_positives = sum([prediction and not label for prediction, label in zip(all_predictions, all_labels)])
# precision = true_positives / (true_positives + false_positives)
# print(f'Precision: {precision:.2f}')

# # F1 score
# f1 = 2 * (precision * recall) / (precision + recall)
# print(f'F1 Score: {f1:.2f}')

In [19]:
# all_predictions

NameError: name 'all_predictions' is not defined

Weird paper

In [None]:
PdfReader('darwin/query_papers/53079158.pdf').pages[-1].extract_text()

In [20]:
llm.inspect_history(n=5)

## Optimizer Class

### Produce the dataset where dspy will retrieve from
#### Each sample has the following format. "Query Chunk: ...\n Candidate Chunk: ...\n Answer: ...\n  " 

In [24]:
test_papers = []
r_papers = []
with open('/Users/jamie/link-recorder-final-1', 'r') as f:
    for line in f:
        temp = line.split('\t')
        tpaper = temp[0].strip()
        rpaper = temp[1].strip()
        test_papers.append(tpaper)
        r_papers.append(rpaper)
test_retrieved_data = pd.DataFrame({'tpaper': test_papers, 'rpaper': r_papers})

In [36]:
valid_rows_retrieved = pd.DataFrame()
retrieved_dir = '/Users/jamie/s2-folks/examples/python/get_open_access_pdf/r-paper-final'
for _, row in test_retrieved_data.iterrows():
    test_file_1 = os.path.join(query_dir, str(row['tpaper']) + '.pdf')
    test_file_2 = os.path.join(candidate_dir, str(row['tpaper']) + '.pdf')
    r_file = os.path.join(retrieved_dir, str(row['rpaper']) + '.pdf')

    # Check if both files exist
    if (os.path.isfile(test_file_1) or os.path.isfile(test_file_2)) and os.path.isfile(r_file):
        # If both files exist, append the row to valid_rows
        valid_rows_retrieved = valid_rows_retrieved._append(row)
valid_rows_retrieved.reset_index(drop=True, inplace=True)
print(valid_rows_retrieved.head())
print(f'Number of query candidate pairs with valid files: {len(valid_rows_retrieved)}')

     tpaper     rpaper
0  16897790     252854
1   2538574  253145517
2  11633392  113541825
3   4655781    2011582
4   6833818  189960050
Number of query candidate pairs with valid files: 738


In [45]:
# Randoly select 100 to set up the retrieval dataset
dspy_r_set = pd.DataFrame(columns=['query', 'candidate', 'label'])
valid_r_papers = valid_rows_retrieved['rpaper'].to_numpy()
sampled_df = valid_rows_retrieved.sample(n=100)
for _, row in sampled_df.iterrows():
    test_set_paper = row['tpaper']
    retrieved_paper = row['rpaper']
    new_row = {'query': test_set_paper, 'candidate': retrieved_paper, 'label': 1}
    dspy_r_set = pd.concat([dspy_r_set, pd.DataFrame([new_row])], ignore_index=True)
    
    # Get 8 negative samples for one positive sample as in the SPECTER svm experiment.
    neg_papers = np.random.choice(valid_r_papers, size=8)
    for neg_p in neg_papers:
        new_row = {'query': test_set_paper, 'candidate': neg_p, 'label': 0}
        dspy_r_set = pd.concat([dspy_r_set, pd.DataFrame([new_row])], ignore_index=True)
print(dspy_r_set.head())
print(f'Number of query candidate pairs in dspy retrieval set: {len(dspy_r_set)}')

     query  candidate label
0  6869636  235125640     1
1  6869636  114250615     0
2  6869636  111334241     0
3  6869636  189960050     0
4  6869636    2439435     0
Number of query candidate pairs in dspy retrieval set: 900


In [40]:
def get_most_similar_chunk_emb(query_embedding, candidate_embeddings, candidate_chunks):
    similarities = np.dot(candidate_embeddings, query_embedding) / (norm(candidate_embeddings, axis=1) * norm(query_embedding))
    most_similar_idx = np.argmax(similarities)
    return candidate_chunks[most_similar_idx], candidate_embeddings[most_similar_idx]

In [58]:
def get_most_similar_emb_idx(query_embedding, candidate_embeddings):
    similarities = np.dot(candidate_embeddings, query_embedding) / (norm(candidate_embeddings, axis=1) * norm(query_embedding))
    most_similar_idx = np.argmax(similarities)
    return most_similar_idx

In [None]:
# produce retrieval set
query_folder='/Users/jamie/s2-folks/examples/python/get_open_access_pdf/query_papers'
query_folder_2 = '/Users/jamie/s2-folks/examples/python/get_open_access_pdf/cand_papers_combined'
candidate_folder= '/Users/jamie/s2-folks/examples/python/get_open_access_pdf/r-paper-final'
chunk = Chunker(context_window=1000, max_windows=15)
dspy_r_emb = []
dspy_r_text = []
for _, row in dspy_r_set.iterrows():
    # Get the text from the pdfs
    query_file_1 = os.path.join(query_folder, str(row['query']) + '.pdf')
    query_file_2 = os.path.join(query_folder_2, str(row['query']) + '.pdf')
    if os.path.isfile(query_file_1):
        query_file_path = query_file_1
    if os.path.isfile(query_file_2):
        query_file_path = query_file_2
    try:
        query_pdf = PdfReader(query_file_path)
    except:
        cotinue
    query_text = ""
    for page in query_pdf.pages:
        page_text = page.extract_text()
        if page_text:
            query_text += page_text + " "  # Adding space to separate text between pages
    query_text = query_text.replace("\n", " ")
    
    candidate_file = row['candidate']
    try:
        candidate_pdf = PdfReader(f'{candidate_folder}/{candidate_file}.pdf')
    except:
        continue
    candidate_text = ""
    for page in candidate_pdf.pages:
        page_text = page.extract_text()
        if page_text:
            candidate_text += page_text + " "
    candidate_text = candidate_text.replace("\n", " ")
    
    # for each chunk in the paper
    query_chunks = [snippet for _, snippet in chunk(query_text)]
    candidate_chunks = [snippet for _, snippet in chunk(candidate_text)]
    
    # Create embeddings for the chunks
    candidate_embeddings = get_embeddings(candidate_chunks, save_file=f'embeddings/candidate_{candidate_file}.emb')
    query_embeddings = get_embeddings(query_chunks, save_file=f'embeddings/query_{query_file}.emb')
    
    for snippet, query_embedding in zip(query_chunks, query_embeddings):
        # Get the candidate chunk that is most similar to the snippet
        candidate_chunk, c_emb = get_most_similar_chunk_emb(query_embedding, candidate_embeddings, candidate_chunks)
        dspy_r_emb.append((query_embedding, c_emb, row['label']))
        dspy_r_text.append((snippet, candidate_chunk, row['label']))

In [49]:
len(dspy_r_emb), len(dspy_r_text)

(1080, 1080)

In [54]:
dspy_r_emb_concat = []
for q_emb, c_emb, label in dspy_r_emb:
    concat_emb = q_emb + c_emb
    dspy_r_emb_concat.append(concat_emb)

In [59]:
class PredictCitationWithRetrieval(dspy.Signature):
    __doc__ = """Predict if the two chunks are related by a citation. Consider all possible ways in which a citation could occur, such as direct quotes, paraphrasing, or referring to the same ideas or data. Don't be afraid to predict that the chunks are related by a citation. If you're not sure, it's better to predict that they are related."""   
    query_chunk: str = dspy.InputField(desc='Query chunk to compare to the candidate chunk.')
    candidate_chunk: str = dspy.InputField(desc='Candidate chunk to compare to the query chunk.')
    answer: bool = dspy.OutputField(desc="either True or False", prefix="Answer:")
    context: str = dspy.InputField(desc="A good example to learn from.")

In [64]:
class PredictCitationRetrieveAndResolve(dspy.Module):
    def __init__(self, context_window=3000, max_windows=5, resolve_function=any,
                 candidate_folder='/Users/jamie/s2-folks/examples/python/get_open_access_pdf/cand_papers_combined', 
                 query_folder='/Users/jamie/s2-folks/examples/python/get_open_access_pdf/query_papers',
                 reset_embedding=False):
        super().__init__()
        
        self.chunk = Chunker(context_window=context_window, max_windows=max_windows)
        # self.predict = dspy.TypedPredictor(PredictCitation)
        # self.predict = dspy.TypedChainOfThought(PredictCitation)
        self.predict = dspy.Predict(PredictCitationWithRetrieval)
        self.resolve_function = resolve_function
        self.query_folder = query_folder
        self.candidate_folder = candidate_folder
        os.makedirs('embeddings', exist_ok=True)
        if reset_embedding:
            for emb_file in os.listdir('embeddings'):
                os.remove(f'embeddings/{emb_file}')

    def forward(self, query_file, candidate_file):
        predictions = []     
        # Get the text from the pdfs
        query_pdf = PdfReader(f'{self.query_folder}/{query_file}.pdf')
        query_text = ""
        for page in query_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                query_text += page_text + " "  # Adding space to separate text between pages
        query_text = query_text.replace("\n", " ")
        
        candidate_pdf = PdfReader(f'{self.candidate_folder}/{candidate_file}.pdf')
        candidate_text = ""
        for page in candidate_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                candidate_text += page_text + " "
        candidate_text = candidate_text.replace("\n", " ")
        
        # for each chunk in the paper
        query_chunks = [snippet for _, snippet in self.chunk(query_text)]
        candidate_chunks = [snippet for _, snippet in self.chunk(candidate_text)]
        
        # Create embeddings for the chunks
        candidate_embeddings = get_embeddings(candidate_chunks, save_file=f'embeddings/candidate_{candidate_file}.emb')
        query_embeddings = get_embeddings(query_chunks, save_file=f'embeddings/query_{query_file}.emb')
        
        for snippet, query_embedding in zip(query_chunks, query_embeddings):
            # Get the candidate chunk that is most similar to the snippet
            candidate_chunk, candidate_chunk_emb = get_most_similar_chunk_emb(query_embedding, candidate_embeddings, candidate_chunks)
            original_emb_concat = query_embedding + candidate_chunk_emb
            context_idx = get_most_similar_emb_idx(original_emb_concat, dspy_r_emb_concat)
            context_text = dspy_r_text[context_idx]
            context = f"Query Chunk: {context_text[0]}\nCandidate Chunk: {context_text[1]}\nAnswer: {context_text[2]}\n"
            prediction = self.predict(query_chunk=snippet, candidate_chunk=candidate_chunk, context=context)
            # print(prediction)
            predictions.append(prediction.answer=='True')
        return dspy.Prediction(context=context, predictions=predictions, resolved=self.resolve_function(predictions))

In [62]:
def metric(example, result):
    '''Match metric'''
    return 1 if example.cites == result.resolved else 0

In [67]:
pipeline_chunking_retrieval = PredictCitationRetrieveAndResolve(max_windows=15, context_window=1000, reset_embedding=False)

In [None]:
evaluate = Evaluate(devset=trainset, metric=metric, num_threads=8, display_progress=True, display_table=0, max_errors=100, return_outputs=True)
outputs = evaluate(pipeline_chunking_retrieval)

In [70]:
llm.inspect_history(n=5)

Error for example in dev set: 		 Socket operation on non-socket



Predict if the two chunks are related by a citation. Consider all possible ways in which a citation could occur, such as direct quotes, paraphrasing, or referring to the same ideas or data. Don't be afraid to predict that the chunks are related by a citation. If you're not sure, it's better to predict that they are related.

---

Follow the following format.

Query Chunk: Query chunk to compare to the candidate chunk.

Candidate Chunk: Candidate chunk to compare to the query chunk.

Context: A good example to learn from.

Answer: either True or False

---

Query Chunk: ersity of Verona. M.A. is supported by an APIF grant from the University of Barcelona. P.R. is partly supported by an ICREA Academia grant.Other studies shows that clothing correlates with the personality traits of people in a way that people with formal clothing perceive actions and objects, the inter-relationship and the intra-relationship between them 

  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
  return v1_cached_gpt3_turbo_request_v2(**kwargs)
