# In-context learning for Citation Prediction

In [1]:
import dspy
import pandas as pd
import ast
import numpy as np
import os
from numpy.linalg import norm
from tqdm import tqdm
from pathlib import Path
# from operator import add
from PyPDF2 import PdfReader
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


## Get the test data

In [2]:
query_candidate_data = pd.read_csv('darwin/test.qrel.cid', sep=' ', header=None, names=['query', 'candidate', 'bool'])

In [3]:

with open('darwin/qpaper_to_emb', 'r') as f:
    query_papers = [line.strip() for line in f]

with open('darwin/cpaper_to_emb', 'r') as f:
    candidate_papers = [line.strip() for line in f]

print(f'len(query_papers): {len(query_papers)}')
print(f'len(candidate_papers): {len(candidate_papers)}')


len(query_papers): 115
len(candidate_papers): 637


In [4]:
valid_rows = pd.DataFrame()
query_dir = 'darwin/query_papers'
candidate_dir = 'darwin/candidate_papers'
# Iterate over the rows of the data
for _, row in query_candidate_data.iterrows():
    query_file = os.path.join(query_dir, str(row['query']) + '.pdf')
    candidate_file = os.path.join(candidate_dir, str(row['candidate']) + '.pdf')

    # Check if both files exist
    if os.path.isfile(query_file) and os.path.isfile(candidate_file):
        # If both files exist, append the row to valid_rows
        valid_rows = valid_rows._append(row)

# Reset the index of valid_rows
valid_rows.reset_index(drop=True, inplace=True)
print(valid_rows.head())
print(f'Number of query candidate pairs with valid files: {len(valid_rows)}')

     query candidate  bool
0  3498240   1824499     1
1  3498240  53645322     0
2  3498240   1915951     0
3  3498240   3048298     0
4  3498240   3627503     0
Number of query candidate pairs with valid files: 651


In [5]:
data = [{"query_file": query_file, "candidate_file": candidate_file, "cites": bool(bool_)} for query_file, candidate_file, bool_ in zip(valid_rows['query'], valid_rows['candidate'], valid_rows['bool'])]
data = [dspy.Example(**x).with_inputs('query_file', 'candidate_file') for x in data]

def split_data(data, split_ratio, seed=42):
    np.random.seed(seed)
    indices = np.random.permutation(len(data))
    split_index = int(split_ratio * len(data))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
    trainset = [data[i] for i in train_indices]
    testset = [data[i] for i in test_indices]
    return trainset, testset

trainset, testset = split_data(data, 0.2)


## Chunker

In [6]:
llm = dspy.OpenAI(model="gpt-3.5-turbo")
dspy.settings.configure(lm=llm, rm=None)

client = OpenAI(
    # this is also the default, it can be omitted
    api_key=os.environ['OPENAI_API_KEY'],
)

In [7]:
class Chunker:
    def __init__(self, context_window=3000, max_windows=5):
        self.context_window = context_window
        self.max_windows = max_windows
        self.window_overlap = 0.02

    def __call__(self, paper):
        snippet_idx = 0

        while snippet_idx < self.max_windows and paper:
            endpos = int(self.context_window * (1.0 + self.window_overlap))
            snippet, paper = paper[:endpos], paper[endpos:]

            next_newline_pos = snippet.rfind('\n')
            if paper and next_newline_pos != -1 and next_newline_pos >= self.context_window // 2:
                paper = snippet[next_newline_pos+1:] + paper
                snippet = snippet[:next_newline_pos]

            yield snippet_idx, snippet.strip()
            snippet_idx += 1

## DSPy Module

In [9]:
def get_embeddings(texts, model="text-embedding-3-small", save_file=None):
    if save_file and Path(save_file).exists():
        with open(save_file, 'r') as f:
            print(f"Loading embeddings from {save_file}")
            embeddings = [ast.literal_eval(line.strip()) for line in f]
        return embeddings
        
    try:
        response = client.embeddings.create(input=texts, model=model)
        embeddings = [embedding.embedding for embedding in response.data]
        if save_file: # Save the embeddings to a file
            with open(save_file, 'w') as f:
                print(f"Saving embeddings to {save_file}")
                for embedding in embeddings:
                    f.write(str(embedding) + '\n')
        return embeddings
    except Exception as e:
        print("Error during API call:", e)
        return []
    
def get_most_similar_chunk(query_embedding, candidate_embeddings, candidate_chunks):
    similarities = np.dot(candidate_embeddings, query_embedding) / (norm(candidate_embeddings, axis=1) * norm(query_embedding))
    most_similar_idx = np.argmax(similarities)
    return candidate_chunks[most_similar_idx]
    
    
class PredictCitation(dspy.Signature):
    __doc__ = """Predict if the two chunks are related by a citation."""
    query_chunk: str = dspy.InputField(desc='Query chunk to compare to the candidate chunk.')
    candidate_chunk: str = dspy.InputField(desc='Candidate chunk to compare to the query chunk.')
    answer: bool = dspy.OutputField(desc="either True or False", prefix="Answer:")


class PredictCitationAndResolve(dspy.Module):
    def __init__(self, context_window=3000, max_windows=5, resolve_function=any,
                 candidate_folder='darwin/candidate_papers', query_folder='darwin/query_papers'):
        super().__init__()
        
        self.chunk = Chunker(context_window=context_window, max_windows=max_windows)
        self.predict = dspy.TypedPredictor(PredictCitation)
        # self.predict = dspy.ChainOfThought(PredictCitation)
        self.resolve_function = resolve_function
        self.query_folder = query_folder
        self.candidate_folder = candidate_folder
        os.mkdir('embeddings', exist_ok=True)
        

    def forward(self, query_file, candidate_file):
        predictions = []
        
        # Get the text from the pdfs
        query_pdf = PdfReader(f'{self.query_folder}/{query_file}.pdf')
        query_text = ""
        for page in query_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                query_text += page_text + " "  # Adding space to separate text between pages
        query_text = query_text.replace("\n", " ")
        
        candidate_pdf = PdfReader(f'{self.candidate_folder}/{candidate_file}.pdf')
        candidate_text = ""
        for page in candidate_pdf.pages:
            page_text = page.extract_text()
            if page_text:
                candidate_text += page_text + " "
        candidate_text = candidate_text.replace("\n", " ")
        
        # for each chunk in the paper
        query_chunks = [snippet for _, snippet in self.chunk(query_text)]
        candidate_chunks = [snippet for _, snippet in self.chunk(candidate_text)]
        
        # Create embeddings for the chunks
        candidate_embeddings = get_embeddings(candidate_chunks, save_file=f'embeddings/candidate_{candidate_file}.emb')
        query_embeddings = get_embeddings(query_chunks, save_file=f'embeddings/query_{query_file}.emb')
        
        for snippet, query_embedding in zip(query_chunks, query_embeddings):
            # Get the candidate chunk that is most similar to the snippet
            candidate_chunk = get_most_similar_chunk(query_embedding, candidate_embeddings, candidate_chunks)
            prediction = self.predict(query_chunk=snippet, candidate_chunk=candidate_chunk)
            # print(prediction)
            predictions.append(prediction.answer=='True')

        return dspy.Prediction(predictions=predictions, resolved=self.resolve_function(predictions))

## Tests

In [36]:
pipeline_chunking = PredictCitationAndResolve(max_windows=10)

In [20]:
# get an example
example = trainset[0]
example_x = example.inputs()
example_y = example.labels()
print(example_x)
print(example_y)

prediction = pipeline_chunking(**example_x)
print(prediction)
print(example_y.cites)

Example({'query_file': 1054114, 'candidate_file': '14525920'}) (input_keys=None)
Example({'cites': False}) (input_keys=None)
Loading embeddings from candidate_14525920.emb
Loading embeddings from query_1054114.emb
Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
False


In [21]:
llm.inspect_history(n=5)





Predict if the two chunks are related by a citation.

---

Follow the following format.

Query Chunk: Query chunk to compare to the candidate chunk.
Candidate Chunk: Candidate chunk to compare to the query chunk.
Answer: either True or False (Respond with a single bool value)

---

Query Chunk: Pooled Motion Features for First-Person Videos M. S. Ryoo, Brandon Rothrock, and Larry Matthies Jet Propulsion Laboratory, California Institute of Technology, Pasadena, CA mryoo@jpl.nasa.gov Abstract In this paper, we present a new feature representation for ﬁrst-person videos. In ﬁrst-person video understanding (e.g., activity recognition), it is very important to capture both entire scene dynamics (i.e., egomotion) and salient lo- cal motion observed in videos. We describe a representa- tion framework based on time series pooling, which is de- signed to abstract short-term/long-term changes in feature descriptor elements. The idea is to keep track of how de- scriptor values are changing ov

In [33]:
from dspy.evaluate import Evaluate

def metric(example, result):
    '''Match metric'''
    # print('inside metric')
    # print(f'{example=}, {result=}')
    return 1 if example.cites == result.resolved else 0

In [35]:
evaluate = Evaluate(devset=trainset, metric=metric, num_threads=8, display_progress=True, display_table=0, max_errors=100)
evaluate(pipeline_chunking)

  0%|          | 0/130 [00:00<?, ?it/s]

Loading embeddings from candidate_13111019.emb
Loading embeddings from candidate_14525920.emb
Loading embeddings from query_12842965.emb
Loading embeddings from candidate_26230598.emb
Loading embeddings from candidate_17401731.emb
Loading embeddings from query_1054114.emb
inside metric
example=Example({'query_file': 12842965, 'candidate_file': '13111019', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)


Average Metric: 1 / 2  (50.0):   1%|          | 1/130 [00:03<07:09,  3.33s/it]

inside metric
example=Example({'query_file': 1054114, 'candidate_file': '14525920', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_2197102.emb
Loading embeddings from query_21724528.emb


Average Metric: 2 / 3  (66.7):   2%|▏         | 3/130 [00:04<02:50,  1.34s/it]

Loading embeddings from candidate_13935040.emb
inside metric
example=Example({'query_file': 21724528, 'candidate_file': '17401731', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 2197102, 'candidate_file': '26230598', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_18428799.emb


Average Metric: 4 / 5  (80.0):   3%|▎         | 4/130 [00:06<02:49,  1.34s/it]

inside metric
example=Example({'query_file': 18428799, 'candidate_file': '13935040', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_15723346.emb


Average Metric: 4 / 6  (66.7):   4%|▍         | 5/130 [00:08<02:21,  1.13s/it]

Loading embeddings from query_14520569.emb
Loading embeddings from candidate_11004850.emb
Loading embeddings from candidate_209552.emb
inside metric
example=Example({'query_file': 14520569, 'candidate_file': '15723346', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_1634096.emb
Loading embeddings from query_12808563.emb


Average Metric: 4 / 7  (57.1):   5%|▌         | 7/130 [00:09<02:25,  1.18s/it]

inside metric
example=Example({'query_file': 1634096, 'candidate_file': '209552', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 12808563, 'candidate_file': '11004850', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 5 / 8  (62.5):   6%|▌         | 8/130 [00:11<02:49,  1.39s/it]

Loading embeddings from candidate_7913851.emb
Loading embeddings from candidate_16950643.emb
Loading embeddings from query_17630799.emb
Loading embeddings from query_19225409.embLoading embeddings from candidate_518708.emb
Loading embeddings from candidate_9456062.emb
Loading embeddings from candidate_1063677.emb



Average Metric: 7 / 10  (70.0):   7%|▋         | 9/130 [00:12<02:36,  1.30s/it]

inside metric
example=Example({'query_file': 19225409, 'candidate_file': '16950643', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_1604520.emb
Loading embeddings from query_15811205.emb
Loading embeddings from query_888444.emb
inside metric
example=Example({'query_file': 17630799, 'candidate_file': '7913851', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 1604520, 'candidate_file': '1063677', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 888444, 'candid

Average Metric: 8 / 11  (72.7):   8%|▊         | 11/130 [00:14<02:09,  1.09s/it]

Loading embeddings from candidate_28001085.emb
Loading embeddings from candidate_6178111.emb


Average Metric: 10 / 13  (76.9):  10%|█         | 13/130 [00:15<01:54,  1.03it/s]

Loading embeddings from query_49325027.emb
Loading embeddings from query_21724528.emb
inside metric
example=Example({'query_file': 49325027, 'candidate_file': '28001085', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 21724528, 'candidate_file': '6178111', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 12 / 15  (80.0):  12%|█▏        | 15/130 [00:17<01:36,  1.20it/s]

Loading embeddings from candidate_1432085.emb
Loading embeddings from candidate_12857779.emb
Loading embeddings from candidate_3792942.emb
Loading embeddings from candidate_20159414.emb
Loading embeddings from candidate_5324521.emb
Loading embeddings from query_14068125.emb
Loading embeddings from query_14846121.emb
Loading embeddings from query_13582167.emb
inside metric
example=Example({'query_file': 14068125, 'candidate_file': '1432085', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_7487588.emb


Average Metric: 14 / 17  (82.4):  13%|█▎        | 17/130 [00:18<01:24,  1.33it/s]

Loading embeddings from query_1604520.emb
Loading embeddings from query_10124817.emb
inside metric
example=Example({'query_file': 13582167, 'candidate_file': '3792942', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14846121, 'candidate_file': '12857779', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 1604520, 'candidate_file': '20159414', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_9951086.emb
inside metric
example=Example({'query_file': 10124817, 'candidate_file': '5324521', 'c

Average Metric: 16 / 19  (84.2):  14%|█▍        | 18/130 [00:19<01:31,  1.22it/s]

Loading embeddings from candidate_8806164.emb
Loading embeddings from candidate_18074692.emb
Loading embeddings from query_49541818.emb
Loading embeddings from candidate_16236733.emb


Average Metric: 16 / 21  (76.2):  15%|█▌        | 20/130 [00:20<01:11,  1.53it/s]

Loading embeddings from candidate_9022876.emb
Loading embeddings from candidate_5041738.emb
Loading embeddings from query_15811205.emb
inside metric
example=Example({'query_file': 49541818, 'candidate_file': '8806164', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_1733200.emb
Loading embeddings from query_929001.emb
inside metric
example=Example({'query_file': 15811205, 'candidate_file': '18074692', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_10822829.emb
Loading embeddings from candidate_11208402.emb
inside metric
example=Example({'query_file': 929001, 'candidate_file': '9022876', 'cites': False}) (input_keys={'candidate_file', 'query_fi

Average Metric: 19 / 24  (79.2):  18%|█▊        | 23/130 [00:22<01:03,  1.69it/s]

inside metric
example=Example({'query_file': 1733200, 'candidate_file': '16236733', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_14676755.emb
Loading embeddings from query_51880918.emb
Loading embeddings from candidate_7536915.emb
inside metric
example=Example({'query_file': 14676755, 'candidate_file': '15890292', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 51880918, 'candidate_file': '11208402', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_49363931.

Average Metric: 20 / 25  (80.0):  18%|█▊        | 24/130 [00:23<01:05,  1.62it/s]

Loading embeddings from candidate_21723747.emb
inside metric
example=Example({'query_file': 49363931, 'candidate_file': '7536915', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 20 / 26  (76.9):  19%|█▉        | 25/130 [00:23<01:01,  1.70it/s]

Loading embeddings from candidate_6530718.emb
Loading embeddings from query_11718977.emb


Average Metric: 21 / 27  (77.8):  21%|██        | 27/130 [00:25<01:14,  1.39it/s]

Loading embeddings from query_17958964.emb
Loading embeddings from candidate_14072234.emb
Loading embeddings from candidate_17942323.emb
inside metric
example=Example({'query_file': 11718977, 'candidate_file': '21723747', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 17958964, 'candidate_file': '6530718', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_18428799.emb
Loading embeddings from candidate_891339.emb
Loading embeddings from query_13266306.emb


Average Metric: 24 / 30  (80.0):  22%|██▏       | 29/130 [00:26<01:04,  1.57it/s]

inside metric
example=Example({'query_file': 18428799, 'candidate_file': '14072234', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_4661204.emb
inside metric
example=Example({'query_file': 4661204, 'candidate_file': '891339', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 13266306, 'candidate_file': '17942323', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_14292825.emb
Loading embeddings from candidate_4533859.emb
Loading embeddings from candidate_6592413.emb
Loading embeddings from que

Average Metric: 24 / 30  (80.0):  23%|██▎       | 30/130 [00:27<01:11,  1.39it/s]

Loading embeddings from query_29268022.embLoading embeddings from query_17958964.emb

inside metric
example=Example({'query_file': 2261105, 'candidate_file': '14292825', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 17958964, 'candidate_file': '6592413', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 26 / 32  (81.2):  25%|██▍       | 32/130 [00:29<01:18,  1.24it/s]

Loading embeddings from candidate_9941351.emb
inside metric
example=Example({'query_file': 29268022, 'candidate_file': '4533859', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_17958964.emb
inside metric
example=Example({'query_file': 17958964, 'candidate_file': '9941351', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 27 / 33  (81.8):  25%|██▌       | 33/130 [00:30<01:30,  1.07it/s]

Loading embeddings from candidate_18456846.emb
Loading embeddings from query_22002351.emb


Average Metric: 28 / 35  (80.0):  26%|██▌       | 34/130 [00:31<01:30,  1.06it/s]

inside metric
example=Example({'query_file': 22002351, 'candidate_file': '18456846', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 29 / 36  (80.6):  27%|██▋       | 35/130 [00:32<01:18,  1.21it/s]

Loading embeddings from candidate_342649.emb
Loading embeddings from candidate_10636659.emb
Loading embeddings from candidate_53850543.emb


Average Metric: 32 / 39  (82.1):  29%|██▉       | 38/130 [00:34<01:02,  1.47it/s]

Loading embeddings from query_5749615.emb
Loading embeddings from query_31742648.emb
Loading embeddings from query_14638316.emb
inside metric
example=Example({'query_file': 5749615, 'candidate_file': '342649', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14638316, 'candidate_file': '10636659', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 31742648, 'candidate_file': '53850543', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_1915951.emb

Average Metric: 34 / 42  (81.0):  32%|███▏      | 41/130 [00:36<01:04,  1.38it/s]

Loading embeddings from query_3498240.emb
inside metric
example=Example({'query_file': 3498240, 'candidate_file': '1915951', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False],
    resolved=False
)
Loading embeddings from query_17630799.emb
inside metric
example=Example({'query_file': 17630799, 'candidate_file': '16219282', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 35 / 43  (81.4):  32%|███▏      | 42/130 [00:37<01:08,  1.29it/s]

Loading embeddings from candidate_215812180.emb
Loading embeddings from candidate_3812096.emb


Average Metric: 36 / 44  (81.8):  34%|███▍      | 44/130 [00:38<01:07,  1.27it/s]

Loading embeddings from candidate_5280973.emb
Loading embeddings from query_1325297.emb
Loading embeddings from query_2197102.emb
Loading embeddings from query_11283266.emb
inside metric
example=Example({'query_file': 1325297, 'candidate_file': '215812180', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 39 / 47  (83.0):  36%|███▌      | 47/130 [00:40<00:55,  1.51it/s]

inside metric
example=Example({'query_file': 2197102, 'candidate_file': '3812096', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_7103019.emb
Loading embeddings from candidate_8954831.emb
Loading embeddings from candidate_207015066.emb
inside metric
example=Example({'query_file': 11283266, 'candidate_file': '5280973', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_3501880.emb
Loading embeddings from query_11718977.emb
Loading embeddings from query_14846121.emb
inside metric
example=Example({'query_file': 3501880, 'candidate_file': '7103019', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions

Average Metric: 42 / 52  (80.8):  39%|███▉      | 51/130 [00:42<00:44,  1.79it/s]

inside metric
example=Example({'query_file': 11718977, 'candidate_file': '8954831', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14846121, 'candidate_file': '207015066', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_9673612.emb
Loading embeddings from candidate_15538672.emb
inside metric
example=Example({'query_file': 9673612, 'candidate_file': '296275', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_53475.emb
Loading embeddings from candidate_12559157.emb
inside metr

Average Metric: 43 / 53  (81.1):  40%|████      | 52/130 [00:44<00:44,  1.74it/s]

inside metric
example=Example({'query_file': 49363931, 'candidate_file': '12559157', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_13663736.emb


Average Metric: 43 / 53  (81.1):  41%|████      | 53/130 [00:44<01:05,  1.17it/s]

Loading embeddings from candidate_2261077.emb
Loading embeddings from query_27985376.emb
Loading embeddings from query_53475.emb
Loading embeddings from candidate_6664074.emb


Average Metric: 44 / 54  (81.5):  41%|████      | 53/130 [00:46<01:05,  1.17it/s]

inside metric
example=Example({'query_file': 53475, 'candidate_file': '2261077', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 27985376, 'candidate_file': '13663736', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_12808563.emb
Loading embeddings from candidate_5182310.emb


Average Metric: 45 / 55  (81.8):  42%|████▏     | 54/130 [00:46<01:27,  1.16s/it]

Loading embeddings from query_111335.embLoading embeddings from candidate_7318157.emb

inside metric
example=Example({'query_file': 12808563, 'candidate_file': '6664074', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 46 / 56  (82.1):  43%|████▎     | 56/130 [00:48<01:14,  1.01s/it]

Loading embeddings from candidate_9719320.embLoading embeddings from query_888444.emb

inside metric
example=Example({'query_file': 111335, 'candidate_file': '5182310', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 888444, 'candidate_file': '7318157', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_11283266.emb


Average Metric: 47 / 57  (82.5):  44%|████▍     | 57/130 [00:49<01:18,  1.08s/it]

inside metricLoading embeddings from candidate_14191393.emb

example=Example({'query_file': 11283266, 'candidate_file': '9719320', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Error for example in dev set: 		 negative seek value -1
Loading embeddings from query_14068125.emb


Average Metric: 49 / 59  (83.1):  45%|████▌     | 59/130 [00:50<01:03,  1.13it/s]

inside metric
example=Example({'query_file': 14068125, 'candidate_file': '14191393', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_45100885.emb
Loading embeddings from candidate_1901454.emb
Loading embeddings from candidate_11579427.emb
Loading embeddings from query_13680434.emb


Average Metric: 50.0 / 61  (82.0):  46%|████▌     | 60/130 [00:51<01:02,  1.11it/s]

Loading embeddings from query_1733200.emb
Loading embeddings from query_14024903.emb
inside metric
example=Example({'query_file': 13680434, 'candidate_file': '45100885', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 1733200, 'candidate_file': '1901454', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14024903, 'candidate_file': '11579427', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 51.0 / 62  (82.3):  48%|████▊     | 62/130 [00:53<00:53,  1.26it/s]

Loading embeddings from candidate_9794173.emb
Loading embeddings from candidate_5388558.emb


Average Metric: 51.0 / 63  (81.0):  48%|████▊     | 63/130 [00:53<00:52,  1.29it/s]

Loading embeddings from query_13266306.emb
Loading embeddings from query_46921483.emb
Loading embeddings from candidate_14285133.emb
inside metric
example=Example({'query_file': 13266306, 'candidate_file': '9794173', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 52.0 / 65  (80.0):  49%|████▉     | 64/130 [00:54<00:45,  1.46it/s]

inside metric
example=Example({'query_file': 46921483, 'candidate_file': '5388558', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_27985376.emb
Loading embeddings from candidate_10440730.emb
Loading embeddings from candidate_16655955.emb
Loading embeddings from candidate_655973.emb
inside metric
example=Example({'query_file': 27985376, 'candidate_file': '14285133', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_52012534.emb


Average Metric: 53.0 / 66  (80.3):  51%|█████     | 66/130 [00:55<00:38,  1.68it/s]

Loading embeddings from query_9951086.emb
Loading embeddings from candidate_3488076.emb
Loading embeddings from query_22002351.emb
Loading embeddings from query_14470504.emb
inside metric
example=Example({'query_file': 9951086, 'candidate_file': '16655955', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 22002351, 'candidate_file': '655973', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 55.0 / 68  (80.9):  52%|█████▏    | 68/130 [00:56<00:38,  1.63it/s]

inside metric
example=Example({'query_file': 52012534, 'candidate_file': '10440730', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14470504, 'candidate_file': '3488076', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 58.0 / 71  (81.7):  55%|█████▍    | 71/130 [00:59<00:44,  1.33it/s]

Loading embeddings from candidate_1778892.emb
Loading embeddings from query_11718977.emb
Loading embeddings from candidate_5357461.emb
Loading embeddings from candidate_15718014.emb
Loading embeddings from candidate_1402069.embLoading embeddings from query_8402000.emb

inside metric
example=Example({'query_file': 11718977, 'candidate_file': '1778892', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 59.0 / 73  (80.8):  55%|█████▌    | 72/130 [01:01<00:52,  1.11it/s]

Loading embeddings from query_13266306.emb
Loading embeddings from query_46921483.emb
inside metric
example=Example({'query_file': 8402000, 'candidate_file': '5357461', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_15184255.emb
inside metric
example=Example({'query_file': 46921483, 'candidate_file': '1402069', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 13266306, 'candidate_file': '15718014', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 60.0 / 74  (81.1):  57%|█████▋    | 74/130 [01:02<00:50,  1.12it/s]

Loading embeddings from query_17751516.emb
Loading embeddings from candidate_49558658.emb
inside metric
example=Example({'query_file': 17751516, 'candidate_file': '15184255', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_3041910.emb


Average Metric: 62.0 / 76  (81.6):  58%|█████▊    | 75/130 [01:03<00:51,  1.07it/s]

Loading embeddings from candidate_15613461.emb
inside metric
example=Example({'query_file': 3041910, 'candidate_file': '49558658', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_11984437.emb
Loading embeddings from candidate_10460485.emb
Loading embeddings from candidate_33036025.emb
Loading embeddings from candidate_3128453.emb


Average Metric: 63.0 / 77  (81.8):  59%|█████▉    | 77/130 [01:04<00:35,  1.48it/s]

inside metricLoading embeddings from candidate_14121435.emb

example=Example({'query_file': 11984437, 'candidate_file': '15613461', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_5299262.emb
Loading embeddings from query_11152703.emb
Loading embeddings from query_5971084.emb
Loading embeddings from candidate_5553697.emb
Loading embeddings from query_49541818.emb


Average Metric: 64.0 / 78  (82.1):  59%|█████▉    | 77/130 [01:04<00:35,  1.48it/s]

inside metric
example=Example({'query_file': 11152703, 'candidate_file': '33036025', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 5299262, 'candidate_file': '10460485', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_13808159.emb
inside metric
example=Example({'query_file': 5971084, 'candidate_file': '3128453', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 49541818, 'candidate_file': '14121435', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), re

Average Metric: 65.0 / 79  (82.3):  61%|██████    | 79/130 [01:05<00:39,  1.30it/s]

Loading embeddings from candidate_11166004.emb
inside metric
example=Example({'query_file': 13808159, 'candidate_file': '5553697', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 66.0 / 81  (81.5):  62%|██████▏   | 81/130 [01:07<00:43,  1.12it/s]

Loading embeddings from query_800241.emb
Loading embeddings from candidate_52008963.emb
inside metric
example=Example({'query_file': 800241, 'candidate_file': '11166004', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_4566846.emb


Average Metric: 68.0 / 84  (81.0):  64%|██████▍   | 83/130 [01:09<00:35,  1.31it/s]

inside metric
example=Example({'query_file': 4566846, 'candidate_file': '52008963', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_2274347.emb
Loading embeddings from query_888444.emb
Loading embeddings from candidate_10268887.emb


Average Metric: 70.0 / 86  (81.4):  66%|██████▌   | 86/130 [01:11<00:31,  1.38it/s]

Loading embeddings from candidate_6927139.emb
inside metric
example=Example({'query_file': 888444, 'candidate_file': '2274347', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_128024.emb
Loading embeddings from query_29268022.emb
Loading embeddings from query_62176925.emb
Loading embeddings from query_4714486.emb


Average Metric: 72.0 / 88  (81.8):  67%|██████▋   | 87/130 [01:12<00:36,  1.17it/s]

inside metric
example=Example({'query_file': 62176925, 'candidate_file': '6927139', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Error during API call: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
inside metric
example=Example({'query_file': 29268022, 'candidate_file': '10268887', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_9343097.emb
inside metric
example=Example({'query_file': 4714486, 'candidate_file': '128024', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predic

Average Metric: 72.0 / 90  (80.0):  69%|██████▉   | 90/130 [01:15<00:37,  1.06it/s]

Loading embeddings from query_19631671.emb
inside metric
example=Example({'query_file': 19631671, 'candidate_file': '17307323', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 73.0 / 91  (80.2):  70%|███████   | 91/130 [01:16<00:41,  1.06s/it]

Loading embeddings from candidate_49668609.emb
Loading embeddings from candidate_14898829.emb
Loading embeddings from query_10124817.emb
Loading embeddings from query_1323414.emb


Average Metric: 74.0 / 92  (80.4):  71%|███████   | 92/130 [01:19<01:00,  1.59s/it]

Loading embeddings from candidate_1011111.emb
inside metric
example=Example({'query_file': 10124817, 'candidate_file': '49668609', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 1323414, 'candidate_file': '14898829', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_53310159.emb
Loading embeddings from query_32722198.emb


Average Metric: 77.0 / 95  (81.1):  72%|███████▏  | 94/130 [01:21<00:41,  1.14s/it]

Loading embeddings from candidate_14068874.emb
inside metric
example=Example({'query_file': 32722198, 'candidate_file': '1011111', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_1604520.emb
Loading embeddings from query_51895111.emb
inside metric
example=Example({'query_file': 1604520, 'candidate_file': '53310159', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 77.0 / 96  (80.2):  73%|███████▎  | 95/130 [01:23<00:37,  1.06s/it]

inside metric
example=Example({'query_file': 51895111, 'candidate_file': '14068874', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_206611975.emb


Average Metric: 77.0 / 96  (80.2):  74%|███████▍  | 96/130 [01:23<00:42,  1.25s/it]

Error during API call: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Loading embeddings from query_29268022.emb
Loading embeddings from query_52148328.emb


Average Metric: 78.0 / 98  (79.6):  75%|███████▍  | 97/130 [01:26<00:53,  1.62s/it]

Error for example in dev set: 		 shapes (0,) and (1536,) not aligned: 0 (dim 0) != 1536 (dim 0)
inside metric
example=Example({'query_file': 52148328, 'candidate_file': '206611975', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 78.0 / 98  (79.6):  75%|███████▌  | 98/130 [01:27<00:48,  1.53s/it]

Loading embeddings from candidate_6726135.emb
Loading embeddings from query_17978942.emb
Loading embeddings from candidate_4332898.emb


Average Metric: 79.0 / 99  (79.8):  75%|███████▌  | 98/130 [01:29<00:48,  1.53s/it]

inside metric
example=Example({'query_file': 17978942, 'candidate_file': '6726135', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_9470579.emb


Average Metric: 80.0 / 100  (80.0):  76%|███████▌  | 99/130 [01:30<00:59,  1.92s/it]

Loading embeddings from candidate_52843977.emb
inside metric
example=Example({'query_file': 9470579, 'candidate_file': '4332898', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_23580923.emb


Average Metric: 81.0 / 101  (80.2):  77%|███████▋  | 100/130 [01:32<00:50,  1.68s/it]

inside metric
example=Example({'query_file': 23580923, 'candidate_file': '52843977', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 81.0 / 101  (80.2):  78%|███████▊  | 101/130 [01:32<00:46,  1.60s/it]

Loading embeddings from candidate_16311309.emb
Loading embeddings from candidate_206929190.emb
Loading embeddings from query_11984437.emb
Loading embeddings from query_1634096.emb


Average Metric: 83.0 / 103  (80.6):  78%|███████▊  | 102/130 [01:34<00:49,  1.77s/it]

inside metric
example=Example({'query_file': 11984437, 'candidate_file': '16311309', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_4718290.emb
inside metric
example=Example({'query_file': 1634096, 'candidate_file': '206929190', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_8402000.emb


Average Metric: 84.0 / 104  (80.8):  79%|███████▉  | 103/130 [01:36<00:39,  1.45s/it]

inside metric
example=Example({'query_file': 8402000, 'candidate_file': '4718290', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 84.0 / 104  (80.8):  80%|████████  | 104/130 [01:37<00:42,  1.63s/it]

Loading embeddings from candidate_1225103.emb
Loading embeddings from query_3081080.emb


Average Metric: 85.0 / 105  (81.0):  81%|████████  | 105/130 [01:42<01:03,  2.55s/it]

inside metric
example=Example({'query_file': 3081080, 'candidate_file': '1225103', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_8835382.emb
Loading embeddings from query_888444.emb


Average Metric: 86.0 / 106  (81.1):  81%|████████  | 105/130 [01:44<01:03,  2.55s/it]

inside metric
example=Example({'query_file': 888444, 'candidate_file': '8835382', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_25531666.emb
Loading embeddings from candidate_28347739.emb


Average Metric: 86.0 / 106  (81.1):  82%|████████▏ | 106/130 [01:45<01:08,  2.85s/it]

Loading embeddings from query_14638316.emb
Loading embeddings from query_21724528.emb


Average Metric: 87.0 / 107  (81.3):  82%|████████▏ | 106/130 [01:46<01:08,  2.85s/it]

inside metric
example=Example({'query_file': 14638316, 'candidate_file': '25531666', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_393948.emb


Average Metric: 88.0 / 108  (81.5):  82%|████████▏ | 107/130 [01:48<00:56,  2.45s/it]

inside metric
example=Example({'query_file': 21724528, 'candidate_file': '28347739', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_1181640.emb
Loading embeddings from query_586137.emb


Average Metric: 88.0 / 109  (80.7):  84%|████████▍ | 109/130 [01:49<00:38,  1.81s/it]

inside metricLoading embeddings from query_1520250.emb

example=Example({'query_file': 586137, 'candidate_file': '393948', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_12425538.emb
Loading embeddings from candidate_6259144.emb
inside metric
example=Example({'query_file': 1520250, 'candidate_file': '1181640', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_10148124.emb


Average Metric: 88.0 / 110  (80.0):  85%|████████▍ | 110/130 [01:50<00:29,  1.46s/it]

Loading embeddings from candidate_5591866.embLoading embeddings from query_800241.emb

Loading embeddings from candidate_1657636.emb


Average Metric: 89.0 / 111  (80.2):  85%|████████▌ | 111/130 [01:51<00:27,  1.44s/it]

inside metricLoading embeddings from candidate_14864236.emb

example=Example({'query_file': 800241, 'candidate_file': '6259144', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_5971084.emb
inside metric
example=Example({'query_file': 10148124, 'candidate_file': '12425538', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_12018209.emb
Loading embeddings from query_14470504.emb


Average Metric: 91.0 / 113  (80.5):  87%|████████▋ | 113/130 [01:53<00:19,  1.13s/it]

inside metric
example=Example({'query_file': 5971084, 'candidate_file': '5591866', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 12018209, 'candidate_file': '1657636', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14470504, 'candidate_file': '14864236', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_8861952.emb


Average Metric: 93.0 / 115  (80.9):  88%|████████▊ | 114/130 [01:54<00:15,  1.05it/s]

Loading embeddings from candidate_6601979.emb
Loading embeddings from query_1634096.emb
Loading embeddings from candidate_3654334.emb
Loading embeddings from query_17958964.emb


Average Metric: 94.0 / 116  (81.0):  88%|████████▊ | 115/130 [01:55<00:12,  1.17it/s]

Loading embeddings from query_929001.emb
inside metric
example=Example({'query_file': 1634096, 'candidate_file': '8861952', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 17958964, 'candidate_file': '6601979', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 94.0 / 117  (80.3):  89%|████████▉ | 116/130 [01:56<00:12,  1.13it/s]

Loading embeddings from candidate_15475.emb
inside metric
example=Example({'query_file': 929001, 'candidate_file': '3654334', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_5304179.emb


Average Metric: 95.0 / 119  (79.8):  91%|█████████ | 118/130 [01:57<00:11,  1.03it/s]

Loading embeddings from query_27985376.emb
Loading embeddings from query_51895111.emb
inside metric
example=Example({'query_file': 27985376, 'candidate_file': '15475', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 51895111, 'candidate_file': '5304179', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_8767528.emb


Average Metric: 96.0 / 120  (80.0):  92%|█████████▏| 119/130 [01:58<00:09,  1.11it/s]

Loading embeddings from candidate_19592640.emb
Loading embeddings from candidate_25364178.emb


Average Metric: 96.0 / 120  (80.0):  92%|█████████▏| 120/130 [01:59<00:08,  1.16it/s]

Loading embeddings from query_1634096.emb
Loading embeddings from query_49907522.emb
Loading embeddings from query_24927879.emb


Average Metric: 97.0 / 121  (80.2):  92%|█████████▏| 120/130 [01:59<00:08,  1.16it/s]

inside metric
example=Example({'query_file': 1634096, 'candidate_file': '19592640', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 99.0 / 123  (80.5):  95%|█████████▍| 123/130 [02:01<00:05,  1.21it/s]

inside metric
example=Example({'query_file': 49907522, 'candidate_file': '8767528', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 24927879, 'candidate_file': '25364178', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from candidate_8728653.emb
Loading embeddings from candidate_1087898.emb
Loading embeddings from candidate_38794958.emb
Loading embeddings from query_32722198.embLoading embeddings from query_1520250.emb

Loading embeddings from candidate_18470977.emb
Loading embeddings from query_14676755.emb
Loading embeddings from candidate_16545839.emb


Average Metric: 100.0 / 124  (80.6):  95%|█████████▌| 124/130 [02:03<00:06,  1.04s/it]

Loading embeddings from query_14470504.emb
inside metric
example=Example({'query_file': 32722198, 'candidate_file': '1087898', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 14676755, 'candidate_file': '38794958', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 1520250, 'candidate_file': '8728653', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Loading embeddings from query_19631671.emb


Average Metric: 103.0 / 127  (81.1):  98%|█████████▊| 127/130 [02:03<00:01,  1.83it/s]

inside metric
example=Example({'query_file': 14470504, 'candidate_file': '18470977', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
inside metric
example=Example({'query_file': 19631671, 'candidate_file': '16545839', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 104.0 / 128  (81.2):  98%|█████████▊| 128/130 [02:04<00:00,  2.18it/s]

Loading embeddings from candidate_8412218.emb
Loading embeddings from query_10148124.emb


Average Metric: 105.0 / 129  (81.4):  99%|█████████▉| 129/130 [02:13<00:03,  3.01s/it]

inside metric
example=Example({'query_file': 10148124, 'candidate_file': '8412218', 'cites': False}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)


Average Metric: 105.0 / 130  (80.8): 100%|██████████| 130/130 [02:13<00:00,  1.03s/it]

Loading embeddings from candidate_2127100.emb
Loading embeddings from query_53079158.emb
inside metric
example=Example({'query_file': 53079158, 'candidate_file': '2127100', 'cites': True}) (input_keys={'candidate_file', 'query_file'}), result=Prediction(
    predictions=[False, False, False, False, False, False, False, False, False, False],
    resolved=False
)
Average Metric: 105.0 / 130  (80.8%)





80.77

In [24]:
llm.inspect_history(n=5)





Predict if the two chunks are related by a citation.

---

Follow the following format.

Query Chunk: Query chunk to compare to the candidate chunk.
Candidate Chunk: Candidate chunk to compare to the query chunk.
Answer: either True or False (Respond with a single bool value)

---

Query Chunk: erwithmoreneighborsinthefor- eignlanguage. Followingthemethodin Zhuetal.(2017),there will be no edge between Chinese entity 福斯特 (Foust)and English entity Pistons, which implies awrongfactthat 福斯特 (Foust)doesnotbelong toPistons. Ourmethodenrichesthemissingrela- tion between entities 福斯特 (Foust)and活塞队 (Pistons)in incomplete Chinese KB through cor- responding English common neighbors, Allstar, NBA,etc.,asillustratedinFigure 1. 4.2 Comparable Sentences Generation To supervise the cross-lingual representation learningofwords,weautomaticallygeneratecom- parable sentences as cross-lingual training data. Comparable sentences are not translated paired sentences,butsentenceswiththesametopicindif- fere

In [26]:
PdfReader('darwin/query_papers/53079158.pdf').pages[-1].extract_text()

'237TomasMikolov,QuocVLe,andIlyaSutskever.2013b.\nExploitingsimilaritiesamonglanguagesformachine\ntranslation. CoRR.\nTomas Mikolov, Ilya Sutskever, Kai Chen, Gregory S.\nCorrado, and Jeffrey Dean. 2013c. Distributed rep-\nresentations of words and phrases and their compo-\nsitionality. In NIPS.\nDavid Milne and Ian H. Witten. 2008. An effective,\nlow-cost measure of semantic relatedness obtained\nfromwikipedialinks. In AAAI.\nMike Mintz, Steven Bills, Rion Snow, and Daniel Ju-\nrafsky.2009. Distantsupervisionforrelationextrac-\ntionwithoutlabeleddata. In ACL/IJCNLP .\nAditya Mogadala and Achim Rettinger. 2016. Bilin-\ngual word embeddings from parallel and non-\nparallel corpora for cross-language text classiﬁca-\ntion. In HLT-NAACL .\nThien Huu Nguyen, Nicolas Fauceglia, Mariano Ro-\ndriguez Muro, Oktie Hassanzadeh, Alﬁo Massimil-\nianoGliozzo,andMohammadSadoghi.2016. Joint\nlearning of local and global features for entity link-\ningvianeuralnetworks. In COLING.\nSebastian Ruder, Iva