In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the papers dataset that the rest of the notebook works from.
CSV_PATH = '/content/papers.csv'

# Parse the CSV with pandas' pure-Python parser engine.
df = pd.read_csv(
    filepath_or_buffer=CSV_PATH,
    engine='python',
)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,title,id,abstract,authors,url
0,0,constraint-preserving hybrid finite element me...,1907.00084,maxwell's equations describe the evolution of ...,"['yakov berchenko-kogan', 'ari stern']",https://arxiv.org/abs/1907.00084
1,1,rewriting structured cospans,2001.09029,to support the study of compositional networks...,['daniel cicala'],https://arxiv.org/abs/2001.09029
2,2,hybridization and postprocessing in finite ele...,2008.00149,we hybridize the methods of finite element ext...,"['gerard awanou', 'maurice fabien', 'johnny gu...",https://arxiv.org/abs/2008.00149
3,3,novel approach to synchronisation of wearable ...,2107.03147,synchronisation of wireless inertial measureme...,"['andreas spilz', 'michael munz']",https://arxiv.org/abs/2107.03147
4,4,functional equivariance and conservation laws ...,2111.10042,preservation of linear and quadratic invariant...,"['robert i. mclachlan', 'ari stern']",https://arxiv.org/abs/2111.10042


In [None]:
!pip install pytextrank spacy

Collecting pytextrank
  Downloading pytextrank-3.3.0-py3-none-any.whl.metadata (12 kB)
Collecting icecream>=2.1 (from pytextrank)
  Downloading icecream-2.1.8-py3-none-any.whl.metadata (1.5 kB)
Collecting colorama>=0.3.9 (from icecream>=2.1->pytextrank)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting executing>=2.1.0 (from icecream>=2.1->pytextrank)
  Downloading executing-2.2.1-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting asttokens>=2.0.1 (from icecream>=2.1->pytextrank)
  Downloading asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)
Downloading pytextrank-3.3.0-py3-none-any.whl (26 kB)
Downloading icecream-2.1.8-py3-none-any.whl (15 kB)
Downloading asttokens-3.0.0-py3-none-any.whl (26 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading executing-2.2.1-py2.py3-none-any.whl (28 kB)
Installing collected packages: executing, colorama, asttokens, icecream, pytextrank
Successfully installed asttokens-3.0.0 colorama-0.4.6 executing-2.2

In [None]:
import pytextrank
import spacy

# Cohere's embedding model has a 512 token context window.
# Title + abstract is likely much longer than that, so each paper is condensed
# with TextRank sentence summarisation until it fits the estimated budget.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

TOKENS_PER_WORD = 1.5  # Guesstimate
CONTEXT_WINDOW = 512   # Specific to Cohere's embedding model

# A missing title or abstract is NaN in pandas, and NaN + str stays NaN; spaCy
# rejects non-string input, so blank out missing values before concatenating.
texts = df['title'].fillna('') + ' ' + df['abstract'].fillna('')

# trimmed_data[k] is the summary for the k-th row of df (same order as texts).
trimmed_data = []

for doc in nlp.pipe(texts, batch_size=32):
    tr = doc._.textrank
    tokens = 0
    summary_sentences = []
    for sentence in tr.summary(limit_sentences=10, preserve_order=True):
        # Whitespace word count scaled by the tokens-per-word estimate.
        sentence_tokens = TOKENS_PER_WORD * len(sentence.text.split())
        # Stop *before* adding a sentence that would overflow the budget.
        # The first sentence is always kept so no paper ends up with an empty summary.
        if summary_sentences and tokens + sentence_tokens > CONTEXT_WINDOW:
            break
        summary_sentences.append(sentence.text)
        tokens += sentence_tokens
    trimmed_data.append(" ".join(summary_sentences))


In [None]:
!pip install pylatexenc

Collecting pylatexenc
  Downloading pylatexenc-2.10.tar.gz (162 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/162.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pylatexenc
  Building wheel for pylatexenc (setup.py) ... [?25l[?25hdone
  Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136817 sha256=0c385ef1118ff9b36be025309a4fcee8a8d5ec505909aae487a64ffb21f29dfa
  Stored in directory: /root/.cache/pip/wheels/06/3e/78/fa1588c1ae991bbfd814af2bcac6cef7a178beee1939180d46
Successfully built pylatexenc
Installing collected packages: pylatexenc
Successfully installed pylatexenc-2.10


In [None]:
from pylatexenc.latex2text import LatexNodes2Text

# Some LaTeX commands/formatting cause a 400 error when POST'ing to the embedding API
# Parse out LaTeX, leaving only natural language
MAX_CHARS = 2048  # API limitation, can only have 2048 chars

conv = LatexNodes2Text()
data = []
# Documents that fail to convert are skipped, so positions in `data` can drift
# away from positions in `trimmed_data`. Keep the mapping explicit:
# data_row_ids[k] is the index in trimmed_data that data[k] was built from.
data_row_ids = []
skipped_rows = []
for row_id, latex_doc in enumerate(trimmed_data):
  try:
    raw_text = conv.latex_to_text(latex_doc)
  except IndexError:
    # Conversion failed on this document; remember which row was dropped
    # instead of discarding it without a trace.
    skipped_rows.append(row_id)
    continue
  data.append(raw_text[:MAX_CHARS])
  data_row_ids.append(row_id)

if skipped_rows:
  print(f'Skipped {len(skipped_rows)} document(s) that could not be converted from LaTeX: '
        f'{skipped_rows[:10]}{" ..." if len(skipped_rows) > 10 else ""}')


In [None]:
import time
import tqdm
import requests
from google.colab import userdata
import json
import numpy as np

# pickle is needed for the checkpoint below; it is imported here because no
# earlier cell imports it, so a fresh-kernel run would otherwise hit a NameError.
import pickle

MODEL_NAME = 'cohere-embed-multilingual'
BATCH_SIZE = 96 # Maximum supported by Cohere's free API
ENDPOINT = 'https://us.inference.heroku.com/v1/embeddings'
HEADER = {
    "Authorization": f"Bearer {userdata.get('EMBEDDING_KEY')}",
    "Content-Type": "application/json"
}
MAX_ATTEMPTS = 4       # Tries per batch before giving up
REQUEST_TIMEOUT = 60   # Seconds; without a timeout a stalled connection hangs forever

# embeddings collects (position in `data`, embedding vector) pairs.
embeddings = []
for i in tqdm.tqdm(range(0, len(data), BATCH_SIZE)):
  payload = {
          "model": MODEL_NAME,
          "input": data[i:i+BATCH_SIZE],
          "input_type": "search_document",
          "allow_ignored_params": True
  }
  for attempt in range(MAX_ATTEMPTS): #backoff for rate limits
    try:
      resp = requests.post(url=ENDPOINT, headers=HEADER, json=payload,
                           timeout=REQUEST_TIMEOUT)
      resp.raise_for_status()
      for idx, embedding in enumerate(resp.json()['data']):
        embeddings.append((i+idx, embedding['embedding']))
      break
    except requests.exceptions.HTTPError as e:
      # The HTTP status lives on e.response; e.errno is None for these
      # errors, so comparing it against 429 never triggers the back-off.
      status = e.response.status_code if e.response is not None else None
      if status == 429:
        time.sleep(2 ** attempt)
      else:
        print(f'Failure in batch {i}')
        break
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
      # Transient network problem: back off and retry like a rate limit.
      time.sleep(2 ** attempt)
  else:
    # Every attempt was rate limited or timed out; this batch has no embeddings.
    print(f'Failure in batch {i}: gave up after {MAX_ATTEMPTS} attempts')

  # Checkpoint after every batch so progress survives an interrupted run.
  with open('embeddings.pkl', 'wb') as fp:
    pickle.dump(embeddings, fp)


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
import pickle

# Read the embeddings checkpoint back from disk.
# Unpickling can execute arbitrary code, so only load a file this notebook wrote.
with open('embeddings.pkl', mode='rb') as checkpoint:
  embeddings = pickle.load(checkpoint)

In [None]:
# We'll use FAISS for fast retrieval in kNN
import faiss
import numpy as np

# `embeddings` holds (row_position, vector) pairs, so the vectors have to be
# unpacked first: len(embeddings[0]) is the pair length (2), not the dimension.
if not embeddings:
  raise ValueError('No embeddings available to index')

# FAISS expects a 2-D float32 array of shape (n_vectors, dimension).
vectors = np.asarray([vector for _, vector in embeddings], dtype='float32')

# Flat inner-product index. Vectors are added in list order, so the FAISS id
# of a vector is its position in `embeddings`.
# NOTE(review): if any batch failed upstream, that position differs from the
# stored row_position - confirm how search results are mapped back to papers.
paper_index = faiss.IndexFlatIP(vectors.shape[1])
paper_index.add(vectors)
faiss.write_index(paper_index, 'paper_index.faiss')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop(columns=df.columns[0], inplace=True)


(57792, 5)
title       57787
id          57792
abstract    57790
authors     55971
url         57792
dtype: int64


In [None]:
# Save the trimmed frame to papers_trimmed.csv.
# NOTE(review): df2 is not defined in any visible cell, so this fails on a
# fresh-kernel run - confirm where it is built.
# NOTE(review): to_csv writes the index by default, which reads back as an
# extra 'Unnamed: 0' column; consider index=False if that is not wanted.
df2.to_csv('papers_trimmed.csv')