In [3]:
import io
import requests
import pandas as pd
import re

In [10]:
def getHtml(doc_num, mid_string, timeout=30):
  """Download one arXiv abstract page and return its HTML as text.

  Args:
    doc_num: Document number within the listing; left-padded with zeros
      to five digits to form the identifier.
    mid_string: Identifier prefix placed before the dot (e.g. '2304').
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a single stalled connection would block the crawl forever.

  Returns:
    The response body as a string. HTTP error statuses are not raised
    here; whatever body the server sent is returned as-is.

  Raises:
    requests.RequestException: on connection problems or timeout.
  """
  doc_id = str(doc_num).zfill(5)
  url = "http://arxiv.org/abs/" + mid_string + "." + doc_id
  response = requests.get(url, timeout=timeout)
  return response.text

def getTitle(html):
  """Return the text between ``<title>`` and ``</title>``, minus a fixed prefix.

  Raises ValueError when either tag is missing.
  """
  # 20 = len('<title>') (7) plus 13 further characters that are skipped.
  # NOTE(review): presumably the 13 characters are a '[YYMM.NNNNN] '
  # identifier prefix in the page title -- confirm against a live page.
  title_start = html.index('<title>') + 20
  title_end = html.index('</title>')
  return html[title_start:title_end]

def getAbstract(text):
    """Extract the abstract from the page's ``citation_abstract`` meta tag.

    The value runs from just after the opening quote of the ``content``
    attribute up to the first '" />' that follows it. Hyphens are removed
    and newlines become spaces.

    Previously the hyphen-stripped string was overwritten by the newline
    replacement (both started again from the raw abstract), so hyphens were
    never removed; the two replacements are now chained.

    Raises:
        ValueError: if the meta tag or its closing '" />' is not found.
    """
    marker = '"citation_abstract" content='
    # +1 skips the opening double quote of the attribute value.
    start_ind = text.index(marker) + len(marker) + 1
    # Search forward from the value start instead of collecting every
    # '" />' occurrence on the page just to take the first one.
    end_ind = text.index('" />', start_ind)
    abstract = text[start_ind:end_ind]
    return abstract.replace('-', '').replace('\n', ' ')

def getAuthors(html):
  """Collect author names from the ``citation_author`` meta tags.

  The slice between the first 'citation_author' and the
  '"citation_date" content="' marker is scanned for double-quoted values.
  Each value has its spaces removed and is split on commas, so the result
  is a list of lists of name parts.

  Raises ValueError if either marker is missing and IndexError if the
  slice holds an odd number of double quotes.
  """
  begin = html.index('citation_author') + 16
  finish = html.index('"citation_date" content="')
  # Drop the repeated name="citation_author" values so that only the
  # quotes delimiting the content attributes remain.
  segment = html[begin:finish].replace('"citation_author"', '')
  quotes = [hit.start() for hit in re.finditer(r'"', segment)]
  return [
    segment[quotes[k] + 1:quotes[k + 1]].replace(' ', '').split(',')
    for k in range(0, len(quotes), 2)
  ]

def getDate(text):
    """Read the ``citation_date`` meta tag and split out year and month.

    Returns a three-item list ``[date_string, year, month]`` where year is
    parsed from the first four characters of the date string and month
    from characters 5-6.

    Raises ValueError if the marker is missing or the numbers do not parse,
    and IndexError if no closing '" />' follows the marker.
    """
    value_start = text.index('"citation_date" content="') + 25
    remainder = text[value_start:]
    closers = list(re.finditer(r'" />', remainder))
    value_end = closers[0].start() + value_start
    date = text[value_start:value_end]
    return [date, int(date[:4]), int(date[5:7])]

def getYear(arr):
  """Return the item at index 1 of ``arr`` (the year slot of a getDate result)."""
  year = arr[1]
  return year

def getMonth(arr):
  """Return the item at index 2 of ``arr`` (the month slot of a getDate result)."""
  month = arr[2]
  return month

def getLink(doc_num, mid_string='2301'):
  """Build the arXiv PDF URL for a document.

  Args:
    doc_num: Document number; left-padded with zeros to five digits.
    mid_string: Identifier prefix placed before the dot. Defaults to
      '2301', the value that used to be hard-coded, so existing
      one-argument calls return the same URL as before.

  Returns:
    The URL string 'http://arxiv.org/pdf/<mid_string>.<doc_id>.pdf'.
  """
  doc_id = str(doc_num).zfill(5)
  return "http://arxiv.org/pdf/" + mid_string + "." + doc_id + ".pdf"

In [7]:
def buildArchive(archive, num_of_docs, starting_num, mid_string):
  """Scrape arXiv abstract pages into a DataFrame, one row per document.

  Documents numbered ``starting_num`` through ``num_of_docs`` (inclusive)
  are fetched under the ``mid_string`` prefix. Every field is extracted on
  a best-effort basis: a field that cannot be parsed gets an empty default
  instead of aborting the crawl.

  Args:
    archive: Existing DataFrame. It is only returned (unchanged) when the
      very first fetch fails; otherwise the result contains just the rows
      scraped by this call.
    num_of_docs: Despite the name this is the LAST document number to
      fetch, not a count -- the loop runs while doc_num <= num_of_docs.
    starting_num: First document number to fetch.
    mid_string: arXiv identifier prefix, e.g. '2304'.

  Returns:
    DataFrame with columns DocId, Title, Date, Month, Year, Authors,
    Abstract, Link and Cornell_Index.
  """
  import warnings

  rows = []
  fetch_failed = False
  for doc_num in range(starting_num, num_of_docs + 1):
    # `except Exception` (not a bare except) so that Ctrl-C / kernel
    # interrupt still stops a long crawl.
    try:
      html = getHtml(doc_num, mid_string)
    except Exception as err:
      # Stop crawling, but keep what was collected so far. Previously the
      # input `archive` was returned here and all scraped rows were lost.
      warnings.warn(
          "buildArchive stopped at document {}: {!r}".format(doc_num, err))
      fetch_failed = True
      break

    row = {'DocId': doc_num}
    try:
      row['Title'] = getTitle(html)
    except Exception:
      row['Title'] = ''
    try:
      date_info = getDate(html)
      month, year = date_info[2], date_info[1]
    except Exception:
      # Keep Month/Year in step with the Date fallback; they used to be
      # left unset, producing NaN (or missing columns) in the frame.
      date_info, month, year = ['', 0, 0], 0, 0
    row['Date'] = date_info
    row['Month'] = month
    row['Year'] = year
    try:
      row['Authors'] = getAuthors(html)
    except Exception:
      row['Authors'] = []
    try:
      row['Abstract'] = getAbstract(html)
    except Exception:
      row['Abstract'] = ''
    doc_id = str(doc_num).zfill(5)
    row['Link'] = "http://arxiv.org/pdf/" + mid_string + "." + doc_id + ".pdf"
    row['Cornell_Index'] = mid_string
    rows.append(row)

  if fetch_failed and not rows:
    # Nothing was scraped: hand back the caller's frame untouched.
    return archive
  # Build the frame once from the list of dicts rather than growing it
  # row by row.
  return pd.DataFrame(rows)

In [13]:
# Start from an empty frame, then crawl the '2304' listing.
archive = pd.DataFrame()
# Argument order: buildArchive(archive, num_of_docs, starting_num, mid_string).
# Here 5000 is the last document number fetched and 1 the first; the call
# issues one HTTP request per document, so this cell is slow to re-run.
archive = buildArchive(archive, 5000, 1, '2304')

In [15]:
# Show the scraped archive as this cell's output.
archive

Embedding the archive

In [16]:
from transformers import BertModel, BertTokenizerFast
from utils.setupFunctions import getStopwords
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the BERT encoder from the local "Models/" directory.
mini_lm_model = BertModel.from_pretrained("Models/")
# Load the fast tokenizer from "Models/Tokenizer".
mini_lm_tokenizer = BertTokenizerFast.from_pretrained("Models/Tokenizer")
# Stop-word collection supplied by utils.setupFunctions; preprocess() tests
# lower-cased words against it with `in`.
# NOTE(review): its concrete type (list vs. set) is not visible here -- confirm.
stopwords = getStopwords()

In [None]:
def preprocess(text, stop_words=None):
  """Split text into words, keeping alphanumeric non-stop-words.

  Args:
    text: Input string.
    stop_words: Optional collection of lower-case words to drop. When
      omitted, the module-level ``stopwords`` is used, so existing
      one-argument calls behave as before.

  Returns:
    List of the surviving words in their original case and order. A word
    is kept only if ``str.isalnum()`` is true for it, so tokens carrying
    punctuation are discarded.
  """
  if stop_words is None:
    stop_words = stopwords
  # split() with no argument breaks on any whitespace run. Splitting on a
  # single space left newlines/tabs inside words, which then failed
  # isalnum() and were silently dropped.
  return [
    word for word in text.split()
    if word.isalnum() and word.lower() not in stop_words
  ]

In [None]:
# Turn titles and abstracts into lists of kept words.
archive['Title_processed'] = archive['Title'].apply(preprocess)
archive['Abstract_processed'] = archive['Abstract'].apply(preprocess)
# preprocess() returns lists, so the two columns are combined by list
# concatenation. Inserting a ' ' string between them raised TypeError
# (list + str), and the tokenizer call downstream takes a list of words.
archive['TandA'] = archive['Title_processed'] + archive['Abstract_processed']

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    ``model_output[0]`` supplies the token embeddings. The mask gets one
    trailing dimension added and is expanded to the embeddings' size, so
    masked-out positions contribute nothing to the sum. The divisor is
    clamped to at least 1e-9 to avoid division by zero when a row's mask
    is all zeros.

    NOTE(review): assumes embeddings are (batch, tokens, hidden) and the
    mask is (batch, tokens) -- confirm against the model in use.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total

def getEmbeddings(text, settings):
    """Embed ``text`` with the module-level BERT model and tokenizer.

    Uses the globals ``mini_lm_tokenizer`` and ``mini_lm_model``; the
    ``settings`` argument is accepted but not used. Input is tokenized with
    ``is_split_into_words=True``, padded, and truncated to 128 tokens.

    NOTE(review): ``text`` is presumably a list of words, given
    ``is_split_into_words=True`` -- confirm with callers.

    Returns the mean-pooled embedding tensor produced by ``mean_pooling``.
    """
    tokens = mini_lm_tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
        is_split_into_words=True,
    )

    # Forward pass only; gradients are not tracked.
    with torch.no_grad():
        hidden = mini_lm_model(**tokens)

    # Collapse the per-token vectors using the attention mask.
    return mean_pooling(hidden, tokens['attention_mask'])

In [None]:
# Object-dtype placeholder column so that each cell can hold one tensor.
archive['Embeddings'] = None
for i in archive.index:
  # Embed the combined title + abstract word list stored in 'TandA'.
  # getEmbeddings takes (text, settings) and ignores settings; the model and
  # tokenizer are read from module-level globals, so they are not passed.
  # The earlier three-argument call raised TypeError, and the 'raw_keywords'
  # column it read is not created anywhere in this notebook.
  archive.at[i, 'Embeddings'] = getEmbeddings(archive.at[i, 'TandA'], None)