In [2]:
"""
https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data
	
"""
import requests
from pprint import pprint
import pandas as pd

## Paper Query

In [None]:
""" 
{
"paperId": "b0d555a9ea67285fccd2ef8d887907bcc811f67a",
"title": "PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents"
}
] 

- offset: int
- limit: int
- query: string[required]
- fields: string; comma-separated list of fields to return
	- paperId
	- url
	- title
	- abstract
	- venue
	- year
	- referenceCount
	- citationCount
	- influentialCitationCount
	- isOpenAccess
	- fieldsOfStudy
	- s2FieldsOfStudy
	- authors 
		- authorId
		- name
	- tldr
	- embedding
	

"""


In [None]:
""" https://api.semanticscholar.org/graph/v1/paper/b0d555a9ea67285fccd2ef8d887907bcc811f67a?fields=paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract,tldr,embedding
 """


In [60]:
from dataclasses import dataclass

@dataclass
class Paper:
	paper_id: str
	parent_id: str
	title: str
	reference_count: int = 0
	citation_count: int = 0
	influential_citation_count: int = 0
	published_date: str = None
	paper_type: str = None
	venue: str = None
	isOpenAccess: bool = False
	fieldsOfStudy: str = None
	abstract: str = None
	paper_link: str = None
	download_link: str = None
	tldr: str = None
	embedding: str = None
	


In [22]:
p = Paper(paper_id="b0d555a9ea67285fccd2ef8d887907bcc811f67a",
		  parent_id="",
		  title="PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents",
		  reference_count=0,
		  citation_count=0,
		  influential_citation_count=0,
		  published_date=None,
		  paper_type="",
		  venue="",
		  isOpenAccess=False,
		  fieldsOfStudy="",
		  abstract="",
		  paper_link="",
		  download_link="")

df = pd.DataFrame([p])
df.to_csv("papers.csv")

In [12]:
param = {
	"query": "PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents",
	"fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract",
	
}
res = requests.get(
    'https://api.semanticscholar.org/graph/v1/paper/search',params=param)

pprint(res.status_code)
pprint(res.json())

200
{'data': [{'abstract': 'The large and growing amounts of online scholarly data '
                       'present both challenges and opportunities to enhance '
                       'knowledge discovery. One such challenge is to '
                       'automatically extract a small set of keyphrases from a '
                       'document that can accurately describe the document’s '
                       'content and can facilitate fast information '
                       'processing. In this paper, we propose PositionRank, an '
                       'unsupervised model for keyphrase extraction from '
                       'scholarly documents that incorporates information from '
                       'all positions of a word’s occurrences into a biased '
                       'PageRank. Our model obtains remarkable improvements in '
                       'performance over PageRank models that do not take into '
                       'account word positions as well as

In [38]:
felids = ["computer science",'ai']
result = ''.join([f'{f},' for f in felids])
result

'computer science,ai,'

In [43]:
# https://api.semanticscholar.org/graph/v1/paper/b0d555a9ea67285fccd2ef8d887907bcc811f67a
param = {
	"fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract",
}
res = requests.get(
    'https://api.semanticscholar.org/graph/v1/paper/b0d555a9ea67285fccd2ef8d887907bcc811f67a', params=param)

pprint(res.status_code)
pprint(res.json())

data = res.json()
p = Paper(
    paper_id="b0d555a9ea67285fccd2ef8d887907bcc811f67a",
    parent_id="",
    title= data["title"],
    reference_count=data["referenceCount"],
    citation_count=data["citationCount"],
    influential_citation_count=data["influentialCitationCount"],
    published_date=data["year"],
    paper_type="Article",
    venue=data["venue"],
    isOpenAccess=data["isOpenAccess"],
    fieldsOfStudy=''.join([f'{f},' for f in data["fieldsOfStudy"]]),
    abstract=data["abstract"],
    paper_link=data["url"],
    download_link=""
)
df = pd.DataFrame([p])

200
{'abstract': 'The large and growing amounts of online scholarly data present '
             'both challenges and opportunities to enhance knowledge '
             'discovery. One such challenge is to automatically extract a '
             'small set of keyphrases from a document that can accurately '
             'describe the document’s content and can facilitate fast '
             'information processing. In this paper, we propose PositionRank, '
             'an unsupervised model for keyphrase extraction from scholarly '
             'documents that incorporates information from all positions of a '
             'word’s occurrences into a biased PageRank. Our model obtains '
             'remarkable improvements in performance over PageRank models that '
             'do not take into account word positions as well as over strong '
             'baselines for this task. Specifically, on several datasets of '
             'research papers, PositionRank achieves improvements as 

In [41]:
df

Unnamed: 0,paper_id,parent_id,title,reference_count,citation_count,influential_citation_count,published_date,paper_type,venue,isOpenAccess,fieldsOfStudy,abstract,paper_link,download_link
0,b0d555a9ea67285fccd2ef8d887907bcc811f67a,,PositionRank: An Unsupervised Approach to Keyp...,43,176,26,2017,Article,ACL,True,"Computer Science,",The large and growing amounts of online schola...,https://www.semanticscholar.org/paper/b0d555a9...,


In [42]:
df.to_csv("data/info_ss/info_full_ss.csv", index=False, header=False, mode="a")

In [29]:
class SemanticScholar:
	def __init__(self, api_key = None):
		self.api_key = api_key
		self.base_url = "https://api.semanticscholar.org/graph/v1/paper"
		
	def searchPaperByQuery(self, query):
		endpoint = self.getEndpointForSearch(query)
		res = requests.get(**endpoint)
		self.processSearchResult(res.json().get("data"))
	
	def getEndpointForSearch(self, query):
		param = {
                    "query": query,
                    "fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract",
                }
		return {
			"url": f"{self.base_url}/search",
			"params": param
		}
		
	def processSearchResult(self, papers):
		pprint(papers)
		

In [30]:
ss = SemanticScholar()
ss.searchPaperByQuery("PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents")

[{'abstract': 'The large and growing amounts of online scholarly data present '
              'both challenges and opportunities to enhance knowledge '
              'discovery. One such challenge is to automatically extract a '
              'small set of keyphrases from a document that can accurately '
              'describe the document’s content and can facilitate fast '
              'information processing. In this paper, we propose PositionRank, '
              'an unsupervised model for keyphrase extraction from scholarly '
              'documents that incorporates information from all positions of a '
              'word’s occurrences into a biased PageRank. Our model obtains '
              'remarkable improvements in performance over PageRank models '
              'that do not take into account word positions as well as over '
              'strong baselines for this task. Specifically, on several '
              'datasets of research papers, PositionRank achieves improve

## Paper Details

In [None]:
""" 
https://api.semanticscholar.org/graph/v1/paper/b0d555a9ea67285fccd2ef8d887907bcc811f67a/references?fields=paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract

 """

In [2]:
from dataclasses import dataclass
from collections import deque
from uuid import uuid4
@dataclass
class Paper:
	uuid: str
	paper_id: str
	parent_id: str
	title: str
	reference_count: int = 0
	citation_count: int = 0
	influential_citation_count: int = 0
	published_date: str = None
	paper_type: str = None
	venue: str = None
	isOpenAccess: bool = False
	abstract: str = None
	paper_link: str = None
	download_link: str = None
	tldr: str = None
	embedding: str = None


In [11]:
class SemanticScholar:
	def __init__(self, api_key=None):
		self.api_key = api_key
		self.base_url = "https://api.semanticscholar.org/graph/v1/paper"

	def searchPaperByQuery(self, query):
		endpoint = self.getEndpointForSearch(query)
		res = requests.get(**endpoint)
		self.processSearchResult(res.json().get("data"))

	def getEndpointForSearch(self, query):
		param = {
				"query": query,
				"fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,abstract",
			}
		return {
				"url": f"{self.base_url}/search",
				"params": param
			}

	def processSearchResult(self, papers):
		pprint(papers)
		
	def savePaperDetails(self, n=1):
		wt = pd.read_csv("data/info_ss/waiting_ss.csv")
		next_papers = wt[wt['paper_id'].notnull()][:n]
		next_papers_list = next_papers.to_dict('records')
		q = deque(next_papers_list)
		
		while q:
			p = q.popleft()
			uuid = p['uuid']
			parent_id = p['parent_id']
			paper_id = p['paper_id']
			endpoint = self.getEndpointForPaperDetails(paper_id)
			
			res = requests.get(**endpoint)
			print(res.status_code)
			print(res.reason)
			paperDf = self.processPaperDetails(
				res.json(), parent_id=parent_id, uuid=uuid)
			# print(paperDf.to_dict('records'))
			try:
				self.savePaperDetailsToFinalCsv(paperDf)
				self.awaitsReferences(res.json())
				self.removePaperFromWaitingList(paper_id)
			except Exception as e:
				print(e)
		
	def saveRefsDetails(self, paper_id):	
		# get references for paper
		refsDf,waitingDf = self.getReferencesForPaper(paper_id)
		self.saveReferencesToFinalCsv(refsDf)
		self.saveReferencesToWaitingCsv(waitingDf)
		
	def getEndpointForPaperDetails(self, paper_id):
		return {
			"url": f"{self.base_url}/{paper_id}",
			"params": {
				"fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract,tldr,embedding,references",
			}
		}

	def processPaperDetails(self, data,parent_id,uuid):
		# pprint(data)
		# print(data["title"])
		embedding_model = data['embedding']['model']
		embedding_vector = ''.join([f"{v}," for v in data['embedding']['vector']])
		embedding = embedding_model + "," + embedding_vector
		print(embedding)
		tldr_model = data['tldr']['model']
		tldr_text= data['tldr']['text']
		tldr = tldr_model + "," + tldr_text
		print(tldr)
		
		p = Paper(
			uuid=uuid,
			paper_id=data["paperId"],
			parent_id=parent_id,
			title=data["title"],
			reference_count=data["referenceCount"],
			citation_count=data["citationCount"],
			influential_citation_count=data["influentialCitationCount"],
			published_date=data["year"],
			paper_type="Article",
			venue=data["venue"],
			isOpenAccess=data["isOpenAccess"],
			abstract=data["abstract"],
			paper_link=data["url"],
			download_link="",
			tldr=tldr,
			embedding=embedding
		)
		return pd.DataFrame([p])

	def savePaperDetailsToFinalCsv(self, paperDf):
		# save to info_full
		paperDf.to_csv("data/info_ss/info_full_ss.csv",
		               index=False, header=False, mode="a")

	def awaitsReferences(self, res):
		refs = res['references']
		parent_id = res['paperId']
		papers = [{
			'uuid': uuid4(),
			'paper_id': ref['paperId'], 
			'parent_id': parent_id, 
			'title': ref['title']
			}
			for ref in refs]
		print(f"{len(papers)} references found")
		waitingDf = pd.DataFrame(papers)
		self.saveReferencesToWaitingCsv(waitingDf)
	
	def removePaperFromWaitingList(self, paper_id):
		wt = pd.read_csv("data/info_ss/waiting_ss.csv")
		indx = wt.index[wt['paper_id'] == paper_id]
		wt.drop(indx, inplace=True)
		wt.to_csv("data/info_ss/waiting_ss.csv", index=False)
	
	def getReferencesForPaper(self, paper_id):
		endpoint = self.getEndpointForReferences(paper_id)
		res = requests.get(**endpoint)
		refsDf = self.processReferences(res.json().get("data"), parent_id=paper_id)
		return refsDf

	def saveReferencesToFinalCsv(self, refsDf):
		# save to info_full
		refsDf.to_csv("data/info_ss/info_full_ss.csv",
		              index=False, header=False, mode="a")

	def saveReferencesToWaitingCsv(self, waitingDf):
		waitingDf.to_csv("data/info_ss/waiting_ss.csv",
		                 index=False, header=False, mode="a")
	
	def getEndpointForReferences(self, paper_id):
		return {
			"url": f"{self.base_url}/{paper_id}/references",
			"params": {
				"fields": "paperId,url,title,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,abstract",
			}
		}

	def processReferences(self, references, parent_id):
		print(f"{len(references)} references found")
		waiting_refs = []
		refs = []
		for ref in references:
			cp = ref.get("citedPaper")
			if cp['paperId']:
				waiting_refs.append(cp['paperId'])
			p = Paper(
				paper_id=cp["paperId"],
				parent_id=parent_id,
				title=cp["title"],
				reference_count=cp["referenceCount"],
				citation_count=cp["citationCount"],
				influential_citation_count=cp["influentialCitationCount"],
				published_date=cp["year"],
				paper_type="",
				venue=cp["venue"],
				isOpenAccess=cp["isOpenAccess"],
				abstract=cp["abstract"],
				paper_link=cp["url"],
				download_link=""
			)
			refs.append(p)
		return [pd.DataFrame(refs),pd.DataFrame(waiting_refs)]
		

In [13]:
ss = SemanticScholar()
# ss.savePaperDetails("b0d555a9ea67285fccd2ef8d887907bcc811f67a")
ss.savePaperDetails()

200
OK
specter@v0.1.1,-5.0251593589782715,-0.7668672204017639,3.24619197845459,-1.0699576139450073,2.3169546127319336,-0.025080472230911255,3.0044264793395996,1.562121868133545,-2.0360097885131836,0.927882194519043,-0.9849504828453064,0.08354397118091583,0.6153993010520935,2.4519529342651367,0.5937599539756775,1.0783541202545166,0.7500835657119751,-1.8462162017822266,2.287376880645752,-1.217930555343628,0.11329089105129242,0.4722680449485779,-2.7950921058654785,-1.9535257816314697,1.428817629814148,-1.9235090017318726,7.70772647857666,4.994366645812988,1.3278181552886963,1.4620716571807861,-0.3095279633998871,-4.983679294586182,1.2845163345336914,-0.6975395679473877,-2.7052910327911377,-4.623964309692383,0.9212973713874817,7.18109655380249,-1.3138294219970703,-0.3099125027656555,-2.054319143295288,-1.2314403057098389,3.8607091903686523,2.223982810974121,-1.8897730112075806,-1.9884285926818848,3.0197958946228027,2.0717334747314453,-3.8850531578063965,-1.0382630825042725,3.71438837051391

In [72]:
wt = pd.read_csv("data/info_ss/waiting_ss.csv")
wt.isnull().sum()

paper_id     14
parent_id     0
title         0
dtype: int64

In [74]:
print(wt.shape)
print(wt[wt['paper_id'].notnull()].shape)


(131, 3)
(117, 3)


In [53]:
# get all not null values
p = wt[wt['paper_id'].notnull()][:5]
print(p.shape)
p = p.to_dict('records')
p

(1, 3)


[{'paper_id': 'b0d555a9ea67285fccd2ef8d887907bcc811f67a',
  'parent_id': nan,
  'title': 'PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents'}]

In [54]:
q = deque(p)
q.popleft()

{'paper_id': 'b0d555a9ea67285fccd2ef8d887907bcc811f67a',
 'parent_id': nan,
 'title': 'PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents'}

In [9]:
id = '03589e1917debe6df148cac8963fd008e4140237'
indx = wt.index[wt['paper_id']== id]
wt.drop(indx, inplace=True)
wt.to_csv("data/info_ss/waiting_ss.csv", index=False)

## Download

In [145]:
import ssl
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
from urllib3.util import ssl_

CIPHERS = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-SHA256:AES256-SHA"


class TLSAdapter(HTTPAdapter):

    def __init__(self, ssl_options=0, *args, **kwargs):
        self.ssl_options = ssl_options
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        context = ssl_.create_urllib3_context(
            ciphers=CIPHERS, cert_reqs=ssl.CERT_REQUIRED, options=self.ssl_options)
        self.poolmanager = PoolManager(*args, ssl_context=context, **kwargs)


def download(url, fName):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
    }
    adapter = TLSAdapter(ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)

    with requests.session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)  
        response = session.get(url, headers=headers)
        # print(response.status_code)  # 200
        if response.status_code == 200:
            with open(f"data/papers_pdf/{fName}.pdf", 'wb') as f:
                f.write(response.content)
            session.close()
            return 0,0
        else:
            session.close()
            return response.status_code, response.reason

In [7]:
import requests
from requests.adapters import HTTPAdapter
from requests.adapters import HTTPAdapter, Retry
from requests.packages import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[403,406, 429, 500, 502, 503, 504],
)



def downloadSS(url, fName):
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
        "Accept": '*/*',
	}

	session = requests.Session()
	adapter = HTTPAdapter(max_retries=retry_strategy)
	session.mount('http://', adapter)
	session.mount('https://', adapter)
	res = requests.get(url, headers=headers,verify=False, timeout=100)

	if res.status_code == 200:
		with open(f"data/papers_pdf/{fName}.pdf", "wb") as f:
			f.write(res.content)
		session.close()
		return 0, 0
	else:
		session.close()
		return res.status_code, res.reason


In [41]:
code, msg = downloadSS("http://www.aclweb.org/anthology/P/P17/P17-1102.pdf",
                       "b0d555a9ea67285fccd2ef8d887907bcc811f67a")
print(code,msg)

0 0


In [8]:
# http://aclweb.org/anthology//D/D14/D14-1150.pdf    406 Not Acceptable
downloadSS("http://aclweb.org/anthology//D/D14/D14-1150.pdf",'new' )


(0, 0)

In [7]:
inf = pd.read_csv("data/info_ss/info_full_ss.csv")
# inf = pd.read_csv("data/info_ss/info_full_ss.csv", nrows=5000)

inf.shape


(26367, 18)

In [8]:
small = inf[['uuid', 'paper_id','title','paper_link','doi','download_link']]
small.shape


(26367, 6)

In [9]:
all_available_dl = small[small['download_link'].notnull()]
all_available_dl.shape

(26367, 6)

In [10]:
inf[inf['download_link'] == "No links found"].shape

(3327, 18)

In [11]:
all_available_dl.to_csv("data/info_ss/papers.csv", index=False)

In [208]:
def f(row):
	if not pd.isnull(row['download_link']):
		if row['download_link'].find("https:https") !=-1:
			row['download_link']=row['download_link'].replace("https:https://", "https://")
			# print(row['download_link'])
		if row['download_link'].find("https://sci-hub.sehttps:") != -1:
			row['download_link']=row['download_link'].replace("https://sci-hub.sehttps://", "https://")
			# print(row['download_link'])

	return row
	


In [1]:
inf = inf.apply(f,axis='columns')

NameError: name 'inf' is not defined

In [205]:
inf.shape

(25172, 18)

In [210]:
inf.to_csv("data/info_ss/info_full_ss.csv", index=False)

In [211]:
d = all_available_dl.apply(f, axis='columns')

In [195]:
# https://sci-hub.sehttps://moscow.sci-hub.se/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH;https:https://zero.sci-hub.st/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH

# https://sci.bban.top/pdf/10.1109/ICICTA.2008.180.pdf#view=FitH;https://sci-hub.sehttps://zero.sci-hub.se/4876/8910d66cc317ac627be0ef6bd2859b85/wang2008.pdf#navpanes=0&view=FitH;https:https://moscow.sci-hub.st/4876/8910d66cc317ac627be0ef6bd2859b85/wang2008.pdf#navpanes=0&view=FitH

s = "https://sci-hub.sehttps://moscow.sci-hub.se/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH;https:https://zero.sci-hub.st/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH"
print(s.find("https:https://"))
print(s.find("https://sci-hub.sehttps://"))

print(s.replace("https:https://","https://"))
print(s.replace("https://sci-hub.sehttps://","https://"))

118
0
https://sci-hub.sehttps://moscow.sci-hub.se/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH;https://zero.sci-hub.st/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH
https://moscow.sci-hub.se/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH;https:https://zero.sci-hub.st/2285/5b41d4d2d3b3c572c8324d5b731ad1bb/amitay2000.pdf#navpanes=0&view=FitH


In [131]:
dl = pd.read_csv("data/info_ss/downloaded.csv")
dl_dict = dl.to_dict('records')
print(len(dl_dict))
dl_dict[:3]


46


[{'uuid': '8bde6cb8-296d-4593-883f-b768103e1e69',
  'paper_id': 'b0d555a9ea67285fccd2ef8d887907bcc811f67a'},
 {'uuid': '77c4043a-1b88-4852-88b2-9994130f023e',
  'paper_id': 'f7fe3f870ef5e1a74600c8808c07732cd2e5142d'},
 {'uuid': '0d2c02bf-eece-4eba-9621-8128c107fcc4',
  'paper_id': 'ff30cca624b6a64d561310f564873a63fb413ef1'}]

In [132]:
uuids = [d['uuid'] for d in dl_dict]
uuids[:2]


['8bde6cb8-296d-4593-883f-b768103e1e69',
 '77c4043a-1b88-4852-88b2-9994130f023e']

In [133]:
paperIds = [d['paper_id'] for d in dl_dict]
paperIds[:2]

['b0d555a9ea67285fccd2ef8d887907bcc811f67a',
 'f7fe3f870ef5e1a74600c8808c07732cd2e5142d']

In [150]:
all_available_dl = inf[inf['download_link'].notnull()][:]
all_available_dl.shape

(78, 16)

In [135]:
next_dl = all_available_dl[~all_available_dl['uuid'].isin(uuids)]
next_dl.shape

(44, 16)

In [147]:
from time import sleep

dl = pd.read_csv("data/info_ss/downloaded.csv")
dl_dict = dl.to_dict('records')
uuids = [d['uuid'] for d in dl_dict]
paperIds = [d['paper_id'] for d in dl_dict]
all_available_dl = inf[inf['download_link'].notnull()]
next_dl = all_available_dl[~all_available_dl['uuid'].isin(uuids)]
next_dl_dict = next_dl.to_dict('records')[:2]

for d in next_dl_dict:
	download_links = d['download_link']
	paper_id = d['paper_id']
	uuid = d['uuid']
	links = download_links.split(";")
	for link in links:
		sleep(1)
		print(link)
		if link == 'No links found':
			dlDf = pd.DataFrame([{
				'uuid': uuid,
				'paper_id': paper_id,
			}])
			dlDf.to_csv("data/info_ss/downloaded.csv", index=False, header=False, mode="a")
		else:
			if paper_id in paperIds:
				dlDf = pd.DataFrame([{
								'uuid': uuid,
								'paper_id': paper_id,
							}])
				dlDf.to_csv("data/info_ss/downloaded.csv",
							index=False, header=False, mode="a")
				print("already downloaded")
				break
			else:
				try:
					code, msg = download(link, paper_id)
					if code == 0:
						dlDf = pd.DataFrame([{
										'uuid': uuid,
										'paper_id': paper_id,
									}])
						dlDf.to_csv("data/info_ss/downloaded.csv",
									index=False, header=False, mode="a")
						print("downloaded")
						break
					else:
						print(f"{code} {msg}")
				except Exception as e:
					print(e)
					# break

http://anthology.aclweb.org/C/C10/C10-1101.pdf
already downloaded
http://www.hlt.utdallas.edu/~vince/papers/acl14-keyphrase.pdf
already downloaded


## Graph

In [6]:
import pandas as pd
from __future__ import annotations
from collections import defaultdict, deque
from typing import List, DefaultDict, Deque, Dict
AdjacencyList = DefaultDict[str, List[str]]
Levels = List[List[Dict]]
from pandas import DataFrame

### Info manager

In [7]:
class InfoManager:
	def __init__(self, file_path: str=None,df: DataFrame = None) -> None:
		if file_path:
			self.inf: DataFrame = pd.read_csv(file_path)
		elif df is not None:
			self.inf = df

	def get_info_by_uuid(self, id: str) -> List[dict]:
		f = self.inf[self.inf['uuid'] == id]
		return f.to_dict('records')[0]

	def get_info_by_uuidIds(self, ids: List[str]) -> List[dict]:
		f = self.inf[self.inf['uuid'].isin(ids)]
		return f.to_dict('records')

	def get_info_by_key(self, key: str, value: str) -> List[dict]:
		f = self.inf[self.inf[key] == value]
		return f.to_dict('records')

	def get_info_keys_by_uuid(self, id: str, keys: List[str]) -> str:
		f = self.inf[self.inf['uuid'] == id]
		return f.iloc[0][keys].to_dict()

	def get_refs_by_uuid(self, uuid: str) -> List[str]:
		f = self.inf[self.inf['parent_id'] == uuid]
		return f.to_dict('records')

	def get_parent_info_by_child_id(self, child_id: str) -> List[dict]:
		f = self.inf[self.inf['uuid'] == child_id]
		parent_id = f.iloc[0]['parent_id']
		p = self.inf[self.inf['uuid'] == parent_id]
		return p.to_dict('records')[0]
		

In [10]:
im = InfoManager(file_path='data/info_ss/info_full_ss.csv')

In [8]:
im.get_info_key_by_uuid("8bde6cb8-296d-4593-883f-b768103e1e69",['title','uuid','paper_id','citation_count'])

{'title': 'PositionRank: An Unsupervised Approach to Keyphrase Extraction from Scholarly Documents',
 'uuid': '8bde6cb8-296d-4593-883f-b768103e1e69',
 'paper_id': 'b0d555a9ea67285fccd2ef8d887907bcc811f67a',
 'citation_count': 176}

In [None]:
refs= im.get_refs_by_parent_id('8bde6cb8-296d-4593-883f-b768103e1e69')
len(refs)

40

In [None]:
im.get_level_no_by_id(l,'8bde6cb8-296d-4593-883f-b768103e1e69')

0

### Graph

In [8]:
class Graph:
	def __init__(self) -> None:
		self.graph: AdjacencyList = defaultdict(list)
		self.levelOrderList: Levels = []
		self.levelOrderIdsList: Levels = []
		self.inf: DataFrame|None = None
		

	@classmethod
	def init_from_csv(cls, csv_path: str) -> Graph:
		g = cls()
		# gpd = pd.read_csv(csv_path)
		g.inf = pd.read_csv(csv_path, usecols=['paper_id', 'uuid', 'parent_id'])
		
		# only not null values
		# gpd = gpd[gpd['paper_id'].notnull()]
		print(g.inf.shape)
		for _, row in g.inf.iterrows():
			g.addEdge(row['parent_id'], row['uuid'])
		return g

	def addEdge(self, parent_id: str, id: str) -> None:
		self.graph[parent_id].append(id)

	def levelOrderFull(self, root: str, im: InfoManager) -> List[List[Dict]]:
		self.levelOrderList = []
		q: Deque = deque()
		q.append(root)
		while q:
			currentLevel: List[Dict] = []
			currentQLength = len(q)
			for _ in range(currentQLength):
				currentNode = q.popleft()
				# currentLevel.append(currentNode)
				p = im.get_info_key_by_uuid(currentNode, ['paper_id', 'uuid', 'parent_id'])
				currentLevel.append(p)
				for children in self.graph[currentNode]:
					q.append(children)

			self.levelOrderList.append(currentLevel)
		
		return self.levelOrderList
	
	def levelOrderIdsOnly(self, root: str) -> List[List[Dict]]:
		self.levelOrderIdsList = []
		q: Deque = deque()
		q.append(root)
		while q:
			currentLevel: List[Dict] = []
			currentQLength = len(q)
			for _ in range(currentQLength):
				currentNode = q.popleft()
				currentLevel.append(currentNode)
				for children in self.graph[currentNode]:
					q.append(children)

			self.levelOrderIdsList.append(currentLevel)

		return self.levelOrderIdsList
	
	def get_level_no_by_id(self, levels: List[List[Dict]], id: int) -> int:
		for i, l in enumerate(levels):
			for d in l:
				if d['uuid'] == id:
					return i
		return -1 
	
	def alreadyProcessed(self, id: str, levels: List[List[str]]) -> bool:
		print(len(levels))
		for i in range(len(levels) - 1):
			print(i)
			# if id in levels[i]:
			# return True
	


In [9]:
g = Graph.init_from_csv('data/info_ss/info_full_ss.csv')

(24750, 3)


In [64]:
[(k,len(v)) for k,v in g.graph.items() ][:5]

[(nan, 1),
 ('8bde6cb8-296d-4593-883f-b768103e1e69', 39),
 ('ade6eb61-946a-49f3-835d-74804cb9de3e', 20),
 ('be2af112-6c57-4598-9813-d02fff71055b', 59),
 ('899a51ab-0231-4baf-bd9e-a1b15c743623', 23)]

In [10]:
im = InfoManager(df=g.inf)
l_full = g.levelOrderFull(root="8bde6cb8-296d-4593-883f-b768103e1e69", im=im)
print(f"total levels: {len(l_full)}")
for i, level in enumerate(l_full):
	print(f"level {i}: {len(level)}")

total levels: 5
level 0: 1
level 1: 39
level 2: 1056
level 3: 24388
level 4: 61


In [44]:
l_full[0]


[{'paper_id': 'b0d555a9ea67285fccd2ef8d887907bcc811f67a',
  'uuid': '8bde6cb8-296d-4593-883f-b768103e1e69',
  'parent_id': nan}]

In [65]:
l = g.levelOrderIdsOnly(root="8bde6cb8-296d-4593-883f-b768103e1e69")
print(f"total levels: {len(l)}")
for i, level in enumerate(l):
	print(f"level {i}: {len(level)}")


total levels: 5
level 0: 1
level 1: 39
level 2: 1037
level 3: 23249
level 4: 61


In [66]:
l[0]

['8bde6cb8-296d-4593-883f-b768103e1e69']

### find wrongly referenced papers

In [10]:
l = g.levelOrderIdsOnly(root="8bde6cb8-296d-4593-883f-b768103e1e69")
print(f"total levels: {len(l)}")
for i, level in enumerate(l):
	print(f"level {i}: {len(level)}")

total levels: 5
level 0: 1
level 1: 39
level 2: 1037
level 3: 23249
level 4: 61


In [11]:
all_papers_upto_l3 = [p for p in l[0] + l[1] + l[2]]
all_papers_upto_l3[:3]


['8bde6cb8-296d-4593-883f-b768103e1e69',
 'ade6eb61-946a-49f3-835d-74804cb9de3e',
 'be2af112-6c57-4598-9813-d02fff71055b']

In [12]:
im = InfoManager("data/info_ss/info_full_ss.csv")


In [13]:
paper = im.get_info_key_by_uuid(l[0][0],
['title','uuid','paper_id','published_date','parent_id'])
print(type(paper['published_date']))

<class 'float'>


In [14]:
p = im.get_refs_by_uuid("8bde6cb8-296d-4593-883f-b768103e1e69")

In [15]:
count = []
for i, level in enumerate(l):
	# print(f"level {i}: {len(level)}")
	if i == 3:
		break
	for parent_uuid in level:
		root = im.get_info_keys_by_uuid(parent_uuid,
									['title', 'uuid', 'paper_id', 'published_date'])
		published_date = root['published_date']
		paper_id = root['paper_id']
		title = root['title']
		
		refs = im.get_refs_by_uuid(parent_uuid)
		for ref in refs:
			child_published_date = ref['published_date']
			child_uuid = ref['uuid']
			child_title = ref['title']
			child_paper_id = ref['paper_id']
			if child_published_date > published_date:
				count.append((i, paper_id, title, child_paper_id, child_title, child_published_date))


In [16]:
count

[]

In [93]:
# 
for i, level in enumerate(l):
	# print(f"level {i}: {len(level)}")
	if i == 3:
		break
	for parent_uuid in level:
		root = im.get_info_keys_by_uuid(parent_uuid,
									['title', 'uuid', 'paper_id', 'published_date'])
		published_date = root['published_date']
		paper_id = root['paper_id']
		title = root['title']
	
		refs = im.get_refs_by_uuid(parent_uuid)
		for ref in refs:
			child_published_date = ref['published_date']
			child_uuid = ref['uuid']
			child_title = ref['title']
			child_paper_id = ref['paper_id']
			if child_published_date > published_date:
				info = {
					'parent_uuid': parent_uuid,
					'parent_published_date': published_date,
					'parent_paper_id': paper_id,
					'parent_title': title,
					'child_uuid': child_uuid,
					'child_published_date': child_published_date,
					'child_paper_id': child_paper_id,
					'child_title': child_title
				}
				df = pd.DataFrame([info])
				df.to_csv('data/info_ss/wrong_refs.csv', mode='a', header=False, index=False)


9d7c00f6-d0e0-4a53-aeb4-66b1b63a9e56 2008.0 f7fe3f870ef5e1a74600c8808c07732cd2e5142d Graph-Based Keyword Extraction for Single-Document Summarization 85de4ba2-2a61-449b-bd95-7a5f5fad2088 2009.0 29626c9a0fdd78e16d5e6e34ab159edbefb75e15 Vector-Space Model


In [40]:
wg = pd.read_csv('data/info_ss/wrong_refs.csv')
wg.shape

(795, 8)

In [41]:
uwrg = wg['child_uuid'].unique()
uwrg = uwrg.tolist()
print(len(uwrg))
uwrg[:3]

795


['278d18a7-c54b-4e1a-862b-9604abd75412',
 '6821aac5-ce59-4064-8a0a-d939892b306b',
 '92310b63-d41b-4ba2-adca-6fb49d4c0a7b']

In [53]:
all_paper = []
for wrg in uwrg:
	l = g.levelOrderIdsOnly(root=wrg)
	# print(f"total levels: {len(l)}")
	for i, level in enumerate(l):
		# print(f"level {i}: {len(level)}")
		all_paper.extend(level)


In [54]:
len(all_paper)


1167

In [60]:
inf=inf[~inf['uuid'].isin(uwrg)]
inf.shape


(24750, 18)

In [25]:
import pandas as pd
inf = pd.read_csv('data/info_ss/info_full_ss_new.csv')
inf.shape


(24750, 17)

In [32]:
def get_parent_papers_paperid(row):
	try:
		if pd.isnull(row["parent_id"]):
			return None

		parent = im.get_info_keys_by_uuid(row["parent_id"], [
                    'paper_id'])
		parents_paperid = parent['paper_id']
		return parents_paperid
	except Exception as e:
		# print(row)
		pass


In [33]:
inf['parent_paper_id'] = inf.apply(get_parent_papers_paperid, axis=1)


In [35]:
inf[inf['parent_paper_id'].isnull()].iloc[1:].shape


(363, 18)

In [36]:
inf = inf.iloc[1:]
inf[~inf['parent_paper_id'].isnull()].shape


(24386, 18)

In [38]:
inf = inf[~inf['parent_paper_id'].isnull()]
inf.shape

(24386, 18)

In [39]:
inf.to_csv('data/info_ss/info_full_ss_new.csv', index=False)

### analyze graph

In [62]:
inf.to_csv("data/info_ss/info_full_ss.csv", index=False)

In [25]:
all_papers_upto_l3 = [p for p in l[0] + l[1] +l[2] ]
all_papers_upto_l3[:3]

['8bde6cb8-296d-4593-883f-b768103e1e69',
 'ade6eb61-946a-49f3-835d-74804cb9de3e',
 'be2af112-6c57-4598-9813-d02fff71055b']

In [58]:
inf = pd.read_csv('data/info_ss/info_full_ss.csv')
inf.shape

(25545, 18)

In [27]:
inf[inf['uuid'].isin(all_papers_upto_l3)].shape

(1096, 18)

In [28]:
inf[inf['uuid'].isin(all_papers_upto_l3)]['reference_count'].sum()

33837

In [40]:
wt = pd.read_csv("data/info_ss/waiting_ss.csv")
wt.shape

(8431, 4)

In [84]:
inf.shape

(25545, 18)

## Cross - Check

In [3]:
inf = pd.read_csv("data/info_ss/info_full_ss.csv")
inf.shape

(24750, 17)

In [5]:
inf.duplicated(subset=['uuid']).sum()

1

In [6]:
infUuids = inf['uuid'].to_list()
print(len(infUuids))
print(infUuids[:2])

26367
['8bde6cb8-296d-4593-883f-b768103e1e69', 'ade6eb61-946a-49f3-835d-74804cb9de3e']


In [16]:
wt = pd.read_csv("data/info_ss/waiting_ss.csv")
wt.shape

(8431, 4)

In [17]:
wt.duplicated(subset=['uuid']).sum()

0

In [15]:
wt[wt['uuid'].isin(infUuids)].shape

(0, 4)

In [12]:
wt = wt[~wt['uuid'].isin(infUuids)]
wt.shape


(8431, 4)

In [13]:
wt.to_csv("data/info_ss/waiting_ss.csv", index=False)

In [69]:
l[3][:2]

['f6e588ee-31ea-4510-a3bf-cba617cfd9ba',
 'aaa698a9-3da4-4048-8d5d-180cb0afbe2d']

In [8]:
forL2 = wt[wt['parent_id'].isin(l[2])]
forL2.shape

(19074, 4)

In [90]:
forL2[forL2['paper_id'].notnull()].shape

(15419, 4)

In [85]:
wt[wt['parent_id'].isin(l[0] + l[1] + l[2])].shape

(23840, 4)

In [75]:
wt = wt[wt['parent_id'].isin(l[0] + l[1] + l[2] + l[3][:5])]
wt.shape

(23910, 4)

In [76]:
wt.to_csv("data/info_ss/waiting_ss.csv", index=False)

## Info Recover

In [16]:
paper = pd.read_csv("data/info_recover/not_found.csv")
paper.shape

(183, 6)

In [15]:
paper[paper['download_link']=='No links found'].shape

(183, 6)

In [21]:
paper[(paper['download_link']!='No links found;Processed') | (paper['download_link']=='No links found')].shape

(35, 6)

In [10]:
paper.columns

Index(['uuid', 'paper_id', 'title', 'paper_link', 'doi', 'download_link'], dtype='object')

In [11]:
paper.to_dict('records')[:2]

[{'uuid': '3fd01299-134c-43f0-83c7-24427c18bb5c',
  'paper_id': 'ee800bab67e177e4676578f621db59a0faa736cd',
  'title': 'Human-competitive automatic topic indexing',
  'paper_link': 'https://www.semanticscholar.org/paper/ee800bab67e177e4676578f621db59a0faa736cd',
  'doi': nan,
  'download_link': 'No links found'},
 {'uuid': '2e260db8-e653-4d24-8c45-780d139097c7',
  'paper_id': '7b04a11fecf21832194cc2f1b03bebb69c399443',
  'title': 'Crawling the web : discovery and maintenance of large-scale web data',
  'paper_link': 'https://www.semanticscholar.org/paper/7b04a11fecf21832194cc2f1b03bebb69c399443',
  'doi': nan,
  'download_link': 'No links found'}]

In [12]:
# replace
paper.loc[paper['uuid'] ==
          '3fd01299-134c-43f0-83c7-24427c18bb5c', "download_link"] = "New"
paper.head(n=2)

Unnamed: 0,uuid,paper_id,title,paper_link,doi,download_link
0,3fd01299-134c-43f0-83c7-24427c18bb5c,ee800bab67e177e4676578f621db59a0faa736cd,Human-competitive automatic topic indexing,https://www.semanticscholar.org/paper/ee800bab...,,New
1,2e260db8-e653-4d24-8c45-780d139097c7,7b04a11fecf21832194cc2f1b03bebb69c399443,Crawling the web : discovery and maintenance o...,https://www.semanticscholar.org/paper/7b04a11f...,,No links found


## Misc.

In [6]:
inf = pd.read_csv("data/info_ss/info_full_ss.csv")
inf.shape, inf.columns

((24750, 18),
 Index(['uuid', 'paper_id', 'parent_id', 'title', 'reference_count',
        'citation_count', 'influential_citation_count', 'published_date',
        'paper_type', 'venue', 'isOpenAccess', 'abstract', 'paper_link', 'doi',
        'download_link', 'tldl', 'embedding', 'downloaded'],
       dtype='object'))

In [7]:
inf.drop(columns=['downloaded'], inplace=True)
inf.columns, inf.shape


(Index(['uuid', 'paper_id', 'parent_id', 'title', 'reference_count',
        'citation_count', 'influential_citation_count', 'published_date',
        'paper_type', 'venue', 'isOpenAccess', 'abstract', 'paper_link', 'doi',
        'download_link', 'tldl', 'embedding'],
       dtype='object'),
 (24750, 17))

In [8]:
inf.to_csv("data/info_ss/info_full_ss.csv", index=False)

In [9]:
inf = pd.read_csv("data/info_ss/info_full_ss.csv")
inf.shape, inf.columns


((24750, 17),
 Index(['uuid', 'paper_id', 'parent_id', 'title', 'reference_count',
        'citation_count', 'influential_citation_count', 'published_date',
        'paper_type', 'venue', 'isOpenAccess', 'abstract', 'paper_link', 'doi',
        'download_link', 'tldl', 'embedding'],
       dtype='object'))