# Dataset Builder 
To build the dataset, PDF articles were collected from the following links:
- [30 most important articles recommended by Ilya Sutskever](https://arc.net/folder/D0472A20-9C20-4D3F-B145-D2865C0A9FEE)
- [Habr article with other possible recommendation articles](https://habr.com/ru/companies/ruvds/articles/721150/)
- [Deep Learning Papers Reading Roadmap (raw README file)](https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap)

## Web scraping of articles

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
load_dotenv('../.envrc')

True

In [2]:
def find_pdf_on_page(page_link):
    """Return the first PDF-looking link found in the HTML of ``page_link``.

    The page is downloaded and its text is searched with the regex
    ``http.*pdf``.

    Args:
        page_link: URL of the web page to scan.

    Returns:
        The first regex match, or the string ``'no link found'`` when the
        request fails or nothing matches.
    """
    try:
        # A timeout keeps a single unresponsive host from hanging the scrape.
        page = requests.get(page_link, timeout=30)
    except requests.RequestException:
        # Best-effort lookup: a network-level failure counts as "no link".
        # Only request errors are caught, so Ctrl-C still interrupts the cell.
        return 'no link found'

    # NOTE: the pattern is greedy, so a match runs from the first "http" to
    # the last "pdf" on the same line. It is kept as is because a later cell
    # special-cases the truncated value "http://arxiv.org/pdf".
    link = re.findall(r'http.*pdf', page.text)
    return link[0] if link else 'no link found'
    

In [3]:
# Scrape the arc.net folder that the introduction lists as the articles
# recommended by Ilya Sutskever.
# Class of the anchors counted as "pdf + websites" below; presumably generated
# by the site, so it may change - TODO confirm if the count drops to zero.
ARC_LINK_CLASS = 'PJLV-iieNCbK-css'

page = requests.get('https://arc.net/folder/D0472A20-9C20-4D3F-B145-D2865C0A9FEE', timeout=30)
print('Status code:', page.status_code)

soup = BeautifulSoup(page.text, "html.parser")
# href=True skips anchors without an href attribute; indexing those with
# link['href'] would raise KeyError.
links = soup.find_all('a', href=True)

pdf_list_arc = [link['href'] for link in links if '.pdf' in link['href']]  # direct PDF links only
# .get('class', []) tolerates anchors that carry no class attribute.
all_links_arc = [link['href'] for link in links if ARC_LINK_CLASS in link.get('class', [])]  # pdf + websites
# Only the direct PDF links go into the dataset, so 'orig' and 'pdf' coincide.
arc_links_info = [{'orig': link, 'pdf': link} for link in pdf_list_arc]

print('Number of pdfs:', len(pdf_list_arc))
print('Overall number:', len(all_links_arc))

Status code: 200
Number of pdfs: 22
Overall number: 27


In [4]:
page = requests.get('https://habr.com/ru/companies/ruvds/articles/721150/', timeout=30)
print('Status code:', page.status_code)

soup = BeautifulSoup(page.text, "html.parser")
links = soup.find_all('li')

# Collect every hyperlink that sits inside a list item. href=True skips
# anchors without an href attribute, which would otherwise raise KeyError.
all_links_habr = [
    anchor['href']
    for item in links
    for anchor in item.find_all('a', href=True)
]
# Keep absolute links only, and drop anything that mentions habr itself.
all_links_habr = [link for link in all_links_habr if 'habr' not in link and link.startswith('http')]

# One record per link: the original URL and the PDF URL found for it.
habr_links_info = []
for link in all_links_habr:
    habr_links_info.append({
        'orig': link,
        # A link that already mentions "pdf" is used as is; otherwise the
        # linked page is searched for a PDF link.
        'pdf': link if 'pdf' in link else find_pdf_on_page(link),
    })

Status code: 200


In [8]:
page_link = 'https://raw.githubusercontent.com/floodsung/Deep-Learning-Papers-Reading-Roadmap/master/README.md'

page = requests.get(page_link, timeout=30)
# Extract each URL on its own, stopping at whitespace, brackets and quotes,
# then keep the ones that mention "pdf". The previous greedy pattern
# 'http.*pdf' ended every match at the last "pdf" on the line, which cut
# links such as http://arxiv.org/pdf/<id> down to http://arxiv.org/pdf and
# could glue several URLs of one line together.
url_candidates = re.findall(r'https?://[^\s()\[\]<>"\']+', page.text)
github_links = [url for url in url_candidates if 'pdf' in url]
github_links_info = [{'orig': link, 'pdf': link} for link in github_links]

In [9]:
# Merge the link records of the three sources (Habr, GitHub, arc.net) into one list.
all_links = [*habr_links_info, *github_links_info, *arc_links_info]

In [10]:
# For some arXiv entries the PDF search yields only the bare prefix
# ".../arxiv.org/pdf" instead of a full link. In that case rebuild the PDF URL
# from the original (abstract) URL.
ARXIV_PDF_STUBS = {"http://arxiv.org/pdf", "https://arxiv.org/pdf"}
for val in all_links:
    if val['pdf'] in ARXIV_PDF_STUBS:
        # Swap only the "/abs/" path segment; a plain replace('abs', 'pdf')
        # would also rewrite any other "abs" substring in the URL.
        val['pdf'] = val['orig'].replace('/abs/', '/pdf/', 1)

In [11]:
# Keep only the records whose PDF link answers with HTTP 200.
# NOTE: a 200 status does not prove the body is a PDF - the loader log further
# down reports invalid PDF headers for some of the kept links.
links = []
for i in all_links:
    try:
        # stream=True defers the body download: only the status code is needed
        # here. The with-block releases the connection afterwards.
        with requests.get(i['pdf'], stream=True, timeout=30) as response:
            if response.status_code == 200:
                links.append(i)
    except requests.RequestException:
        # Unreachable or malformed link (e.g. 'no link found'): drop the record.
        continue

In [12]:
# Persist the validated link records ('orig' and 'pdf' columns).
df = pd.DataFrame(links)
# index=False: the default would also write the row index as an unnamed
# column, which comes back as "Unnamed: 0" when the file is read again.
df.to_csv('db_links.csv', index=False)

## Dataset building
I used the PyPDF reader (`PyPDFLoader`) to create the dataset and stored it as a CSV file to make the evaluation work easier.

In [4]:
# Reload the link table saved by the scraping section.
df = pd.read_csv(filepath_or_buffer='db_links.csv')

In [5]:
# Load every PDF listed in the link table and gather the resulting documents.
pages = []
failed_pdfs = []  # (url, error) pairs for links that could not be loaded
for pdf_url in df['pdf']:
    try:
        loader = PyPDFLoader(pdf_url)
        pages.extend(loader.load_and_split())
    except Exception as exc:
        # Best-effort: one broken file must not stop the build, but record it
        # instead of dropping it silently. Catching Exception (not a bare
        # except) keeps Ctrl-C working during this long-running loop.
        failed_pdfs.append((pdf_url, repr(exc)))

print(f'Loaded {len(pages)} documents; skipped {len(failed_pdfs)} of {len(df)} links')

invalid pdf header: b'\x89PNG\r'
EOF marker not found
EOF marker not found
EOF marker not found
Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 34 65536 (offset 0)
Ignoring wrong pointing object 92 65536 (offset 0)
Ignoring wrong pointing object 145 65536 (offset 0)
Ignoring wrong pointing object 206 65536 (offset 0)
Ignoring wrong pointing object 274 65536 (offset 0)
Ignoring wrong pointing object 330 65536 (offset 0)
Ignoring wrong pointing object 372 65536 (offset 0)
invalid pdf header: b'<!DOC'
EOF marker not found


In [6]:
# Chunking parameters. Lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 2500
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [22]:
# Re-split the text of every loaded document and flatten the result into one
# record per chunk, keeping the source and page number it came from.
docs = [
    {
        'source': doc.metadata['source'],
        'page': doc.metadata['page'],
        'text': chunk,
    }
    for doc in pages
    for chunk in text_splitter.split_text(doc.page_content)
]

In [26]:
# Build the final table: one row per text chunk, with a running integer id.
data = pd.DataFrame(docs)
data['id'] = data.index
data.to_csv('article_info.csv')
# Preview goes last: a notebook cell displays only its final expression, so a
# head() call in the middle of the cell shows nothing.
data.head()