## Import modules

In [1]:
import requests as r
from bs4 import BeautifulSoup as bs
import numpy as np, pandas as pd
import warnings 
import re
import tqdm
import numpy as np

warnings.filterwarnings('ignore')

url_up = 'https://repositorio.up.edu.pe/'
url = 'https://repositorio.up.edu.pe/handle/11354/6'

In [2]:
def get_html(url, element = 'div', css = 'artifact-description'):
	response = r.get(url, verify=False)
	html = bs(response.content, 'html.parser')
	s_items = html.find_all(element, class_ = css)

	return s_items
items = get_html(url)

### Get all subitems

`get_subitems` retrieves all the reference links. For example, in theses, it obtains the links for "bachelor's," "literature," and "master's." This process is repeated until there are no more sub-items within the links.

In [3]:
def get_subitems(s_items, master_dir="UP"):
	links = []
	num_thesis = []
	name_links = []
	link_df = pd.DataFrame()

	for item in tqdm.tqdm(s_items):

		master = item.find('h4')

		sublinks = url_up + master.find('a')['href']
		subitem = master.find('span').string

		text_without_spaces = re.sub('[\s\t]+', '', subitem)
		link_ref = len(text_without_spaces)

		if link_ref < 2:
			continue

		sub_text = master.getText()
		num_tesis = re.search(r'\[(\d+)\]', sub_text).group(1)
		num_tesis = int(num_tesis)

		links.append(sublinks)
		name_links.append(subitem)
		num_thesis.append(num_tesis)
	link_df = link_df.assign(
		dir_name=name_links, num_thesis=num_thesis, links=links,
		master_dir = master_dir
		)
	return link_df

main_urls_df=get_subitems(items)
main_urls_df


100%|██████████| 23/23 [00:00<00:00, 22811.30it/s]


Unnamed: 0,dir_name,num_thesis,links,master_dir
0,Bachillerato,2,https://repositorio.up.edu.pe//handle/11354/2921,UP
1,Licenciatura,272,https://repositorio.up.edu.pe//handle/11354/20,UP
2,Maestría,1637,https://repositorio.up.edu.pe//handle/11354/9,UP



Obtaining all sub-items (folders) to iterate through the metadata of each thesis. Sub-items are filtered to those with less than 21 theses, as they require a different scraping approach compared to those with more than 20 theses.


In [None]:
main_urls = main_urls_df.links.to_numpy()
main_rep = main_urls_df.dir_name.to_numpy()
sub_cat_df = pd.DataFrame()
for i, main_url in tqdm.tqdm(enumerate(main_urls)) :
	bachiller_sub_urls_df = get_subitems(get_html(main_url), main_rep[i]) 

	sub_cat_df = pd.concat((sub_cat_df, bachiller_sub_urls_df))
# sub_cat_df

### Get metadata

The `get_thesis_info` function refers to the [code](https://github.com/d2cml-ai/llm4tesis/blob/main/code/scraper.py) to extract the metadata table.


In [5]:
# less than 20
sub_cat_20 = sub_cat_df.query('num_thesis<21')

In [50]:
sub_urls = sub_cat_20.links.to_numpy()
sub_dir = sub_cat_20.dir_name.to_numpy()
master_dir = sub_cat_20.master_dir.to_numpy()

This section of the code is responsible for obtaining all the links that reference a PDF within the metadata page, as there are multiple ways to locate them.

```python
pdf_links = meta_url.find_all('div', class_ = 'file-link')
pdf_ = ['none', 'none', 'none', 'none']
for i, pdf_link in enumerate(pdf_links):
	try:
		ref_a = pdf_link.find('a')
		try:
			ref = ref_a['href']
			pdf_[i] = (url_up + ref)
		except:
			pass
	except:
		pass
pdf_download = {
	'url_thesis': [url],
	'pdf_view': [pdf_[0]],
	'pdf_view1': [pdf_[1]],
	'pdf_download': [pdf_[2]],
	'pdf_download1': [pdf_[3]],
	'master_dir': [master_dir],
	'sub_dir': [sub_dir]
}

pdf_df_link = pd.DataFrame(pdf_download)
df1 = pd.concat((df, pdf_df_link), axis=1)
```

- The `get_thesis_metadata` function consolidates all the information into a single dataframe for each sub-item.


In [184]:
def get_thesis_info(url, master_dir = 'dir', sub_dir = 'sub_dir'):
	response=r.get(url, verify=False)
	meta_url = bs(response.content, 'html.parser')
	table = str(meta_url.find("table"))
	df = pd.read_html(str(table))[0][[0, 1]]
	df = df.groupby(0, as_index=False).agg("\n".join)
	df = df.set_index(0).T.reset_index().drop(columns=["index"])

	pdf_links = meta_url.find_all('div', class_ = 'file-link')
	pdf_ = ['none', 'none', 'none', 'none']
	for i, pdf_link in enumerate(pdf_links):
		try:
			ref_a = pdf_link.find('a')
			try:
				ref = ref_a['href']
				pdf_[i] = (url_up + ref)
			except:
				pass
		except:
			pass
	pdf_download = {
		'url_thesis': [url],
		'pdf_view': [pdf_[0]],
		'pdf_view1': [pdf_[1]],
		'pdf_download': [pdf_[2]],
		'pdf_download1': [pdf_[3]],
		'master_dir': [master_dir],
		'sub_dir': [sub_dir]
	}

	pdf_df_link = pd.DataFrame(pdf_download)
	df1 = pd.concat((df, pdf_df_link), axis=1)
	return df1

def get_thesis_metadata(sub_url, master_dir, sub_dir):
	thesis_urls=get_html(sub_url, 'h4', 'artifact-title')
	thesis_df_info=pd.DataFrame()
	link = []
	for i, thesis in enumerate(thesis_urls):
		href_thesis = thesis.find('a')['href']
		link_thesis = url_up + href_thesis + '?show=full'
		link.append(link_thesis)
		thesis_df = get_thesis_info(link_thesis, master_dir, sub_dir)
		thesis_df_info = pd.concat((thesis_df_info, thesis_df))
	return thesis_df_info
# a, b = get_thesis_url(sub_urls[0])

In [73]:
less_20 = pd.DataFrame()
for i, _url in tqdm.tqdm(enumerate(sub_urls)):
	df = get_thesis_metadata(_url, master_dir[i], sub_dir[i])
	less_20 = pd.concat((less_20, df))
less_20.to_csv('less_20.csv', index=False)

10it [00:31,  3.20s/it]


For sub-items with more than 20 theses, an additional loop is added to iterate through all the theses within a predefined range, typically set at increments of 20 by default.

In [142]:
# more than 20
sub_cat = sub_cat_df.query('num_thesis>=21')
sub_urls = sub_cat.links.to_numpy()
sub_dir = sub_cat.dir_name.to_numpy()
master_dir = sub_cat.master_dir.to_numpy()
num_thesis = sub_cat.num_thesis.to_numpy()

In [198]:
all_df = pd.DataFrame()
for i, url in tqdm.tqdm(enumerate(sub_urls)) :
	last_range = int(np.ceil(num_thesis[i] / 20))
	for page in range(last_range):
		page_url = url + f"/recent-submissions?offset={page * 20}"
		df = pd.DataFrame()
		try:
			df=get_thesis_metadata(page_url, master_dir[i], sub_dir[i])
			all_df = pd.concat((all_df, df))
		except:
			pass

# all_df

13it [52:07, 240.55s/it]


In [205]:
all_df.to_csv("more20.csv")

Download pdfs?

In [13]:
response = r.get(c)
with open(n, 'wb') as file:
	file.write(response.content)