# Web Scraping of RI UFRN

## 1. Importing the required libraries

In [None]:
# Importing the required libraries.
import scrapy, csv, re, pandas as pd, numpy as np
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from scrapy.crawler import CrawlerProcess

## 2. Defining the Spider class

In [None]:
def clean_text(text):
    """Normalize scraped text to a compact single-line string.

    The byte-order mark (U+FEFF) is removed first, then every run of
    whitespace is collapsed to one space, the ends are stripped, and the
    space following a hyphen is dropped ("- " becomes "-").

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Drop the BOM before normalizing whitespace: the whitespace pattern does
    # not match it, so removing it afterwards could leave stray or doubled
    # spaces behind.
    text = text.replace("\ufeff", "")
    text = re.sub(r"\s+", " ", text).strip()
    return text.replace("- ", "-")

In [None]:
# Module-level list that receives one record (dict) per scraped document page.
data = []

In [None]:
# Setting the URL.
# This page is the first one requested by the spider, and it is also the base
# against which the document links are resolved into absolute URLs.
url_base = "https://repositorio.ufrn.br/handle/123456789/11949"

In [None]:
# Definition of Spider class.
class Spider_RI_UFRN(scrapy.Spider):
    """Spider that collects thesis metadata starting from ``url_base``.

    The crawl starts at ``url_base``, follows every category link found there,
    walks the paginated document listing of each category and finally visits
    each document page, appending one record (a dict) per page to the
    module-level ``data`` list.
    """

    name = "scraper_ri_ufrn"

    @staticmethod
    def _last_match(soup, css):
        """Return the last element matching *css* in *soup*, or None if none match."""
        matches = soup.select(css)
        return matches[-1] if matches else None

    # Start point to run the spider.
    def start_requests(self):
        """Yield the initial request for the page that lists the categories."""
        yield scrapy.Request(url=url_base, callback=self.parse_categories)

    def parse_categories(self, response):
        """Follow each category link, passing the category name along in ``meta``."""
        # Getting the relative URLs of the categories (PhD and MSc' Thesis).
        css = "#content > div:nth-child(3) > div > div.col-md-9 > div.container.row > div > div.list-group"
        soup = BeautifulSoup(response.text, "html.parser")
        group = self._last_match(soup, css)
        if group is None:
            # Page layout differs from what the selector expects: nothing to follow.
            self.logger.warning("No category list found in %s", response.url)
            return

        # Only anchors carrying an href can be followed; get_text() also copes
        # with anchors whose label is wrapped in nested tags (where .string
        # would be None).
        for anchor in group.select("a[href]"):
            yield response.follow(url=anchor["href"], callback=self.parse_thesis_links,
                                  meta={"category": anchor.get_text().strip()})

    def parse_thesis_links(self, response):
        """Collect document links from a listing page and follow the pagination.

        Links are accumulated in ``meta["links"]`` across the pages of one
        category. Once a page offers no "next" link, every collected document
        URL is requested with ``parse_data`` as callback.
        """
        category = response.meta.get("category")
        links = response.meta.get("links", {})
        collected = links.setdefault(category, [])

        # Getting the relative URLs of the documents.
        css = "#content > div:nth-child(3) > div > div.col-md-9 > table"
        soup = BeautifulSoup(response.text, "html.parser")
        table = self._last_match(soup, css)
        if table is None:
            self.logger.warning("No document table found in %s", response.url)
        else:
            collected.extend(a["href"] for a in table.find_all("a", href=True))

        # Navigating among the next pages.
        css = "#content > div:nth-child(3) > div > div.col-md-9 > div:nth-child(7)"
        pager = self._last_match(soup, css)
        # A missing pagination block is treated as "last page" so that the
        # links gathered so far are still visited instead of being lost.
        next_links = [] if pager is None else pager.find_all(
            "a", href=True, string=re.compile(r"(next|próximo)", flags=re.IGNORECASE))
        if next_links:
            yield response.follow(url=next_links[0]["href"], callback=self.parse_thesis_links,
                                  meta={"category": category, "links": links})
        else:
            for url in collected:
                yield response.follow(url=url, callback=self.parse_data,
                                      meta={"category": category})

    def parse_data(self, response):
        """Extract the metadata of one document page and append it to ``data``.

        Each row of the metadata table becomes a ``label -> value`` entry of
        the record; the absolute URL of the document file is stored under
        ``"document_url"``. Extraction is best-effort: on failure the partial
        record is still appended and an error line is printed.
        """
        category = response.meta.get("category")

        # Getting the HTML of page.
        soup = BeautifulSoup(response.text, "html.parser")
        table = self._last_match(soup, "table.table.itemDisplayTable")
        if table is None:
            self.logger.warning("No metadata table found in %s", response.url)
            return

        # Extracting the data.
        record = {"category": category}
        # Bound up front so the error message below can always be formatted,
        # even when the failure happens before the first row is processed.
        label = None
        try:
            # The last row of the table is deliberately left out.
            for row in table.select("table > tr")[:-1]:
                label_cell = row.select_one("tr > td.metadataFieldLabel")
                if label_cell is None:
                    # A row without a label cell carries no named field; skip
                    # it rather than aborting the whole record.
                    continue
                label = label_cell.text.strip().lower().replace(":", "")
                value_cell = row.select_one("tr > td.metadataFieldValue")
                record[label] = (
                    value_cell.text.strip().replace("Resumo", "")
                    if value_cell is not None else None
                )
            # From here on the feature being extracted is the document URL, so
            # report that name if the lookup below fails.
            label = "document_url"
            css = "div.panel.panel-info > table > tr:nth-child(2) > td:first-child > a"
            record["document_url"] = urljoin(url_base, soup.select_one(css)["href"])
        except Exception:
            # Deliberately broad: a malformed page must not discard the fields
            # that were already extracted for this record.
            print(f"ERROR-DEBUG: error in extract the feature {label} in {response.url}")
        data.append(record)

## 3. Executing the Spider

In [None]:
# Execution Process to run the spider.
# The spider callbacks append the scraped records to the module-level `data`
# list, which the following cells turn into a dataframe.
process = CrawlerProcess()
process.crawl(Spider_RI_UFRN)
process.start()

In [None]:
# Checking the data: number of records collected by the spider.
len(data)

## 4. Preprocessing the data

In [None]:
# Creating the dataframe from the list of scraped records (one dict per document).
df = pd.DataFrame(data)

In [None]:
# Printing a summary of the dataframe.
df.info()

In [None]:
# Missing values are stored as None instead of NaN from here on.
missing_to_none = {np.nan: None}
df.replace(to_replace=missing_to_none, inplace=True)

In [None]:
# Mapping from the scraped field labels to the final column names.
column_aliases = {
    "keywords": "auth_keywords",
    "issue date": "defense_date",
    "portuguese abstract": "pt_abstract",
    "abstract": "en_abstract",
    "other titles": "col1",
    "embargoed until": "col2",
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Removing the unnecessary columns.
# errors="ignore" keeps this cell from failing when "col1" or "col2" does not
# exist, e.g. because no scraped record carried the corresponding field.
df.drop(columns=["col1", "col2"], inplace=True, errors="ignore")

In [None]:
# Normalizing the column "category".
def _normalize_category(value):
    """Map a raw category label to "PhD", "MSc" or "Other"."""
    # A record may carry no category at all (None); anything that is not a
    # string is treated as "Other" instead of failing on the substring test.
    if not isinstance(value, str):
        return "Other"
    if "Doutorado" in value:
        return "PhD"
    if "Mestrado" in value:
        return "MSc"
    return "Other"


df["category"] = df["category"].apply(_normalize_category)

In [None]:
# Normalizing the columns "authors" and "advisor".
def _surname_last(name):
    """Turn "Surname, Given" into "Given Surname".

    A name without a comma is returned stripped but otherwise unchanged
    (instead of raising IndexError); text after a second comma is ignored.
    """
    parts = name.split(",")
    if len(parts) < 2:
        return name.strip()
    return f"{parts[1].strip()} {parts[0].strip()}"


# Both columns get the same treatment; only the non-null cells are rewritten.
for _column in ("authors", "advisor"):
    _filled = df[_column].notnull()
    df.loc[_filled, _column] = df.loc[_filled, _column].apply(_surname_last)

In [None]:
# Normalizing the column "auth_keywords".
def _split_keywords(raw):
    """Split a ";"-separated keyword string into a tuple of cleaned, non-empty keywords."""
    cleaned = (clean_text(part).strip() for part in raw.split(";"))
    return tuple(keyword for keyword in cleaned if len(keyword))


# Only the non-null cells are converted.
_has_keywords = df.auth_keywords.notnull()
df.loc[_has_keywords, "auth_keywords"] = df.loc[
    _has_keywords, "auth_keywords"].apply(_split_keywords)

In [None]:
# Normalizing the columns "title", "citation", "pt_abstract" and "en_abstract".
# Each column is cleaned with clean_text, touching only its non-null cells.
for text_column in ("title", "citation", "pt_abstract", "en_abstract"):
    has_text = df[text_column].notnull()
    df.loc[has_text, text_column] = df.loc[has_text, text_column].apply(clean_text)

In [None]:
# Checking the result: showing the first records of the dataframe.
df.head()

## 5. Saving the data

In [None]:
# Saving the data into a CSV file, without the index and with every field quoted.
# NOTE(review): header=0 is falsy, so presumably no header row with the column
# names is written — confirm this is intended (header=0 is the read_csv idiom
# for "first line is the header"); use header=True to keep the column names.
df.to_csv("ppgeec_phd_msc_thesis.csv", header=0, index=False, quoting=csv.QUOTE_ALL)