# Extracting dataset metadata from the UC Irvine Machine Learning Repository
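In this notebook, we scrape the pages of the datasets listed in the UC Irvine Machine Learning Repository and collect, for each one, its date line and its descriptive text (title, subtitle, and description). The resulting table is de-duplicated, restricted to entries dated 2010 or later, and saved as a Parquet file. The repository is widely used by researchers, educators, and practitioners as a benchmark collection for testing machine learning algorithms.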

## Imports 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Constants and data structures

In [2]:
# Dataset page URLs for which get_link_info could not extract metadata;
# get_link_info appends to this list as a side effect so failures can be reviewed later.
fill_void_info = list()

In [3]:
# Base address of the repository; relative dataset paths are appended to it.
main_url = "https://archive.ics.uci.edu"
# Listing page ordered by hit count. `take=678` caps how many entries are returned
# (presumably the number of datasets available when this was written -- TODO confirm).
url = f"{main_url}/datasets" + "?take=678&sort=desc&orderBy=NumHits&search="

### Functions

In [None]:
def get_link_info(link, timeout=30):
    """
    Extracts metadata from a dataset web page hosted on the UC Irvine Machine Learning Repository.

    Parameters:
    - link (str): URL of the dataset page to extract information from.
    - timeout (float): Seconds to wait for the server before abandoning the request.
      Defaults to 30.

    Returns:
    - tuple:
        - date (str or None): Text of the first <h2> element on the page, which is where
          the date line is read from.
        - dataset_information (str or None): The page title (first <h1>), the subtitle
          block, and the descriptive block, joined by single spaces.
      Both values are None when extraction fails.

    Notes:
    - If the extraction fails for any reason (e.g., network error, HTTP error status,
      page structure changes), the link is appended to the global list `fill_void_info`
      for later review and (None, None) is returned.
    - KeyboardInterrupt and SystemExit are deliberately not caught, so a long scraping
      loop can still be interrupted.
    """
    try:
        # Without a timeout a stalled connection would block the whole scrape indefinitely.
        dataset_response = requests.get(link, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them as dataset pages.
        dataset_response.raise_for_status()
        # Name the parser explicitly so the result does not depend on which optional
        # parsers happen to be installed.
        dataset_soup = BeautifulSoup(dataset_response.content, 'html.parser')
        # find() returns None when an element is missing; the resulting AttributeError
        # on `.text` is what routes structurally unexpected pages to the except branch.
        date = dataset_soup.find('h2').text
        title = dataset_soup.find('h1').text
        subtitle = dataset_soup.find('div', class_='relative flex flex-col gap-4 bg-base-100 p-4 shadow').text
        info = dataset_soup.find('div', class_='p-4 pt-0').text
        dataset_information = f"""{title} {subtitle} {info}"""
        return date, dataset_information
    except Exception:
        fill_void_info.append(link)
        return None, None

### Reading and processing information

In [4]:
# Download the dataset listing page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status stops here on an HTTP error instead of
# silently continuing with an error page that contains no dataset links.
response = requests.get(url, timeout=30)
response.raise_for_status()
response_text = response.text
# Name the parser explicitly so parsing does not depend on which optional parsers
# are installed in the environment.
soup = BeautifulSoup(response_text, 'html.parser')
# Every anchor element on the listing page.
desiredlist = soup.find_all('a')

In [5]:
# Relative paths ('/dataset/...') of the dataset pages linked from the listing.
# Anchors without an href yield None from .get(); they are skipped here rather than
# raising AttributeError on None.startswith. The set removes repeated links.
relevant_links_part = {
    href
    for href in (ref.get('href') for ref in desiredlist)
    if isinstance(href, str) and href.startswith('/dataset/')
}

In [6]:
# Sort the paths so pages are visited, and rows are produced, in the same order on
# every run; iteration order of a set of strings varies between interpreter sessions.
relevant_links = [f'{main_url}{ref}' for ref in sorted(relevant_links_part)]
# One (date, description) tuple per page; (None, None) for pages that failed.
datasets_info = [get_link_info(link) for link in relevant_links]
# Naming the columns at construction keeps them present even when nothing was scraped,
# so later cells do not fail on a missing 'date' column. dropna discards failed pages.
final_data = pd.DataFrame(datasets_info, columns=['date', 'Total Description']).dropna()

In [11]:
pd.set_option('display.max_rows', 1000)
# Strip the literal "Donated on " / "Linked on " prefixes so only the date text is parsed.
date_text = (
    final_data['date']
    .str.replace('Donated on ', '', regex=False)
    .str.replace('Linked on ', '', regex=False)
)
# Build a new frame instead of mutating in place, so re-running this cell is safe.
final_data = (
    final_data
    .assign(date_dt=pd.to_datetime(date_text))
    # Remove duplicate rows based on the 'Total Description' column to keep only unique entries
    .drop_duplicates('Total Description')
)
# Filter the data to only include entries from the year 2010 onward
final_data = final_data[final_data['date_dt'] >= '2010']
# Number of entries per calendar year. An offset object is used for the frequency
# because the 'Y' alias is deprecated since pandas 2.2 and its replacement 'YE' is
# not accepted by older releases.
final_data.groupby(pd.Grouper(key='date_dt', freq=pd.offsets.YearEnd()))['date'].count().plot()

In [20]:
# Persist the cleaned table as a Parquet file (path is relative to the notebook's working directory).
output_path = '../data/external/UC_Irvine_ML_Repo.parquet'
final_data.to_parquet(output_path)