# Degrees

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Template used by set_url_base: an href read from a degree page is
# substituted for {0} to build the URL that is then requested.
BASE_URL = 'https://portal.uned.es/portal{0}'
# Index page listing the degrees; it is downloaded once and parsed by
# get_degree_list.
DEGREES_URL = 'https://portal.uned.es/portal/page?_pageid=93,1643102&_dad=portal&_schema=PORTAL'
# Landing page of a single degree; {0} is the degree code read from the
# index table.
DEGREE_URL = 'https://portal.uned.es/portal/pls/portal/url/page/UNED_MAIN/GRADOS/{0}'
# Written into the `university` column and used as the prefix of every CSV
# file name produced by save_degree.
UNIVERSITY_NAME = 'universidad-nacional-de-educacion-a-distancia-grados'

# Header of each per-degree subjects CSV. The order mirrors the rows that
# set_degree_subjects appends, so both must be changed together.
SUBJECT_COLUMNS = [
    'code', 'name', 'url',
    'subject_type', 'credits',
    'period', 'course_number',
]

# Header of the degrees summary CSV, in the order the row values are produced:
# get_degree_list supplies `code` .. `price_4`, save_degree then appends
# `content` .. `num_subjects`.
# NOTE(review): `capacity`, `min_grade` and `description` have no producer in
# this notebook; they are kept as schema placeholders.
# The stray empty-string entry that used to close this list was removed: it
# would only have created an unnamed column in the CSV.
DEGREE_COLUMNS = [
    'code',
    'name',
    'university',
    'area',
    'faculty',
    'url',
    'price_1',
    'price_2',
    'price_3',
    'price_4',
    'content',
    'skills',
    'profile',
    'opportunities',
    'num_subjects',
    'capacity',
    'min_grade',
    'description'
]

In [3]:
# Price table transcribed by hand from the 2019-20 degree price list:
# http://portal.uned.es/pls/portal/docs/PAGE/UNED_MAIN/INFORMACIONGENERAL/PDF'S/PRECIOS-GRADOSUE2019-20.PDF
#
# Keys are degree names and are looked up with the name text exactly as it is
# scraped from the index page (get_degree_list), so spacing and accents matter.
# Each value holds the four numbers written to the price_1 .. price_4 columns;
# presumably the price per credit for the 1st to 4th enrolment — TODO confirm
# against the PDF.

PRICES = {
    'GRADO EN MATEMÁTICAS': [13.39, 18.97, 41.74, 57.55],
    'GRADO EN LENGUA Y LITERATURA ESPAÑOLAS': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN ESTUDIOS INGLESES: LENGUA, LITERATURA Y CULTURA': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN CIENCIA POLÍTICA Y DE LA ADMINISTRACIÓN': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN DERECHO': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN SOCIOLOGÍA': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN CC. JURÍDICAS DE LAS ADMINISTRACIONES PÚBLICAS': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN GEOGRAFÍA E HISTORIA': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN HISTORIA DEL ARTE': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN FILOSOFÍA': [13.00, 18.97, 41.74, 57.55],
    'GRADO EN ANTROPOLOGÍA SOCIAL Y CULTURAL': [13.00, 18.97, 41.74, 57.55],
    # NOTE(review): the first value matches the 13.20 group below while the
    # other three match the 15.95 group — confirm against the PDF.
    'GRADO EN TRABAJO SOCIAL': [13.20, 22.95, 50.49, 69.62],
    'GRADO EN ADMINISTRACIÓN Y DIRECCIÓN DE EMPRESAS': [13.20, 20.89, 47.09, 64.84],
    'GRADO EN ECONOMÍA': [13.20, 20.89, 47.09, 64.84],
    'GRADO EN TURISMO': [13.20, 20.89, 47.09, 64.84],
    'GRADO EN PSICOLOGÍA': [15.95, 22.95, 50.49, 69.62],
    'GRADO EN CRIMINOLOGÍA': [15.95, 22.95, 50.49, 69.62],
    'GRADO EN PEDAGOGÍA': [14.80, 22.95, 50.49, 69.62],
    'GRADO EN EDUCACIÓN SOCIAL': [14.80, 22.95, 50.49, 69.62],
    'GRADO EN QUÍMICA': [20.48, 30.60, 67.32, 92.82],
    'GRADO EN FÍSICA': [20.48, 30.60, 67.32, 92.82],
    'GRADO EN CIENCIAS AMBIENTALES': [20.48, 30.60, 67.32, 92.82],
    'GRADO EN ING. EN  ELECTRÓNICA INDUSTRIAL Y AUTOMÁTICA': [21.60, 30.60, 67.32, 92.82],
    'GRADO EN INGENIERÍA ELÉCTRICA': [21.60, 30.60, 67.32, 92.82],
    'GRADO EN INGENIERÍA MECÁNICA': [21.60, 30.60, 67.32, 92.82],
    'GRADO EN INGENIERÍA EN TECNOLOGÍAS INDUSTRIALES': [21.60, 30.60, 67.32, 92.82],
    'GRADO EN INGENIERÍA INFORMÁTICA': [21.60, 30.60, 67.32, 92.82],
    'GRADO EN INGENIERÍA EN TECNOLOGÍAS DE LA INFORMACIÓN': [21.60, 30.60, 67.32, 92.82]
}

In [4]:
# Download the degrees index page once; get_degree_list works from this soup.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
page = requests.get(DEGREES_URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
def get_degree_list(soup):
    """Build one row per degree from the parsed degrees index page.

    Args:
        soup: Parsed index page; it must expose the ``.tabla_grados > table``
            tables, whose header cell gives the area and whose body rows give
            code, name and faculty in their first three cells.

    Returns:
        list[list]: One row per degree laid out as
        ``[code, name, university, area, faculty, url, price_1 .. price_4]``.
        A degree whose name is missing from ``PRICES`` gets ``None`` for all
        four prices so every row keeps the same width.

    Raises:
        IndexError: If there are fewer header cells than tables.
    """
    degree_areas = [header.getText() for header in soup.select('.tabla_grados > table > thead > tr > th')]
    degree_tables = soup.select('.tabla_grados > table')
    degree_list = []

    for index, degree_table in enumerate(degree_tables):
        area = degree_areas[index]

        for table_body in degree_table.select('tbody'):
            # Walk the rows directly; scanning every descendant tag only to
            # discard the ones without cells is wasteful and fragile.
            for table_row in table_body.find_all('tr'):
                cells = table_row.find_all('td')
                # Skip header/spacer rows that lack code, name and faculty.
                if len(cells) < 3:
                    continue

                code = cells[0].getText()
                name = cells[1].getText()
                faculty = cells[2].getText()
                url = DEGREE_URL.format(code)

                # The name is looked up exactly as scraped because the PRICES
                # keys were copied verbatim. An unknown degree must not abort
                # the whole scrape, so its prices are left empty instead.
                prices = PRICES.get(name)
                if prices is None:
                    prices = [None] * 4  # price_1 .. price_4

                degree_row = [code, name, UNIVERSITY_NAME, area, faculty, url]
                degree_row.extend(prices)
                degree_list.append(degree_row)

    return degree_list

In [6]:
def set_url_base(url):
    """Return ``url`` substituted into the ``BASE_URL`` template."""
    absolute_url = BASE_URL.format(url)
    return absolute_url

In [7]:
def get_degree_content(soup):
    """Return the cleaned text of the first ``.contenido_texto`` element.

    Non-breaking spaces (U+00A0) and soft hyphens (U+00AD) are dropped and
    every newline becomes a single space.

    Raises:
        IndexError: If the page has no ``.contenido_texto`` element.
    """
    text = soup.select('.contenido_texto')[0].get_text()
    for unwanted, replacement in (('\xa0', ''), ('\xad', ''), ('\n', ' ')):
        text = text.replace(unwanted, replacement)
    return text

In [8]:
def get_degree_info_links(soup):
    """Return the anchors selected by ``.menu_guia > ul > li > a``."""
    info_anchors = soup.select('.menu_guia > ul > li > a')
    return info_anchors

def get_degree_data_links(soup):
    """Return the anchors selected by ``#plan_estudios > li > a``."""
    plan_anchors = soup.select('#plan_estudios > li > a')
    return plan_anchors

def get_degree_links(soup):
    """Collect the URLs of a degree's skills, profile, opportunities and subjects pages.

    The anchors are picked by fixed position: entries 2, 3 and 5 of the guide
    menu and entry 1 of the study-plan list (all zero-based). Each href is
    passed through ``set_url_base``.

    Returns:
        dict: Keys ``skills_url``, ``profile_url``, ``opportunities_url`` and
        ``subjects_url``.

    Raises:
        IndexError: If either list is shorter than the positions read.
    """
    menu_anchors = get_degree_info_links(soup)
    plan_anchors = get_degree_data_links(soup)

    return {
        'skills_url': set_url_base(menu_anchors[2].get('href')),
        'profile_url': set_url_base(menu_anchors[3].get('href')),
        'opportunities_url': set_url_base(menu_anchors[5].get('href')),
        'subjects_url': set_url_base(plan_anchors[1].get('href')),
    }

In [9]:
def get_degree_soups(degree_list):
    """Download and parse the landing page of every degree.

    Args:
        degree_list: Rows as built by ``get_degree_list``; the page URL is
            read from position 5 of each row.

    Returns:
        list: One parsed ``BeautifulSoup`` document per row, in row order.

    Raises:
        requests.exceptions.RequestException: If a download times out or the
            server answers with an HTTP error status.
    """
    degree_soups = []

    for degree in degree_list:
        # Without a timeout a single stalled connection blocks the whole run.
        page = requests.get(degree[5], timeout=30)
        # Fail here on 4xx/5xx rather than later while parsing an error page.
        page.raise_for_status()
        degree_soups.append(BeautifulSoup(page.content, 'html.parser'))

    return degree_soups

### Skills

In [10]:
def get_degree_skills(degree):
    """Fetch the skills page and store its text under ``degree['skills']``.

    Args:
        degree: Dict holding at least ``'skills_url'``; it is updated in place.

    Returns:
        dict: The same ``degree`` object, now carrying the ``'skills'`` text.

    Raises:
        requests.exceptions.RequestException: If the download times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    page = requests.get(degree['skills_url'], timeout=30)
    # Fail here on 4xx/5xx rather than while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    degree['skills'] = get_degree_content(soup)

    return degree

### Profile

In [11]:
def get_degree_profile(degree):
    """Fetch the profile page and store its text under ``degree['profile']``.

    Args:
        degree: Dict holding at least ``'profile_url'``; it is updated in place.

    Returns:
        dict: The same ``degree`` object, now carrying the ``'profile'`` text.

    Raises:
        requests.exceptions.RequestException: If the download times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    page = requests.get(degree['profile_url'], timeout=30)
    # Fail here on 4xx/5xx rather than while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    degree['profile'] = get_degree_content(soup)

    return degree

### Opportunities

In [12]:
def get_degree_opportunities(degree):
    """Fetch the opportunities page and store its text under ``degree['opportunities']``.

    Args:
        degree: Dict holding at least ``'opportunities_url'``; it is updated
            in place.

    Returns:
        dict: The same ``degree`` object, now carrying the ``'opportunities'``
        text.

    Raises:
        requests.exceptions.RequestException: If the download times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    page = requests.get(degree['opportunities_url'], timeout=30)
    # Fail here on 4xx/5xx rather than while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    degree['opportunities'] = get_degree_content(soup)

    return degree

### Subjects

In [13]:
def set_degree_subjects(tables):
    """Flatten the study-plan tables into one row per subject.

    Rows that carry a ``class`` attribute are never subjects: unless the class
    list contains ``hidden-xs``, their first cell names the period applied to
    the subject rows that follow. The period is not reset between tables.

    Args:
        tables: Study-plan tables in page order; the n-th table (counting
            from 1) supplies ``course_number`` n.

    Returns:
        list[list]: Rows laid out as ``[code, name, url, subject_type,
        credits, period, course_number]``. ``period`` is ``''`` for subjects
        listed before any period header, and ``url`` is ``None`` when the
        name cell has no link.
    """
    course_list = []
    # Starts empty so a subject row that precedes every period header no
    # longer fails with UnboundLocalError.
    semester = ''

    for course_number, table in enumerate(tables, start=1):
        for table_body in table.select('tbody'):
            for table_row in table_body.find_all('tr'):
                if 'class' in table_row.attrs:
                    if 'hidden-xs' not in table_row['class']:
                        header_cells = table_row.select('td')
                        # A header row without cells keeps the previous period.
                        if header_cells:
                            semester = header_cells[0].get_text(strip=True).replace(u'\xa0', u'')
                    continue

                cells = table_row.find_all('td', {'headers': True})
                # A subject row needs code, name, type and credits cells.
                if len(cells) < 4:
                    continue

                code = cells[0].get_text(strip=True)
                name = cells[1].get_text(strip=True)
                subject_type = cells[2].get_text(strip=True)
                credits = cells[3].get_text(strip=True)

                anchors = cells[1].select('a')
                href = anchors[0].get('href') if anchors else None
                if href:
                    # The scraped href can contain stray spaces and line breaks.
                    href = href.replace(u' ', u'').replace(u'\r', u'').replace(u'\n', u'')
                    url = set_url_base(href)
                else:
                    url = None

                course_list.append([code, name, url, subject_type, credits, semester, course_number])

    return course_list

def get_degree_subjects(links):
    """Download the study-plan page and return its subjects as rows.

    Args:
        links: Dict holding at least ``'subjects_url'``.

    Returns:
        list[list]: Whatever ``set_degree_subjects`` builds from the
        ``.plegatabla.tabla_asignaturas`` tables of the page.

    Raises:
        requests.exceptions.RequestException: If the download times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    page = requests.get(links['subjects_url'], timeout=30)
    # Fail here on 4xx/5xx rather than while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.select('.plegatabla.tabla_asignaturas')

    return set_degree_subjects(tables)

In [14]:
def get_degree_info(soup):
    """Run the skills, profile and opportunities fetchers for one degree page.

    The links found on ``soup`` are handed, in that order, to
    ``get_degree_skills``, ``get_degree_profile`` and
    ``get_degree_opportunities``.

    Returns:
        list: The three return values, in call order.

    NOTE(review): each fetcher returns the dict it was given, so all three
    entries refer to the same ``links`` dict rather than to three separate
    texts — confirm whether callers expect the texts instead.
    """
    links = get_degree_links(soup)

    return [
        get_degree_skills(links),
        get_degree_profile(links),
        get_degree_opportunities(links),
    ]

In [15]:
def save_degree(degree_soups, degree_list):
    """Scrape the details of every degree and write the result to CSV files.

    For each degree a ``<UNIVERSITY_NAME>-degree-<code>.csv`` file with its
    subjects is written; a final ``<UNIVERSITY_NAME>.csv`` file holds one
    summary row per degree.

    Args:
        degree_soups: Parsed landing pages, aligned with ``degree_list``.
        degree_list: Rows as built by ``get_degree_list``. The rows are copied,
            not modified, so calling this function again does not append the
            detail columns twice.

    Returns:
        None
    """
    # Blank names are skipped so no unnamed column reaches the CSV.
    columns = [column for column in DEGREE_COLUMNS if column]
    degree_rows = []

    for soup, degree in zip(degree_soups, degree_list):
        links = get_degree_links(soup)
        content = get_degree_content(soup)
        # Each fetcher stores its text on the dict it receives and returns
        # that same dict; read the text back by key so the CSV holds the text
        # and not the repr of the whole links dict.
        skills = get_degree_skills(links)['skills']
        profile = get_degree_profile(links)['profile']
        opportunities = get_degree_opportunities(links)['opportunities']
        subjects = get_degree_subjects(links)

        degree_row = list(degree) + [content, skills, profile, opportunities, len(subjects)]
        # Columns this scraper has no value for stay empty; without the
        # padding pandas rejects rows narrower than the header.
        degree_row.extend([None] * (len(columns) - len(degree_row)))
        degree_rows.append(degree_row)

        degree_subjects_df = pd.DataFrame(subjects, columns=SUBJECT_COLUMNS)
        degree_subjects_df.to_csv('{0}-degree-{1}.csv'.format(UNIVERSITY_NAME, degree[0]))

    degrees_df = pd.DataFrame(degree_rows, columns=columns)
    degrees_df.to_csv('{0}.csv'.format(UNIVERSITY_NAME))

In [16]:
# Run the scrape: list the degrees found on the index page, download each
# degree's landing page, then collect the details and write the CSV files.
degree_list = get_degree_list(soup)
degree_soups = get_degree_soups(degree_list)

# Writes one subjects CSV per degree plus one summary CSV for all degrees.
save_degree(degree_soups, degree_list)