In [10]:
import re
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
from requests import get

In [2]:
def clean_answer(input_text):
    '''Flatten a scraped answer onto a single line.

    Newlines at either end of ``input_text`` are dropped and every newline
    remaining inside the text becomes one space. Other whitespace is kept.
    '''
    trimmed = input_text.strip('\n')
    return ' '.join(trimmed.split('\n'))

In [3]:
def clean_links(link_list):
    '''Resolve the link collected for each answer into an absolute URL.

    Args:
        link_list: one entry per answer. Each entry is either None (the
            answer contained no link) or an object exposing ``.get('href')``,
            such as a BeautifulSoup ``<a>`` tag.

    Returns:
        A new list with the same length and order as ``link_list``:
        - None when the entry is None or has a missing/empty ``href``
          (keeps the list aligned with the question and answer lists);
        - the FAQ base URL + href when the href starts with '?';
        - the site root + href when the href starts with '/';
        - the href unchanged otherwise.
    '''
    faq_base = 'https://www.rug.nl/education/faq/'
    site_root = 'https://www.rug.nl'

    resolved = []
    for link in link_list:
        # Anchors without an href (or with an empty one) cannot be turned
        # into a URL; treat them like "no link" instead of raising.
        href = link.get('href') if link is not None else None

        if not href:
            resolved.append(None)
        elif href.startswith('?'):  # target is a sub page of the FAQ
            resolved.append(faq_base + href)
        elif href.startswith('/'):  # target is a page on the RUG site
            resolved.append(site_root + href)
        else:  # target is already a full link
            resolved.append(href)

    return resolved

In [16]:
def retrieve_data(question_list, answer_list, link_list, url, heading_list, timeout=30):
    """Scrape one RUG FAQ overview page and collect its questions and answers.

    Every ``<a>`` inside the overview's content containers is visited:

    * hrefs starting with '?' are FAQ sub pages. The sub page is fetched and
      its ``<h2>`` headings are paired with its content containers. A heading
      listed in ``heading_list`` only links to further sub-questions, so each
      of those links is fetched and stored with the link text as question.
      Any other heading is stored together with its own container. When the
      number of headings and containers differ, the page is stored as a
      single question (``<h1>``) with the first container as answer.
    * any other href is stored with the link text as question and the
      enclosing overview container as answer.
    * anchors without a usable href are skipped.

    Args:
        question_list: list that receives the question texts.
        answer_list: list that receives the cleaned answer texts.
        link_list: list that receives, per answer, the first ``<a>`` tag
            found in it (None when there is none).
        url: address of the FAQ overview page.
        heading_list: heading texts whose section must be followed into its
            sub-question links.
        timeout: seconds passed to every HTTP request.

    Returns:
        The tuple ``(question_list, answer_list, link_list)``; the three
        lists are extended in place and stay index-aligned.

    Side effects:
        Appends every fetched sub-page URL to the module-level list
        ``overzicht_url``, which must exist before this function is called.

    Raises:
        AttributeError: if a single-question page has no ``<h1>`` or content
            container, or a sub-question page has neither a content
            container nor a ``<p>``.
        Exceptions raised by ``requests.get`` propagate unchanged.
    """
    faq_base = 'https://www.rug.nl/education/faq/'
    content_div = ('div', {'class': 'rug-clearfix rug-theme--content rug-mb'})

    def fetch_soup(page_url):
        # Single place for the HTTP call so every request carries the timeout.
        return BeautifulSoup(get(page_url, timeout=timeout).text, 'html.parser')

    def record(question_text, node):
        # Read everything off the node before appending, so a missing node
        # (None) raises without leaving the three lists at different lengths.
        answer_text = clean_answer(node.text)
        first_link = node.a
        question_list.append(question_text)
        answer_list.append(answer_text)
        link_list.append(first_link)

    overview_soup = fetch_soup(url)

    # Distinct names per nesting level: the container of the overview page
    # must stay intact while sub pages are processed, because later external
    # links in the same container are stored with that container's text.
    for container in overview_soup.find_all(*content_div):
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue  # named anchor or empty href: nothing to follow

            if not href.startswith('?'):
                # Link leaves the FAQ: keep the overview container as answer.
                record(anchor.text, container)
                continue

            page_url = faq_base + href
            page_soup = fetch_soup(page_url)
            headings = page_soup.find_all(['h2'])
            sections = page_soup.find_all(*content_div)
            overzicht_url.append(page_url)

            if len(sections) != len(headings):
                # Headings and containers do not pair up: treat the page as
                # one question (its <h1>) answered by the first container.
                record(page_soup.find(['h1']).text, page_soup.find(*content_div))
                continue

            for heading, section in zip(headings, sections):
                if heading.text not in heading_list:
                    record(heading.text, section)
                    continue

                # The section only lists sub-questions; follow each link.
                for sub_anchor in section.find_all('a'):
                    sub_href = sub_anchor.get('href')
                    if not sub_href:
                        continue

                    if sub_href.startswith('?'):
                        sub_url = faq_base + sub_href
                    else:
                        sub_url = sub_href
                    overzicht_url.append(sub_url)

                    sub_soup = fetch_soup(sub_url)
                    try:
                        record(sub_anchor.text, sub_soup.find(*content_div))
                    except (AttributeError, KeyError):
                        # External pages are structured differently: fall
                        # back to the first paragraph of the page.
                        record(sub_anchor.text, sub_soup.find(['p']))

    return question_list, answer_list, link_list


In [5]:
def write_aiml_file(question_list, answer_list, clean_links, path='aiml_base.aiml'):
    '''Write one AIML <category> per question/answer/link triple.

    Args:
        question_list: question texts, written inside <pattern>.
        answer_list: answer texts, written inside <template>.
        clean_links: per answer either a URL string or None. A URL adds a
            <button> with the caption 'Klik hier' and the URL to the template.
        path: output file; it is created or overwritten.

    The three sequences are combined with ``zip``, so writing stops at the
    shortest one. Question, answer and URL are XML-escaped ('&', '<', '>')
    so that the resulting file stays well-formed. The file is written as
    UTF-8.
    '''
    with open(path, 'w', encoding='utf-8') as f:
        for vraag, antwoord, link in zip(question_list, answer_list, clean_links):
            lines = ['<category>',
                     '<pattern>', escape(vraag), '</pattern>',
                     '<template>', escape(antwoord)]

            if link is not None:  # there is a link in the answer
                lines += ['<button>',
                          '<text>', 'Klik hier', '</text>',
                          '<url>', escape(link), '</url>',
                          '</button>']

            lines += ['</template>', '</category>']

            # Categories with a button are followed by a blank line,
            # categories without one are not (layout kept as it was).
            trailer = '\n\n' if link is not None else '\n'
            f.write('\n'.join(lines) + trailer)

In [17]:
# FAQ overview pages to scrape: every section of the FAQ except immigration.
# Each overview URL repeats its page id twice in the tcid parameter.
url_list = ['https://www.rug.nl/education/faq/?tcid=verint_3_{0}_{0}'.format(page_id)
            for page_id in (7394, 7395, 7412, 7398, 7399, 7400, 7402, 7401)]

# Headings whose section consists of links to sub-questions. The scraping
# cell passes this list to retrieve_data as `heading_list`; add a heading
# here when its section contains sub-links.
heading_list = ['Aanmelding en toelating', 'Inschrijven', 'Herinschrijven',
                'Uitschrijven', 'Studielink', 'Meer informatie ...', 'Collegegeld',
                'Financiële ondersteuning', 'Profileringsfonds', 'Studiefinanciering (DUO)',
                'Studiebeurzen', 'Studiekosten',
                "Aanmelding- en selectieprocedure", "Onderwijs - honoursprogramma", "Bindend Studieadvies (BSA)",
                "Studeren met een functiebeperking", "Studiedips en andere studieproblemen.", "Studiekeuze",
                "Studievertraging", "Honours College", "Academische ontwikkeling", "Titulatuur / graden", "Alumni",
                'Adressen en openingstijden', 'Voorzieningen', 'Formulieren (overzicht)',
                'Aanvragen verklaringen (student)', 'Regelingen (overzicht)', 'Handleidingen (overzicht)']

# Alias for the earlier, misspelled name so existing references keep working.
headling_list = heading_list

In [19]:
# Start from empty accumulators so re-running this cell does not append to
# the results of a previous run.
question_list, answer_list, link_list = [], [], []
overzicht_url = []  # retrieve_data appends every sub-page URL it fetches

for url in url_list:
    question_list, answer_list, link_list = retrieve_data(
        question_list, answer_list, link_list, url, heading_list
    )

print(len(overzicht_url))

179


In [20]:
# Number of URLs recorded while scraping, followed by the URLs themselves
print(len(overzicht_url), overzicht_url, sep='\n')

179
['https://www.rug.nl/education/faq/?tcid=verint_3_7394_7842', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7843', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7844', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7846', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7847', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7848', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7849', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7850', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7851', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_15100', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7853', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7855', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7856', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7857', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7858', 'https://www.rug.nl/education/faq/?tcid=verint_3_7394_7859', 'https://www.rug.n

In [8]:
# Remove the question marks from the questions to keep them compatible with
# AIML. Slice assignment updates the existing list object in place in a
# single pass (no per-item list.index lookup).
question_list[:] = [vraag.replace('?', '') for vraag in question_list]

In [11]:
# Remove 'Stel je vraag' when it is the very end of an answer; answers that
# do not end with it are left unchanged. The pattern is a raw string that
# matches the literal text only (a leading backslash would turn the 'S'
# into the regex class \S, i.e. "any non-whitespace character").
answer_list[:] = [re.sub(r'Stel je vraag$', '', antwoord) for antwoord in answer_list]

In [12]:
# Store the result under its own name: rebinding `clean_links` to the list
# would replace the function and make this cell fail when it is run again.
cleaned_links = clean_links(link_list)
write_aiml_file(question_list, answer_list, cleaned_links)

In [None]:
# Inspect the scraped pairs: one "question answer" line per pair, each
# followed by an empty line
for vraag, antwoord in zip(question_list, answer_list):
    print('{} {} \n'.format(vraag, antwoord))