In [1]:
%load_ext autoreload
%autoreload 2
import requests
import pandas as pd
import numpy as np
from time import sleep 
from bs4 import BeautifulSoup

In [2]:
import re 

# "This post was modified ..." edit notice embedded in scraped post text, e.g.
#   "This post was modified 3 weeks ago 2 times by"
#   "This post was modified 1 year ago by"
# Any common time unit (singular or plural) and the optional "N times" part are
# accepted, so notices such as "1 week ago" or "2 months ago" are stripped too.
_EDIT_NOTICE_RE = re.compile(
    r"This post was modified \d+ (?:second|minute|hour|day|week|month|year)s? ago"
    r"(?: \d+ times?)? by"
)


def clean_text(text, cleaning_tag=0):
    '''
    Strip known boilerplate from a scraped post.

    Inputs:
        - text (str): raw post text
        - cleaning_tag (int)
            - if tag = 1, remove '\\nOutcompute to outcompete | Growing our own timber\\n'
            - if tag = 2, remove 'Best regards,' and 'Gayathriy'
            - any other value: no tag-specific removal
    Outputs:
        - the text with the tag-specific boilerplate and every
          'This post was modified <N> <unit> ago [<M> times] by' notice removed.
          The name that follows 'by' is left in place.
    '''
    if cleaning_tag == 1:
        text = text.replace("\nOutcompute to outcompete | Growing our own timber\n", "")
    elif cleaning_tag == 2:
        text = text.replace("Best regards,", "")
        text = text.replace("Gayathriy", "")

    # A single pass handles both the "N times" and the plain form of the notice.
    return _EDIT_NOTICE_RE.sub('', text)

def scrape_data(url, title, cleaning_tag=0, timeout=30):
    '''
    Scrape one FAQ listing page and the answer post of every topic linked from it.

    Inputs:
        - url (str): listing page; every <a class="wpforo-topic-title"> on it is
          treated as one question (its 'title' attribute is the question text and
          its 'href' is the topic page that is fetched for the answer)
        - title (str): value written to the 'title' column of every row
        - cleaning_tag (int): forwarded to clean_text for each answer
            - if tag = 1, remove 'Outcompute to outcompete | Growing our own timber' from post
            - if tag = 2, remove 'Best regards,' and 'Gayathriy' from post
        - timeout (float): seconds to wait on each HTTP request before giving up
    Outputs:
        - dataframe with three columns - questions, answers and title.
          A topic whose answer block cannot be found gets an empty string as answer.
    Raises:
        - requests.HTTPError if the listing page responds with an error status
        - other requests exceptions on connection failure or timeout
    '''
    listing = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently returning an empty frame for a broken listing URL.
    listing.raise_for_status()
    listing_doc = BeautifulSoup(listing.text, "html.parser")

    questions = []
    answers = []

    for qn in listing_doc.find_all('a', class_='wpforo-topic-title'):
        questions.append(qn['title'])
        # Random pause of up to half a second between topic requests.
        sleep(np.random.random()/2)
        ans_source = requests.get(qn['href'], timeout=timeout).text
        ans_doc = BeautifulSoup(ans_source, "html.parser")
        answer_div = ans_doc.find('div', class_='post-wrap wpfn-2 wpf-answer-wrap')
        # Either wrapper may be missing; fall back to an empty answer rather than
        # raising AttributeError on None.
        content_div = None
        if answer_div is not None:
            content_div = answer_div.find('div', class_="wpforo-post-content")
        answers.append(content_div.text if content_div is not None else "")

    df = pd.DataFrame({
        'questions' : questions,
        'answers' : answers,
    })
    # Add title column
    df['title'] = title
    # Strip boilerplate from every answer
    df['answers'] = df['answers'].apply(clean_text, cleaning_tag = cleaning_tag)
    # Print the scraped dataframe (kept so the cell's recorded output is unchanged)
    print(df)

    return df

In [3]:
# One row per FAQ section: (title, listing URL, cleaning tag).
# Both lookup dicts below are derived from this table so their keys always match.
_FAQ_SECTIONS = [
    ('100E', 'https://epoch.aisingapore.org/community/faq-100e/', 1),
    ('AI Apprencticeship Program (AIAP)', 'https://epoch.aisingapore.org/community/faq-aiap/', 1),
    ('Hiring AI Talent', 'https://epoch.aisingapore.org/community/hiring-ai-talents/', 1),
    ('LearnAI: AI for Everyone (AI4E)', 'https://epoch.aisingapore.org/community/faq-ai4e/', 2),
    ('LearnAI: AI for Industry (AI4I)', 'https://epoch.aisingapore.org/community/faq-ai4i/', 2),
    ('LearnAI: AI for Students (AI4S)', 'https://epoch.aisingapore.org/community/faq-ai4s/', 2),
    ('LearnAI: AI for Kids (AI4K)', 'https://epoch.aisingapore.org/community/faq-ai4k/', 2),
    ('LearnAI: AI Student Outreach Program', 'https://epoch.aisingapore.org/community/faq-aisop/', 2),
    ('LearnAI: Teacher Work Attachment Program', 'https://epoch.aisingapore.org/community/teacher-work-attachment-plus-programme-twapp/', 2),
    ('LearnAI: My LearnAI Account', 'https://epoch.aisingapore.org/community/my-learnai-account/', 2),
    ('LearnAI: Fees', 'https://epoch.aisingapore.org/community/fees-and-payment/', 2),
    ('LearnAI: Promotions', 'https://epoch.aisingapore.org/community/promotions/', 2),
    ('LearnAI: Datacamp', 'https://epoch.aisingapore.org/community/faq-datacamp/', 2),
    ('LearnAI: International Maths Day', 'https://epoch.aisingapore.org/community/international-maths-day/', 2),
]

# Section title -> listing page URL
faq_urls = {section: link for section, link, _tag in _FAQ_SECTIONS}

# Section title -> cleaning tag
tags = {section: tag for section, _link, tag in _FAQ_SECTIONS}

In [4]:
# Scrape every FAQ section into its own frame, keyed by section title.
# The cleaning tag for each section is looked up in `tags` by the same title.
dfs = {
    section: scrape_data(url=link, title=section, cleaning_tag=tags[section])
    for section, link in faq_urls.items()
}

                                           questions  \
0  What are the differences between 100 Experimen...   
1  How does an organisation apply for the 100 Exp...   
2  What are my responsibilities as a Project Spon...   
3           Can I have AI apprentices on my project?   
4  Who can be the Principal Investigator (PI) or ...   
5  How long does it take for my proposal to be ap...   
6                What if I do not have any datasets?   
7            When is the cash contribution required?   
8             What does 'in-kind contribution' mean?   
9                        Who are the AI apprentices?   

                                             answers title  
0  \n\n\n\n \n100E4I\n100E4R\n\n\n\nCollaboration...  100E  
1  \nAISG will provide the templates for an organ...  100E  
2  \nTo ensure a successful outcome from the 100E...  100E  
3  \nAI apprentices will be assigned for all 7-mo...  100E  
4  \nThe 100E PI (and Co-PI) has to be a full-tim...  100E  
5  \nThe review o

In [5]:
# Stack the per-section frames into one frame. Row labels are not reset here,
# so each section keeps the index values it was created with.
concatenated_df = pd.concat(objs=[frame for frame in dfs.values()])

In [4]:
# Load the saved scrape from disk, using the file's first column as the index.
# This rebinds `concatenated_df`, replacing any frame built earlier in the session.
# NOTE(review): no visible cell writes scraped_data.csv - confirm where it is produced.
concatenated_df = pd.read_csv(filepath_or_buffer='scraped_data.csv', index_col=0)

In [5]:
# Preview the first five rows of the combined frame.
concatenated_df.head(n=5)

Unnamed: 0,questions,answers,title
0,What are the differences between 100 Experimen...,\n\n\n\n \n100E4I\n100E4R\n\n\n\nCollaboration...,100E
1,How does an organisation apply for the 100 Exp...,\nAISG will provide the templates for an organ...,100E
2,What are my responsibilities as a Project Spon...,\nTo ensure a successful outcome from the 100E...,100E
3,Can I have AI apprentices on my project?,\nAI apprentices will be assigned for all 7-mo...,100E
4,Who can be the Principal Investigator (PI) or ...,\nThe 100E PI (and Co-PI) has to be a full-tim...,100E
