In [42]:
import time
import re
import csv
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
from selenium.webdriver.common.by import By

In [28]:
base_url = 'https://www.imigrasi.go.id'
initial_url = 'https://www.imigrasi.go.id/faq/visa'

url_wni = 'https://www.imigrasi.go.id/custom/view?type=wni-category'
url_wna = 'https://www.imigrasi.go.id/custom/view?type=wna-category'
url_uu_imigrasi = 'https://www.imigrasi.go.id/uu_imigrasi'

web_urls = {
    "urls": [url_wni, url_wna],
    "topic": ["Warga Negara Indonesia", "Warga Negara Asing"],
    "classes": ["cta-box2 col-4 d-flex align-items-center justify-content-center text-decoration-none", "cta-box2 col-4 d-flex align-items-center justify-content-center text-decoration-none"]
    }

In [None]:
def initiate_page(link, driver):
    '''
    This function is used to open the page and instruct driver to interact with the page. 
    This function will later be called by the main function inside a loop for multiple page scraping.
    '''
    driver.get(link)

    time.sleep(2) 

    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        if driver.find_elements(By.CLASS_NAME, "footer"):
            break

        if new_height == last_height:
            break
        last_height = new_height

    driver.execute_script("window.scrollBy(100,0)") # add side scroll

    html = driver.page_source
    soup = bs(html, 'html.parser')
    return soup        

In [30]:
def extract_category_groups(link, driver, class_name, topic):
    soup = initiate_page(link, driver)
    categories_links = {
        'topic': [],
        'title': [],
        'link': []
    }
    link_tag = soup.find_all('a', class_=class_name)
    for link in link_tag:
        categories_links['topic'].append(topic)
        categories_links['title'].append(link.text.strip())
        categories_links['link'].append(link['href'])
    
    return categories_links

In [31]:
def find_links_with_classes(soup, required_classes):
    """
    Find all <a> tags that contain all required classes.
    """
    return soup.find_all(
        'a',
        class_=lambda c: c and all(rc in c for rc in required_classes)
    )

In [32]:
def extract_categories(link, driver):
    soup = initiate_page(link, driver)
    categories_links = {
        'title': [],
        'link': []
    }
    link_tag = find_links_with_classes(soup, ["btn-type", "text-start"])
    for link in link_tag:
        categories_links['title'].append(link.text.strip())
        categories_links['link'].append(link['href'])
    
    return categories_links

In [33]:
def extract_questions(title, link, driver, topic, question=None):
    soup = initiate_page(link, driver)
    qa = { 
        "topic": [],
        "category": [],
        "question": [], 
        "answer_link": [],
        "answer": []
    }
    if (topic == 'FAQ'):
        questions_list = soup.find_all('a', class_='accordion-button collapsed')

        for q in questions_list:
            answerLink = q['href']
            answer_full_link = f'{base_url}{answerLink}'

            answer_page = initiate_page(answer_full_link, driver)
            answer_div = answer_page.find(id=re.compile("^faq-content-"))

            answer = answer_div.text.strip() if answer_div else 'No answer found'
            qa["topic"].append(topic)
            qa["category"].append(title)
            qa["question"].append(q.get_text())
            qa["answer_link"].append(answer_full_link),
            qa["answer"].append(answer)
    else:
        accordions = soup.find_all('div', class_='accordion-item')

        for div in accordions:
            sub_question_header = div.find('h3', class_='accordion-header')
            sub_question = sub_question_header.text.strip()

            answer_div = div.find(id=re.compile("-content-"))
            answer = answer_div.text.strip() if answer_div else 'No answer found'

            full_question = f"{question} {sub_question}"
            qa["topic"].append(topic)
            qa["category"].append(title)
            qa["question"].append(full_question)
            qa["answer_link"].append(link),
            qa["answer"].append(answer)
    
    return qa

In [34]:
def clean_text_preserve_linebreaks(text):
    import re
    text = text.replace('\xa0', ' ').replace('\ufeff', '')
    text = text.replace('\t', ' ')
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'[ \t]*\n[ \t]*', '\n', text)
    return text.strip()

In [35]:
def merge_and_save_data(qa_all, driver, file_path):
    merged_qa = {
        "topic": [],
        "category": [],
        "question": [],
        "answer_link": [],
        "answer": []
    }

    for qa in qa_all:
        for key in merged_qa:
            merged_qa[key] += qa.get(key, [])

    print("#############################")
    print(merged_qa)
    driver.quit()

    df_raw = pd.DataFrame(merged_qa)
    df_raw.to_csv(file_path, index=False, encoding='utf-8-sig')
    # df_raw.to_csv('../data/raw/imigrasi_faq_raw.csv', index=False, encoding='utf-8-sig')
    return df_raw

SCRAPE FAQ

In [36]:
cat_links = ['/faq/m-paspor', '/faq/affidavit', '/faq/izin-tinggal', '/faq/kartu-perjalanan-pebisnins-apec', '/faq/paspor', '/faq/pelaporan-orang-asing', '/faq/sduwhv', '/faq/visa']
print("The following dataset is taken from:\n")

for link in cat_links:
    print(f"https://www.imigrasi.go.id{link}")

The following dataset is taken from:

https://www.imigrasi.go.id/faq/m-paspor
https://www.imigrasi.go.id/faq/affidavit
https://www.imigrasi.go.id/faq/izin-tinggal
https://www.imigrasi.go.id/faq/kartu-perjalanan-pebisnins-apec
https://www.imigrasi.go.id/faq/paspor
https://www.imigrasi.go.id/faq/pelaporan-orang-asing
https://www.imigrasi.go.id/faq/sduwhv
https://www.imigrasi.go.id/faq/visa


In [37]:
def start_scrape_faq():
    driver = webdriver.Chrome()

    categories_link = extract_categories(initial_url, driver)
    print(categories_link)

    qa_all = []
        
    for i, link in enumerate(categories_link['link']):
        cat_title = categories_link['title'][i]
        cat_link = f'{base_url}{link}'
        qa = extract_questions(cat_title, cat_link, driver, "FAQ")
        print(f'{len(qa["question"])} questions extracted from {cat_link}')
        qa_all.append(qa)
    
    return merge_and_save_data(qa_all, driver, '../data/raw/imigrasi_faq_raw.csv')

def clean_faq(df_raw):
    df = df_raw.copy()
    df['question'] = df['question'].apply(clean_text_preserve_linebreaks)
    df['answer'] = df['answer'].apply(clean_text_preserve_linebreaks)
    df['type'] = 'faq'
    df.head()
    df.to_csv("../data/raw/imigrasi_faq_cleaned.csv", index=False)
    print("Data cleaned and saved to imigrasi_faq_cleaned.csv")
    return df

In [38]:
df_faq_raw = start_scrape_faq()
df_faq_clean = clean_faq(df_faq_raw)

{'title': ['M-Paspor', 'Affidavit', 'Izin Tinggal', 'Kartu Perjalanan Pebisnis APEC', 'Paspor', 'Pelaporan Orang Asing', 'SDUWHV', 'Visa'], 'link': ['/faq/m-paspor', '/faq/affidavit', '/faq/izin-tinggal', '/faq/kartu-perjalanan-pebisnins-apec', '/faq/paspor', '/faq/pelaporan-orang-asing', '/faq/sduwhv', '/faq/visa']}
6 questions extracted from https://www.imigrasi.go.id/faq/m-paspor
2 questions extracted from https://www.imigrasi.go.id/faq/affidavit
5 questions extracted from https://www.imigrasi.go.id/faq/izin-tinggal
6 questions extracted from https://www.imigrasi.go.id/faq/kartu-perjalanan-pebisnins-apec
8 questions extracted from https://www.imigrasi.go.id/faq/paspor
1 questions extracted from https://www.imigrasi.go.id/faq/pelaporan-orang-asing
8 questions extracted from https://www.imigrasi.go.id/faq/sduwhv
9 questions extracted from https://www.imigrasi.go.id/faq/visa
#############################
{'topic': ['FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', 'FAQ', '

SCRAPE WEB

In [13]:
print("The following dataset is taken from:\n")

for link in web_urls['urls']:
    print(f"https://www.imigrasi.go.id{link}")

The following dataset is taken from:

https://www.imigrasi.go.idhttps://www.imigrasi.go.id/custom/view?type=wni-category
https://www.imigrasi.go.idhttps://www.imigrasi.go.id/custom/view?type=wna-category


In [18]:
def start_scrape_web():
    driver = webdriver.Chrome()

    all_category_groups = []
    for i, url in enumerate(web_urls['urls']):
        categories_link = extract_category_groups(url, driver, web_urls['classes'][i], web_urls['topic'][i])
        all_category_groups.append(categories_link)

    # manually add UU imigrasi
    all_category_groups.append({
        'topic': ['Regulasi UU Imigrasi'],
        'title': ['Regulasi UU Imigrasi'],
        'link': ['/uu_imigrasi']
    })

    print(all_category_groups)

    qa_all = []

    for group in all_category_groups:
        for i, catLink in enumerate(group['link']):
            categories = extract_categories(f'{base_url}{catLink}', driver)
            
            for index, link in enumerate(categories['link']):
                sub_title = categories['title'][index]
                cat_title = group['title'][i]
                full_question = f'{cat_title} {sub_title}'
                cat_topic = group['topic'][i]
                cat_link = f'{base_url}{link}'
                qa = extract_questions(cat_title, cat_link, driver, cat_topic, full_question)
                print(f'{len(qa["question"])} topics extracted from {cat_link}')
                qa_all.append(qa)
    
    print(qa_all)

    return merge_and_save_data(qa_all, driver, '../data/raw/imigrasi_web_raw.csv')

def clean_faq_web(df_raw):
    df = df_raw.copy()
    df['question'] = df['question'].apply(clean_text_preserve_linebreaks)
    df['answer'] = df['answer'].apply(clean_text_preserve_linebreaks)
    df['type'] = 'web'
    df.head()
    df.to_csv("../data/raw/imigrasi_web_cleaned.csv", index=False)
    print("Data cleaned and saved to imigrasi_web_cleaned.csv")
    return df

In [22]:
df_web_raw = start_scrape_web()
df_web_clean = clean_faq_web(df_web_raw)

[{'topic': ['Warga Negara Indonesia', 'Warga Negara Indonesia', 'Warga Negara Indonesia', 'Warga Negara Indonesia'], 'title': ['Kartu Perjalanan Pebisnis APEC', 'Paspor Baru', 'Penggantian Paspor', 'Ubah Data Paspor'], 'link': ['/wni/kartu-perjalanan-pebisnis-apec', '/wni/paspor-baru', '/wni/ganti-paspor', '/wni/rubah-paspor']}, {'topic': ['Warga Negara Asing', 'Warga Negara Asing', 'Warga Negara Asing'], 'title': ['Daftar Subjek VoA, BVK & Calling Visa', 'Izin Tinggal Keimigrasian', 'Permohonan Visa Republik Indonesia'], 'link': ['/wna/daftar-negara-voa-bvk-calling-visa', '/wna/izin-tinggal-keimigrasian', '/wna/permohonan-visa-republik-indonesia']}, {'topic': ['Regulasi UU Imigrasi'], 'title': ['Regulasi UU Imigrasi'], 'link': ['/uu_imigrasi']}]
5 topics extracted from https://www.imigrasi.go.id/wni/paspor-baru/masyarakat-umum
5 topics extracted from https://www.imigrasi.go.id/wni/paspor-baru/anak-dibawah-17-tahun
5 topics extracted from https://www.imigrasi.go.id/wni/paspor-baru/anak

Combine Dataset

In [39]:
df_faq_clean.head()

Unnamed: 0,topic,category,question,answer_link,answer,type
0,FAQ,M-Paspor,Apa yang harus saya lakukan jika aplikasi M-Pa...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,Jika mengalami Request Timed Out (RTO) silakan...,faq
1,FAQ,M-Paspor,Apa yang harus saya lakukan jika tidak bisa lo...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,"Apabila tidak dapat login M-Paspor, harap mema...",faq
2,FAQ,M-Paspor,Apa yang harus saya lakukan jika tidak bisa me...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,Silakan menunjukkan QR Code dan bukti bayar ya...,faq
3,FAQ,M-Paspor,Bagaimana Cara Menutup Akun pada Aplikasi M-Pa...,https://www.imigrasi.go.id/faq/m-paspor/bagaim...,"Sebelum menghapus atau menutup akun M-Paspor, ...",faq
4,FAQ,M-Paspor,Bagaimana caranya untuk mendapatkan kuota pada...,https://www.imigrasi.go.id/faq/m-paspor/bagaim...,Kuota yang terdapat pada aplikasi M-Paspor mer...,faq


In [24]:
df_web_clean.head()

Unnamed: 0,topic,category,question,answer_link,answer,type
0,Warga Negara Indonesia,Paspor Baru,Paspor Baru Masyarakat Umum Informasi Umum,https://www.imigrasi.go.id/wni/paspor-baru/mas...,Permohonan paspor biasa dapat diajukan oleh wa...,web
1,Warga Negara Indonesia,Paspor Baru,Paspor Baru Masyarakat Umum Persyaratan,https://www.imigrasi.go.id/wni/paspor-baru/mas...,Kartu tanda penduduk (KTP) yang masih berlaku ...,web
2,Warga Negara Indonesia,Paspor Baru,Paspor Baru Masyarakat Umum Prosedur,https://www.imigrasi.go.id/wni/paspor-baru/mas...,Lakukan pendaftaran melalui aplikasi M-Paspor ...,web
3,Warga Negara Indonesia,Paspor Baru,Paspor Baru Masyarakat Umum Mekanisme Penerbitan,https://www.imigrasi.go.id/wni/paspor-baru/mas...,Pemeriksaan kelengkapan dan keabsahan persyara...,web
4,Warga Negara Indonesia,Paspor Baru,Paspor Baru Masyarakat Umum Biaya,https://www.imigrasi.go.id/wni/paspor-baru/mas...,Paspor biasa nonelektronik 48 halaman: Rp350.0...,web


In [40]:
df_combined = pd.concat([df_faq_clean, df_web_clean], ignore_index=True)
df_combined.head()

Unnamed: 0,topic,category,question,answer_link,answer,type
0,FAQ,M-Paspor,Apa yang harus saya lakukan jika aplikasi M-Pa...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,Jika mengalami Request Timed Out (RTO) silakan...,faq
1,FAQ,M-Paspor,Apa yang harus saya lakukan jika tidak bisa lo...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,"Apabila tidak dapat login M-Paspor, harap mema...",faq
2,FAQ,M-Paspor,Apa yang harus saya lakukan jika tidak bisa me...,https://www.imigrasi.go.id/faq/m-paspor/apa-ya...,Silakan menunjukkan QR Code dan bukti bayar ya...,faq
3,FAQ,M-Paspor,Bagaimana Cara Menutup Akun pada Aplikasi M-Pa...,https://www.imigrasi.go.id/faq/m-paspor/bagaim...,"Sebelum menghapus atau menutup akun M-Paspor, ...",faq
4,FAQ,M-Paspor,Bagaimana caranya untuk mendapatkan kuota pada...,https://www.imigrasi.go.id/faq/m-paspor/bagaim...,Kuota yang terdapat pada aplikasi M-Paspor mer...,faq


In [None]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1582 entries, 0 to 1581
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   topic        1582 non-null   object
 1   category     1582 non-null   object
 2   question     1582 non-null   object
 3   answer_link  1582 non-null   object
 4   answer       1582 non-null   object
 5   type         1582 non-null   object
dtypes: object(6)
memory usage: 74.3+ KB
Data cleaned and saved to imigrasi_combined_topics.csv


In [None]:
df_combined['full_context'] = 'Question: ' + df_combined['question'] + '; Answer: ' + df_combined['answer'] + '; Reference: ' + df_combined['answer_link']
df_combined = df_combined.dropna()

df_combined.to_csv('../data/imigrasi_combined_topics_full_context.csv', index=False)