In [None]:
# !pip install -qq bs4 selenium webdriver-manager

In [1]:
import os
import json
import requests

from typing import Optional
from time import sleep

from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

In [65]:
path = "../urls.txt"
with open(path, 'r') as file:
    urls = file.read().split('\n')

In [45]:
class VisaSubclass:

    def __init__(self, url: str, driver: webdriver.Chrome = None, is_child=False) -> None:
        self.url = url
        self.driver = driver if driver else self.get_driver()
        self.is_child = is_child
        self.is_parent = False
        self.visa_type = self.get_visa_type()
        self.visa_name = self.get_visa_name()
        self.raw_html = self.get_raw_html()
        self.start_process()
        

    def parse_and_save(self):
        if self.is_parent == False:
            self.doc = self.parse()
            self.save()


    def get_visa_type(self) -> str:
        return (
            self.url.split('visa-listing/')[-1]
            .replace('/', '_')
        )


    def get_visa_name(self) -> str:
        return (
            self.visa_type.replace('_', ': ')
            .replace('-', ' ')
            .title()
        )

    def get_raw_html(self):
        self.driver.get(self.url)
        sleep(0.5)
        soup = BeautifulSoup(self.driver.page_source, "lxml")
        return soup


    def find_all_data(self):
        args = {
            "name": "input",
            "attrs": {
                "name": "ctl00$PlaceHolderMain$PageSchemaHiddenField$Input"
            }
        }
        fulldata = self.raw_html.findAll('div', {'class': 'hidden'})
        for tag in fulldata:
            if (raw_json := tag.find(**args)):
                return json.loads(raw_json.get('value'))
            else:
                continue


    def parse(self):
        return f"""# {self.visa_name}\n
## 1. Overview\n{self._parse_overview()}\n\n
## 2. About this visa\n{self._parse_about()}\n\n
## 3. Elegibility\n{self._parse_eligibility()}\n\n
## 4. Step by Step\n{self._parse_stepbystep()}\n\n
## 5. When you have this visa\n{self._parse_when()}""".replace('\xa0', ' ').replace('\t', '')


    @staticmethod
    def get_driver():
        driver = webdriver.Chrome(ChromeDriverManager().install())
        return driver


    @staticmethod
    def recursive_concat(list_of_dicts, ignore_keys: Optional[list]=None):
        if ignore_keys is None:
            ignore_keys = []
        doc = ""
        for criteria in list_of_dicts:
            criteria_doc = ""
            for key, value in criteria.items():
                if key in ignore_keys:
                    continue
                else:
                    if isinstance(value, str):
                        text = BeautifulSoup(value).text
                    elif isinstance(value, list):
                        text = VisaSubclass.recursive_concat(value)
                    else:
                        continue
                    criteria_doc += text + '\n'
            doc += criteria_doc + '\n\n'
        return doc


    @staticmethod
    def parse_bullets(about_dict):
        def bullet_parser(bullet_dict):
            title = bullet_dict['text']
            content = BeautifulSoup(bullet_dict['block']).text
            content = content.replace('\n', '\n- ').rstrip('- ').rstrip('\n')
            return f"{title}\n{content}"

        about_doc = ""
        for bullet in about_dict['content']:
            about_doc += bullet_parser(bullet) + '\n\n'

        return about_doc


    def _parse_eligibility(self):
        elegibility = (
                BeautifulSoup(
                self.data['applicant']['eligibility'].get('callToAction', {'body': ''})['body']
            ).text
            + 
            self.recursive_concat(
                self.data['applicant']['eligibility']['criteria']
            )
        )
        return elegibility


    def _parse_overview(self):
        overview = self.data['applicant']['overview']

        withvisa = BeautifulSoup(
            overview['withThisVisaHeading'] + '\n\n' + overview['withThisVisa']
        ).text

        notes = BeautifulSoup(
            overview['notesHeading'] + '\n\n' + overview['notes']
        ).text

        stay = BeautifulSoup(overview['visaStay']).text

        return f"""{withvisa}
        {notes}
        {stay}"""


    def _parse_about(self):
        about = self.data['applicant']['aboutVisa']
        return self.parse_bullets(about)


    def _parse_stepbystep(self):  
        return self.recursive_concat(
            self.data['applicant']['stepGuide']['steps'],
            ignore_keys=[
                'description', 
                'customButtonLink', 
                'customButtonText',
                'id',
                'collapsed',
                'hasCustomButton'
            ]
        )


    def _parse_when(self):
        return self.parse_bullets(
            self.data['applicant']['haveThisVisa']
        )


    def _check_if_parent(self):
        base_xpath = '//*[@id="tab-pane-1"]/ha-streams-root/form/' \
            'div[3]/div/ha-card-picker/div/div/div[{i}]/'\
            'ha-visa-card/div/div[1]/div/div[1]/div/h2/a'

        self.child_links = {}
        for i in range(1, 4):
            try:
                element = self.driver.find_element_by_xpath(
                    base_xpath.format(i=i)
                )
                self.child_links[i] = element.get_attribute('href')
            except NoSuchElementException:
                continue

        if len(self.child_links) == 0:
            return False
        else:
            return True



    def start_process(self):
        if self.is_child:
            self.is_parent = False
            self.data = self.find_all_data()
            return
        else:
            self.is_parent = self._check_if_parent()
            if self.is_parent:
                self.children = []
                for i, link in self.child_links.items():
                    child = VisaSubclass(
                        link,driver=self.driver, is_child=True
                    )
                    self.children.append(child)
            else:
                self.data = self.find_all_data()
            
            return


    def save(self, path="../data"):
        with open(f"{path}/{self.visa_type}.md", "w") as f:
            f.write(self.doc)

In [25]:
driver = VisaSubclass.get_driver()

In [47]:
errors = {}
for url in tqdm(urls):
    try:
        visa = VisaSubclass(url, driver=driver)
        visa.parse_and_save()
    except Exception as e:
        errors[url] = {
            "error": str(e),
            "traceback": str(traceback.format_exc()),
            "visa_dict": str(visa.__dict__)
        }
        continue

  text = BeautifulSoup(value).text
100%|██████████| 76/76 [03:18<00:00,  2.61s/it]


In [None]:
driver.close()

In [58]:
with open('errors.json', 'w') as f:
    json.dump(errors, f)