# Install requirements for crawl4ai

In [None]:
# One-time environment setup, run as shell commands from the notebook.
# NOTE(review): descriptions below follow the crawl4ai / Playwright CLI docs — confirm
# against the installed versions.
# Post-installation setup for crawl4ai.
!crawl4ai-setup
# Diagnostic check of the crawl4ai installation.
!crawl4ai-doctor
# Install the browser binaries Playwright drives.
!playwright install

# Load Requirements

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# Presumably needed because the notebook kernel already runs a loop while the
# crawler cells below await coroutines — TODO confirm it is still required.
nest_asyncio.apply()

In [None]:
from crawl4ai import AsyncWebCrawler, LXMLWebScrapingStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

# Scraping

In [None]:
url = 'https://www.ipthailand.go.th/th/trademark-002.html'

In [None]:
async def crawler_function(url):
    """Crawl ``url`` with ``AsyncWebCrawler`` and return the crawl result.

    Args:
        url: Address of the page to crawl.

    Returns:
        The object returned by ``AsyncWebCrawler.arun``; the rest of this
        notebook reads its ``markdown``, ``html``, ``links`` and ``url``
        attributes.
    """
    crawl_options = {
        # 'word_count_threshold': 10,     # minimum words per content block (disabled)
        'exclude_external_links': True,   # remove external links
        'remove_overlay_elements': True,  # remove popups/modals
        'process_iframes': True,          # process iframe content
    }
    run_config = CrawlerRunConfig(**crawl_options)
    browser_config = BrowserConfig(verbose=False)  # verbose controls logging

    async with AsyncWebCrawler(config=browser_config) as crawler:
        return await crawler.arun(url=url, config=run_config)

In [None]:
result = await crawler_function(url)

In [None]:
print(result.markdown)

In [None]:
url = 'https://www.ipthailand.go.th/th/trademark-002.html'

In [None]:
result = await crawler_function(url)

In [None]:
print(result.html)

In [None]:
print(result.markdown)

In [None]:
from IPython.display import Markdown, display

In [None]:
display(Markdown(result.markdown[:5000])) 

## Crawl into another URL

In [None]:
soup = BeautifulSoup(result.html, 'html.parser')
print(soup.prettify())

In [None]:
divs = soup.find_all('div', class_='uk-width-medium-1-2')

In [None]:
# Print the absolute URL of the page linked from each matched <div>.
site_root = 'https://www.ipthailand.go.th/'
for block in divs:
    link = block.find('a', class_='zx')
    print(site_root + link['href'])

In [None]:
divs[0].find('a', class_='zx')['href']

In [None]:
'https://www.ipthailand.go.th/'+divs[0].find('a', class_='zx')['href']

In [None]:
async def content_crawler(url):
    """Crawl ``url`` using the LXML scraping strategy and return the result.

    Args:
        url: Address of the page to crawl.

    Returns:
        The object returned by ``AsyncWebCrawler.arun`` for ``url``.
    """
    lxml_config = CrawlerRunConfig(
        verbose=False,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )
    async with AsyncWebCrawler() as crawler:
        page = await crawler.arun(url, config=lxml_config)
    return page

In [None]:
result = await content_crawler('https://www.ipthailand.go.th/'+divs[4].find('a', class_='zx')['href'])

In [None]:
print(result.markdown)
# display(Markdown(result.markdown[:50000])) 

In [None]:
datas = []
result = await crawler_function('https://www.ipthailand.go.th/th/trademark-002.html')
divs = soup.find_all('div', class_='uk-width-medium-1-2')
for d in divs:
    datas.append(await content_crawler('https://www.ipthailand.go.th/'+d.find('a', class_='zx')['href']))

# Transform into DataFrame

In [None]:
import pandas as pd

In [None]:
# One row per crawled page: its markdown rendering and its raw HTML.
markdown_column = [page.markdown for page in datas]
html_column = [page.html for page in datas]
df = pd.DataFrame({'markdown': markdown_column, 'content': html_column})

In [None]:
df

In [None]:
df['markdown'][0]

In [None]:
display(Markdown(df['markdown'][1][:25000]))

# Scraping PDF Files

In [None]:
result = await crawler_function('https://tpso.go.th/articles')

In [None]:
# display(Markdown(result.markdown))

In [None]:
soup = BeautifulSoup(result.html, 'html.parser')
print(soup.prettify())

In [None]:
divs = soup.find_all('a', class_='underline-offset-2 hover:underline')

In [None]:
divs

In [None]:
divs[0]['href']

In [None]:
import os

In [None]:
downloads_path = './pdf_downloads'
os.makedirs(downloads_path, exist_ok=True)

## More Scraping

In [None]:
datas = []
for d in divs:
    datas.append(await crawler_function('https://tpso.go.th'+d['href']))
    # print('https://tpso.go.th'+d['href'])

In [None]:
divs

## Download PDF

In [None]:
# Download the uploaded file(s) linked from each crawled article page.
for row in datas:
    # The target name depends only on the page URL, so if a page links to
    # several uploads the last successful download is the one that remains.
    pdf_path = downloads_path + '/' + row.url.split('/')[-1] + '.pdf'
    for uri in row.links['internal']:
        if 'uploads' not in uri['href']:
            continue
        try:
            # A timeout keeps one stalled server from hanging the whole cell.
            res = requests.get(uri['href'], timeout=30)
        except requests.RequestException as err:
            print(f"Skipping {uri['href']}: {err}")
            continue
        if not res.ok:
            # Do not save an HTTP error page under a .pdf name.
            print(f"Skipping {uri['href']}: HTTP {res.status_code}")
            continue
        # The context manager closes the file even if the write fails.
        with open(pdf_path, 'wb') as pdf:
            pdf.write(res.content)

# Transform into DataFrame

In [None]:
# One row per crawled article: markdown, raw HTML, and the downloaded PDF name.
df2 = pd.DataFrame({
    'markdown': [row.markdown for row in datas],
    'content': [row.html for row in datas],
    # Exactly one entry per page, so the column always has the same length as
    # the others. Pages with no "uploads" link get None; pages with several
    # still map to the single file name derived from the page URL.
    'pdf_file': [
        row.url.split('/')[-1] + '.pdf'
        if any('uploads' in uri['href'] for uri in row.links['internal'])
        else None
        for row in datas
    ],
})

In [None]:
df2

In [None]:
# Create the JSON output directory under its own name, so `downloads_path`
# keeps pointing at the PDF folder if the download cell is run again.
json_path = './json_file'
os.makedirs(json_path, exist_ok=True)

# Save to JSON Lines

In [None]:
# Write one JSON object per line. force_ascii=False stores non-ASCII text
# (the crawled pages are presumably Thai) as UTF-8 rather than \uXXXX escapes,
# which keeps the files readable and smaller; parsed values are unchanged.
df.to_json('./json_file/ipthailand.jsonl', orient='records', lines=True, force_ascii=False)
df2.to_json('./json_file/tpso.jsonl', orient='records', lines=True, force_ascii=False)