In [3]:
url_list = [
# 'http://www.kimtex.com',
# 'http://swamyweb.com',

#'http://www.kurashiki.com.br',
# 'http://www.unitexbd.com',

# 'http://www.beddinghouse.com',
# 'http://www.nineandco.com',
'http://www.carcemal.pt',
#'http://www.silsa.pt',
]

In [13]:
import vertexai
from google.cloud import aiplatform
from langchain.llms.vertexai import VertexAI

PROJECT_ID = "grainscanner"  # @param {type:"string"}
REGION = "asia-northeast3"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

# Text model instance integrated with langChain
llm = VertexAI(
    model_name="gemini-pro",#"text-bison",
    max_output_tokens=1024,
    temperature=0.4,
    top_p=0.8,
    top_k=40,
    verbose=True,
)


In [17]:
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langchain.output_parsers.json import SimpleJsonOutputParser
import json

RETRY_COUNT = 3

def get_string_item(dict_item, key):
    if isinstance(dict_item, dict):
        value = dict_item.get(key, '')
        if not value:
            return ""
        
        while isinstance(value, list) and value:
            value = value[0]
        return value
    
    return ""

def get_list_item(dict_item, key):
    if isinstance(dict_item, dict):
        value = dict_item.get(key, [])
        return value
    elif isinstance(dict_item, list):
        return dict_item
    
    return []


def get_url_list_with_llm(llm, a_tag_list):
    json_parser = SimpleJsonOutputParser()

    filter_template = """
        당신은 유능하고 경험 많은 웹 제작자입니다.
        다음 global 회사 홈페이지에 태그된  a태그 리스트를 보고 
        회사에 contact 할 수 있는 정보와 취급 product를 설명하는 a태그만 골라서 href attribute를 추출해주세요.

        Format instructions:
        ["href attribute", "href attribute"]

        -----------------------
        content :
        {a_tag_list}
    """

    filter_prompt_template = PromptTemplate(
        input_variables=["a_tag_list"], 
        template=filter_template, 
        output_parser=json_parser,
        partial_variables={
            "format_instructions": json_parser.get_format_instructions()
        }
    )

    chain = filter_prompt_template|llm|json_parser

    retry_count = 0
    last_error = None
    while retry_count < RETRY_COUNT:
        try:
            result_additional_filter_json = chain.invoke({"a_tag_list": a_tag_list})
            break
        except Exception as e:
            print('** error - ', e)
            last_error = str(e)
        retry_count += 1

    if result_additional_filter_json:
        print(result_additional_filter_json)
        return result_additional_filter_json
    else:
        return []


def get_firm_info_with_llm(llm, page_text):
    json_parser = SimpleJsonOutputParser()
    # {{{{"email": "test@email.com", "phone": "010-1234-5678", "address": "korea", "items": ["rice", "meal"]}}}}
    html_template = """
        당신은 유능한 global html parsor 입니다. 
        주어진 html 은 global 회사의 웹페이지입니다. 
        회사에 contact 할 수 있는 email, phone, fex, address 을 추출해주세요. 
        더불어 취급 물품이 있다면 해당 정보도 추출해주세요.
        
        결과는 json 으로 구성해서 전달해주세요.

        ------------
        content:
        {page_text}
    """

    html_prompt_template = PromptTemplate(
        input_variables=["page_text"], 
        template=html_template, 
        output_parser=json_parser,
        partial_variables={
            "format_instructions": json_parser.get_format_instructions()
        }
    )

    html_chain = html_prompt_template | llm | json_parser

    retry_count = 0
    result_html_json = None
    last_error = None
    while retry_count < RETRY_COUNT:
        try :
            result_html_json = html_chain.invoke({"page_text": page_text})
            break
        except Exception as e:
            print('** error - ', e)
            last_error = str(e)
        retry_count += 1

    if result_html_json:
        print(result_html_json)
        return result_html_json
    else:
        return {'error': last_error}

In [32]:
import requests
from bs4 import BeautifulSoup
from google.cloud import aiplatform
import time

from requests.adapters import HTTPAdapter, Retry

requests.adapters.DEFAULT_POOLSIZE = 100
retries = Retry(total=20)
# 세션을 생성하고 ConnectionPool 크기를 설정
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retries))

from urllib.parse import urlparse

get_domain = lambda url: urlparse(url).netloc.replace('www.', '')

def change_protocol(url):
    # URL 파싱
    parsed_url = urlparse(url)
    # 프로토콜 추출
    protocol = parsed_url.scheme
    if protocol == 'http':
        return parsed_url._replace(scheme='https').geturl()
    elif protocol == 'https':
        return parsed_url._replace(scheme='http').geturl()

    return url

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def extract_content(url, is_recursive=False):
    # 페이지 콘텐츠를 가져옵니다.
    time.sleep(0.5)
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return response.status_code, soup
    elif response.status_code == 403:
        if not is_recursive:
            return extract_content(change_protocol(url), is_recursive=True)
    return response.status_code, None


def run(main_url):
    status, soup = extract_content(main_url)

    result = []

    if status == 200:
        a_list = soup.find_all('a', href=True)
        
        main_domain = get_domain(main_url)
        distinct_a_list = {}
        for link in a_list:
            print(link.get('href'), ' - ', get_domain(link.get('href')))
            if get_domain(link.get('href')).startswith(main_domain):
                distinct_a_list[link.get('href').strip('/')] = link
        print(distinct_a_list)
        page_list = get_url_list_with_llm(llm, distinct_a_list)
        firm_info = get_firm_info_with_llm(llm, soup.get_text())
        if firm_info:
            firm_info['homepage'] = main_url
            firm_info['extract_url'] = main_url

            result.append(firm_info)
        print(f' ** main({main_url}):', firm_info)

        for sub_url in page_list:
            sub_status, sub_soup = extract_content(sub_url)
            if sub_status == 200:
                sub_firm_info = get_firm_info_with_llm(llm, sub_soup.get_text())
                if sub_firm_info:
                    sub_firm_info['homepage'] = main_url
                    sub_firm_info['extract_url'] = sub_url
                    result.append(sub_firm_info)
                print(f' ** sub_link({sub_url}):', sub_firm_info)

    return result


In [33]:
result = []
for url in url_list:
    sub_result = run(url)
    if sub_result:
        result.extend(sub_result)

https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/1-empresa/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/4-produtos/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/8-galeria/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/2-contactos/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/en/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/1-empresa/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/4-produtos/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/8-galeria/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/2-contactos/  -  carcemal.pt:443
https://www.carcemal.pt:443/pt/  -  carcemal.pt:443
https://www.carcemal.pt:443/en/  -  carcemal.pt:443
/pt/1-Empresa/  -  
https://www.carcemal.pt:443/pt/8-galeria/  -  carcemal.pt:443


In [57]:
result

[]

In [32]:
import pandas as pd

# 제공된 JSON 데이터
json_data = result

# DataFrame으로 변환
df = pd.json_normalize(json_data)

# DataFrame 출력
df.to_csv('./output/beautifulsoup_sample.csv')
sample_df = df.copy()


In [50]:

target_column_list = ['homepage', 'extract_url', 'company_name', 'email', 'phone', 'address', 'fax', 'products']

In [55]:
def merge_columns(df, total_columns, target_columns):
    merged_df = pd.DataFrame()

    for target_col in target_columns:
        related_cols = [col for col in total_columns if target_col in col]

        def merge_row(row):
            merged_data = []
            for item in row.dropna():
                if isinstance(item, dict):
                    for key, value in item.items():
                        merged_data.append({key: value})
                else:
                    merged_data.append(item)
            return merged_data

        merged_df[target_col] = df[related_cols].apply(merge_row, axis=1)

    return merged_df

# DataFrame에 merge_columns 함수를 적용
merged_df = merge_columns(sample_df, list(sample_df.columns), target_column_list)

# 결과 DataFrame의 첫 5행을 출력
merged_df.head(1)



Unnamed: 0,homepage,extract_url,company_name,email,phone,address,fax,products
0,[http://www.kimtex.com],[http://www.kimtex.com],[],[],[],[],[],[[Moda Kumaşlar]]


In [56]:

# 각 컬럼에 대해 리스트를 문자열로 변환하고, 홑따옴표 및 대괄호를 제거합니다.
for col in merged_df.columns:
    # 컬럼이 문자열이 아닌 경우 문자열로 변환
    if merged_df[col].dtype != 'object':
        merged_df[col] = merged_df[col].astype(str)

    # 리스트의 리스트 혹은 단일 리스트를 문자열로 변환
    merged_df[col] = merged_df[col].apply(lambda x: ',\n'.join([',\n'.join(map(str, sublist)) if isinstance(sublist, list) else str(sublist) for sublist in x]) if isinstance(x, list) else x)

    # 홑따옴표와 대괄호 제거
    merged_df[col] = merged_df[col].str.replace(r"\[|\]|\'", "", regex=True)

# 변경된 DataFrame의 첫 5행을 출력
merged_df.head()


Unnamed: 0,homepage,extract_url,company_name,email,phone,address,fax,products
0,http://www.kimtex.com,http://www.kimtex.com,,,,,,Moda Kumaşlar
1,http://www.kimtex.com,https://kimtex.com/iletisim,,info@kimtex.com,"+90 322 394 30 49,\n+90 212 296 70 98,\n+1 212...",Acıdere Osb Mahallesi Turgut Özal Blv. No:3/5 ...,,
2,http://swamyweb.com,http://swamyweb.com,Swamy Cotton Mill,info@swamycottonmill.co.in,(91) 421 2344042,"S.F.No.407/2, Peerchangadu, Tirupur-641663.",,"Cotton,\nViscose,\nLinen,\nWoven Fabrics"
3,http://swamyweb.com,https://swamyweb.com/contact-us,Swamy Cotton,info@swamycottonmill.co.in,+ 91 421 2344042,"S.F.No.407/2, Peerchangadu, Mangalam, Tirupur ...",+ 91 421 2345750,
4,http://swamyweb.com,http://swamyweb.com/demo/contact-us,,info@swamycottonmill.co.in,"+ 91 421 2344042,\n+ 91 421 2345750","S.F.No.407/2, Peerchangadu, Mangalam, Tirupur ...",,finest quality fabrics


In [57]:
merged_df.to_csv('./output/BeautifulSoup_sample_merged_string.csv')

200

In [40]:
from selenium import webdriver

# 웹드라이버 설정 (예: Chrome의 경우)
driver = webdriver.Chrome()

In [51]:
driver.get('http://www.aaaa.pt')
# 페이지의 HTML 가져오기
html_content = driver.page_source
soup = BeautifulSoup(html_content, 'html.parser')

In [53]:
driver.page_source

'<html lang="en"><head>\n  <meta charset="utf-8">\n  <meta http-equiv="x-ua-compatible" content="ie=edge">\n  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\n  <title>403 Forbidden</title>\n  <link rel="stylesheet" href="/error_docs/styles.css">\n</head>\n<body>\n<div class="page">\n  <div class="main">\n    <h1>Server Error</h1>\n    <div class="error-code">403</div>\n    <h2>Forbidden</h2>\n    <p class="lead">You do not have permission to access this document.</p>\n    <hr>\n    <p>That\'s what you can do</p>\n    <div class="help-actions">\n      <a href="javascript:location.reload();">Reload Page</a>\n      <a href="javascript:history.back();">Back to Previous Page</a>\n      <a href="/">Home Page</a>\n    </div>\n  </div>\n</div>\n\n</body></html>'