In [17]:
file_path = "/Users/yoonhae/Downloads/crawling_processed_(데이터팀 검토) 그레인스캐너 패커 크롤링 정리 - Gulfood_정리포맷_20240123_134035.csv"
import pandas as pd

df = pd.read_csv(file_path)

In [2]:
import vertexai
from google.cloud import aiplatform
from langchain.llms.vertexai import VertexAI

PROJECT_ID = "grainscanner"  # @param {type:"string"}
REGION = "asia-northeast3"  # @param {type:"string"}

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

# Text model instance integrated with langChain
llm = VertexAI(
    model_name="text-bison", #"gemini-pro",#"text-bison",
    max_output_tokens=1024,
    temperature=0.4,
    top_p=0.8,
    top_k=40,
    verbose=True,
    location=REGION
)

target_column_list = ['homepage', 'extract_url', 'company_name', 'mail', 'phone', 'address', 'fax', 'products', 'error']

In [14]:
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import time

RETRY_COUNT = 3

def get_string_item(dict_item, key):
    if isinstance(dict_item, dict):
        value = dict_item.get(key, '')
        if not value:
            return ""
        
        while isinstance(value, list) and value:
            value = value[0]
        return value
    
    return ""

def get_list_item(dict_item, key):
    if isinstance(dict_item, dict):
        value = dict_item.get(key, [])
        return value
    elif isinstance(dict_item, list):
        return dict_item
    
    return []


def get_url_list_with_llm(llm, a_tag_list):
    json_parser = SimpleJsonOutputParser()

    filter_template = """
        당신은 유능하고 경험 많은 웹 제작자입니다.
        다음 global 회사 홈페이지에 태그된  a태그 리스트를 보고 
        회사에 contact 할 수 있는 정보를 담은 a태그와 
        취급 product를 설명하는 a태그만 골라서 href attribute를 추출해주세요.
        판단은 엄격하게 진행해주세요.

        Format instructions:
        ["href attribute", "href attribute"]

        -----------------------
        content :
        {a_tag_list}
    """

    filter_prompt_template = PromptTemplate(
        input_variables=["a_tag_list"], 
        template=filter_template, 
        output_parser=json_parser,
        partial_variables={
            "format_instructions": json_parser.get_format_instructions()
        }
    )

    chain = filter_prompt_template|llm|json_parser

    retry_count = 0
    last_error = None
    result_additional_filter_json = None
    while retry_count < RETRY_COUNT:
        try:
            result_additional_filter_json = chain.invoke({"a_tag_list": a_tag_list})
            break
        except Exception as e:
            print(f'    ** retry[{retry_count+1}] - ', e)
            last_error = str(e)
            time.sleep(1)
        retry_count += 1

    if result_additional_filter_json:
        print('   ', result_additional_filter_json)
        return result_additional_filter_json
    else:
        return {'error': last_error}

# 더불어 취급 products가 있다면 핵심 물품만 keyword list로 추출해주세요.
def get_firm_info_with_llm(llm, page_text):
    json_parser = SimpleJsonOutputParser()
    html_template = """
        당신은 유능한 global html parsor 입니다. 
        주어진 html 은 global 회사의 웹페이지입니다. 
        회사에 contact 할 수 있는 email, phone, fax, address 을 추출해주세요. 
        
        Format instructions:
        {format_instructions}

        ------------
        content:
        {page_text}
    """

    html_prompt_template = PromptTemplate(
        input_variables=["page_text"], 
        template=html_template, 
        output_parser=json_parser,
        partial_variables={
            "format_instructions": json_parser.get_format_instructions()# + "\n" + "json key are (email, phone, address, items)"
        }
    )

    html_chain = html_prompt_template | llm | json_parser
    retry_count = 0
    result_html_json = None
    last_error = None
    while retry_count < RETRY_COUNT:
        try :
            result_html_json = html_chain.invoke({"page_text": page_text})
            break
        except Exception as e:
            print(f'    ** retry[{retry_count+1}] - ', e)
            last_error = str(e)
            time.sleep(1)
        retry_count += 1

    if result_html_json:
        print('   ', '\n    '.join(json.dumps(result_html_json, indent=4).split('\n')))
        return result_html_json
    else:
        return {'error': last_error}

In [18]:
import requests
from bs4 import BeautifulSoup
from google.cloud import aiplatform
import time

from requests.adapters import HTTPAdapter, Retry

requests.adapters.DEFAULT_POOLSIZE = 100
retries = Retry(total=20)
# 세션을 생성하고 ConnectionPool 크기를 설정
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retries))

from urllib.parse import urlparse

get_domain = lambda url: urlparse(url).netloc.replace('www.', '')

def change_protocol(url):
    # URL 파싱
    parsed_url = urlparse(url)
    # 프로토콜 추출
    protocol = parsed_url.scheme
    if protocol == 'http':
        return parsed_url._replace(scheme='https').geturl()
    elif protocol == 'https':
        return parsed_url._replace(scheme='http').geturl()

    return url

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def extract_content(url, is_recursive=False):
    # 페이지 콘텐츠를 가져옵니다.

    try :
        time.sleep(0.5)
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            return response.status_code, soup
        elif response.status_code == 403:
            if not is_recursive:
                print(f'    ** page 조회 - retry')
                return extract_content(change_protocol(url), is_recursive=True)
            
        return response.status_code, None
    except Exception as e:
        return 500, str(e)


import pandas as pd

def to_dataframe(json_data):
    normalized_df = pd.json_normalize(json_data)
    total_column_list = list(normalized_df.columns)

    merged_df = pd.DataFrame(columns=target_column_list)

    for target_col in target_column_list:
        related_cols = [col for col in total_column_list if target_col in col]

        def merge_row(row):
            merged_data = []
            for item in row.dropna():
                if isinstance(item, dict):
                    merged_data.append(json.dumps(item, ensure_ascii=False))
                elif isinstance(item, list):
                    merged_data.extend([str(x) for x in item])
                elif not pd.isnull(item):
                    merged_data.append(item)
            if merged_data:
                return ';\n'.join(merged_data)
            else:
                return ''

        if related_cols:
            merged_df[target_col] = normalized_df[related_cols].apply(merge_row, axis=1)

    return merged_df


def is_string_not_none_nan(value):
    if pd.notna(value) and isinstance(value, str) and value:
        return True
    return False


def run(main_url):
    print(f'** [main page] 조회 - {main_url}')
    status, soup = extract_content(main_url)

    exists_email = False
    exists_phone = False

    result_df = pd.DataFrame(columns=target_column_list)
    if status == 200:
        print(f'  ** [llm] contact 정보 & products 추출')
        firm_info = {'homepage': main_url,
                     'extract_url': main_url}
        try :
            info = get_firm_info_with_llm(llm, soup.get_text().replace('\n\n\n', ''))
            if info:
                firm_info.update(info)
                sub_df = to_dataframe(firm_info)
                result_df = result_df.append(sub_df, ignore_index=True)
                exists_email = sub_df['mail'].apply(is_string_not_none_nan).any()
                exists_phone = sub_df['phone'].apply(is_string_not_none_nan).any()

                if exists_email and exists_phone:
                    return result_df
        except Exception as e:
            firm_info['error'] = str(e)

        print(f'  ** a tag 추출')
        a_list = soup.find_all('a', href=True)
        
        main_domain = get_domain(main_url)
        distinct_a_list = {}
        for link in a_list:
            if get_domain(link.get('href')).startswith(main_domain):
                distinct_a_list[link.get('href').strip('/')] = link
        
        if distinct_a_list:
            print(f'  ** [llm] 회사 정보와 관련있는 a tag 선별')
            page_list = get_url_list_with_llm(llm, [distinct_a_list.values()])
        
            for sub_url in page_list:
                print(f'  ** [sub page] 조회 - {sub_url}')
                sub_status, sub_soup = extract_content(sub_url)
                if sub_status == 200:
                    print(f'  ** [llm] contact 정보 & products 추출')
                    sub_firm_info = {'homepage': main_url,
                                    'extract_url': sub_url}
                    try :
                        info = get_firm_info_with_llm(llm, sub_soup.get_text().replace('\n\n\n', ''))
                        if info:
                            sub_firm_info.update(info)
                    except Exception as e:
                        sub_firm_info['error'] = str(e)
                    sub_df = to_dataframe(sub_firm_info)
                    result_df = result_df.append(sub_df, ignore_index=True)

                    exists_email = exists_email or sub_df['mail'].apply(is_string_not_none_nan).any()
                    exists_phone = exists_phone or sub_df['phone'].apply(is_string_not_none_nan).any()

                    if exists_email and exists_phone:
                        return result_df
                else:
                    print(f'    [{sub_status}] : 조회 실패')
        else:
            print(f'    [empty]')
    else:
        print(f'    [{status}] : 조회 실패')

    return result_df


In [23]:
# result = []
# for url in df['홈페이지 주소']:
#     if not pd.isnull(url) and not pd.isna(url):
#         sub_result = run(url)
#         if sub_result:
#             result.extend(sub_result)

result_total_df = pd.DataFrame(columns=target_column_list)
count = 0
for url in df['홈페이지 주소']:  # 45
    if not pd.isnull(url) and not pd.isna(url):        
        is_exists = (mail_df['homepage'] == url).any()
        if not is_exists:
            sub_result = run(url)
            if len(sub_result):
                result_total_df = result_total_df.append(sub_result, ignore_index=True)



** [main page] 조회 - https://deoleo.com/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "0034 91 558 95 05",
            "fax": null,
            "address": "C/ Marie Curie 7\n4\u00ba plta.,\n28521 Rivas Vaciamadrid,\nMadrid (Espa\u00f1a)."
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://deoleo.com/contacto/', 'https://deoleo.com/productos/']
  ** [sub page] 조회 - https://deoleo.com/contacto/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "0034 91 558 95 05",
            "fax": null,
            "address": "C/ Marie Curie 7\n4\u00ba plta.,\n28521 Rivas Vaciamadrid,\nMadrid (Espa\u00f1a)."
        }
    }
  ** [sub page] 조회 - https://deoleo.com/productos/


  result_df = result_df.append(sub_df, ignore_index=True)


    [404] : 조회 실패
** [main page] 조회 - http://wingsagro.com


  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://www.rkgghee.io/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+91 944 264 0228",
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://www.ak.goldenesia.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://kayalfoods.in/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": "info@kayalfoods.in",
            "phone": [
                "+91 93837 88000",
                "+91 93847 88000"
            ],
            "fax": null,
            "address": [
                "KRK Building, 14-A, Pudur Vandi Pathai, PTR Nagar, Jawaharpuram, K.Pudur,Madurai - 625007. Tamil Nadu. India.",
                "110/1, Pallapanaikenpatti, Kovilpatti Post, Natham TK, Dindigul (Dt)-624401."
            ]
        }
    }
** [main page] 조회 - https://www.oliocostadoro.com/en/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - http://www.agro19bhd.com
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": [
                "info@agro19bhd.com",
                "Sales@agro19bhd.com"
            ],
            "phone": [
                "+603 6274 7026",
                "+6012 3781 540",
                "+6010 8906 806",
                "+6011 2346 5165"
            ],
            "fax": [],
            "address": [
                "No 3, Jalan Helang Hindik, Kepong Baru Industrial Estate,  52100 Kuala Lumpur, Malaysia.",
                "No 25, Lorong Sungai Puloh 7/KU6, Kawasan Perindustrian Sungai Puloh,  42100 Klang, Malaysia."
            ]
        }
    }
** [main page] 조회 - https://www.wellpowerenergy.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://www.monini.com/it/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.monini.com/content/wp-content/uploads/2023/07/BilancioMonini2022_DEF-lo.pdf', 'https://www.monini.com/content/wp-content/uploads/2024/01/BrochurePrecisolivo.pdf']
  ** [sub page] 조회 - https://www.monini.com/content/wp-content/uploads/2023/07/BilancioMonini2022_DEF-lo.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


  ** [llm] contact 정보 & products 추출
    ** retry[1] -  400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit.
    ** retry[2] -  400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit.
    ** retry[3] -  400 The request cannot be processed. The most likely reason is that the provided input exceeded the model's input token limit.


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [sub page] 조회 - https://www.monini.com/content/wp-content/uploads/2024/01/BrochurePrecisolivo.pdf
    [404] : 조회 실패
** [main page] 조회 - https://forazeytin.com.tr/


  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": [
                "+90 266 432 51 20",
                "+90 212 216 41 00"
            ],
            "fax": [
                "+90 266 432 27 37",
                "+90 212 216 41 61"
            ],
            "address": [
                "Mescit Mahallesi, Edremit-Bal\u0131kesir Yolu Caddesi,\nFora Zeytin, No:274\nHavran/Bal\u0131kesir",
                "Barbaros Bulvar\u0131 No:155/B\n34349 Balmumcu\nBe\u015fikta\u015f / \u0130stanbul - T\u00dcRK\u0130YE"
            ]
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://www.naturzgroup.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": "username@example.com",
            "phone": "0123456789",
            "fax": "0123456789",
            "address": "123 Main Street, Anytown, CA 12345"
        }
    }
** [main page] 조회 - https://gltraders.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - http://www.Tesorodelrio.com / http://www.fas.com.tn
    [500] : 조회 실패
** [main page] 조회 - http://www.poyrazolive.com
    [500] : 조회 실패
** [main page] 조회 - https://www.torresyribelles.com/home2
  ** [llm] contact 정보 & products 추출
    {
        "email": "Info@torresyribelles.com",
        "phone": "(+34) 955 679 010",
        "fax": "(+34) 955 679 047",
        "address": "C/Virgen de la Esperanza, N 3 - CP 41703\nDos Hermanas (Sevilla) SPAIN"
    }
** [main page] 조회 - https://www.goldenagri.com.sg


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [520] : 조회 실패
** [main page] 조회 - https://www.marsa.com.tr/en
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://exportpackers.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://exportpackers.com/contact/', 'https://exportpackers.com/international-trading/', 'https://exportpackers.com/retail/', 'https://exportpackers.com/foodservice/', 'https://exportpackers.com/foodservice-2/']
  ** [sub page] 조회 - https://exportpackers.com/contact/
  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+1 (905) 792-9700",
        "fax": "(905) 792-3569",
        "address": "107 Walker Drive, Brampton,Ontario, Canada, L6T 5K5"
    }
  ** [sub page] 조회 - https://exportpackers.com/international-trading/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "905-792-9700",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://exportpackers.com/retail/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "905-792-9700",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://exportpackers.com/foodservice/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "905-792-9700",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://exportpackers.com/foodservice-2/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "905-792-9700",
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - http://www.grand-oils.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://rajdularbrothers.com/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://rajdularbrothers.com/contact/', 'https://rajdularbrothers.com/beans/', 'https://rajdularbrothers.com/beans/cocoa-bean/', 'https://rajdularbrothers.com/beans/coffee-beans/', 'https://rajdularbrothers.com/beans/vanilla-beans/', 'https://rajdularbrothers.com/gums/', 'https://rajdularbrothers.com/gums/gambier/', 'https://rajdularbrothers.com/gums/gum-benzamin/', 'https://rajdularbrothers.com/gums/gum-copal/', 'https://rajdularbrothers.com/gums/gum-damar/', 'https://rajdularbrothers.com/gums/gum-rosin/', 'https://rajdularbrothers.com/nuts/', 'https://rajdularbrothers.com/nuts/betel-nut/', 'https://rajdularbrothers.com/cashew-nut-kernel/', 'https://rajdularbrothers.com/nuts/desiccated-coconut/', 'https://rajdularbrothers.com/nuts/nutmeg/', 'https://rajdularbrothers.com/nuts/raw-cashew-nut/', 'https://rajdularbrothers.com/spices/', 'https://rajdularbrothers.com/spices/black-pepper/', 'https://rajdularbrothers.com/spices/cardamond/', 'https://rajdularbrothers.com/spices/cassia-ve

  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/beans/cocoa-bean/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/beans/coffee-beans/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/beans/vanilla-beans/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/gambier/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/gum-benzamin/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/gum-copal/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/gum-damar/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/gums/gum-rosin/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/nuts/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/nuts/betel-nut/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/cashew-nut-kernel/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/nuts/desiccated-coconut/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/nuts/nutmeg/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/nuts/raw-cashew-nut/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/black-pepper/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/cardamond/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/cassia-vera/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/clove/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/clove-stem/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/cubeb/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/damar-batu/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62 751 483 401",
        "fax": null,
        "address": null
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/dried-sliced-turmeric/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/dried-ginger/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/galangal/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/long-pepper/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/mace/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/patchouli-leaves/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/tamarind/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/velvet-tamarind/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://rajdularbrothers.com/spices/white-pepper/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62 751 483 401",
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://sprayleggero.it/en/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://sprayleggero.it/en/contact-us', 'https://sprayleggero.it/en/the-flavoured-oils/1-108-black-truffle-spray-in-extra-virgin-olive-oil-8006830991114.html#/27-size-250_ml', 'https://sprayleggero.it/en/the-flavoured-oils/13-112-garlic-spray-in-extra-virgin-olive-oil-8006830991312.html#/27-size-250_ml', 'https://sprayleggero.it/en/the-flavoured-oils/14-116-chilli-pepper-spray-in-extra-virgin-olive-oil-800683099121.html#/27-size-250_ml', 'https://sprayleggero.it/en/the-flavoured-oils/15-120-lemon-spray-in-extra-virgin-olive-oil-800683099069.html#/27-size-250_ml', 'https://sprayleggero.it/en/the-extra-virgin-olive-oil/16-124-100-italian-extra-virgin-olive-oil-800683099009.html#/27-size-250_ml', 'https://sprayleggero.it/en/the-nutraceuticals/21-146-avocado-oil-spray-800683099506.html#/26-size-200_ml', 'https://sprayleggero.it/en/the-nutraceuticals/22-147-flaxseed-oil-spray-800683099516.html#/26-size-200_ml', 'https://sprayleggero.it/en/the-nutraceuticals/23-148-grape-seed-oil-spray

  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-flavoured-oils/13-112-garlic-spray-in-extra-virgin-olive-oil-8006830991312.html#/27-size-250_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-flavoured-oils/14-116-chilli-pepper-spray-in-extra-virgin-olive-oil-800683099121.html#/27-size-250_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-flavoured-oils/15-120-lemon-spray-in-extra-virgin-olive-oil-800683099069.html#/27-size-250_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-extra-virgin-olive-oil/16-124-100-italian-extra-virgin-olive-oil-800683099009.html#/27-size-250_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-nutraceuticals/21-146-avocado-oil-spray-800683099506.html#/26-size-200_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-nutraceuticals/22-147-flaxseed-oil-spray-800683099516.html#/26-size-200_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://sprayleggero.it/en/the-nutraceuticals/23-148-grape-seed-oil-spray-800683099526.html#/26-size-200_ml


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://www.belkisyag.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://www.marbil.com.tr


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://agriculture.canada.ca/en
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://agriculture.canada.ca/en/canadas-agriculture-sectors/canadian-food-system/taste-commitment-campaign?utm_source=int_web&amp;utm_medium=web&amp;utm_campaign=TTC&amp;utm_content=2023-02-10_0090']
  ** [sub page] 조회 - https://agriculture.canada.ca/en/canadas-agriculture-sectors/canadian-food-system/taste-commitment-campaign?utm_source=int_web&amp;utm_medium=web&amp;utm_campaign=TTC&amp;utm_content=2023-02-10_0090
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - http://agrozan.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['http://agrozan.com', 'http://www.agrozan.com']
  ** [sub page] 조회 - http://agrozan.com
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - http://www.agrozan.com


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://www.tunasbarulampung.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62-21-5213383",
            "fax": "+62-21-5213332 / 92",
            "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.tunasbarulampung.com/contact/', 'https://www.tunasbarulampung.com/product-review/', 'https://www.tunasbarulampung.com/product-mix/', 'https://www.tunasbarulampung.com/plantation-profile/', 'https://www.tunasbarulampung.com/production-capacities/']
  ** [sub page] 조회 - https://www.tunasbarulampung.com/contact/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62-21-5213383",
            "fax": "+62-21-5213332 / 92",
            "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
        }
    }
  ** [sub page] 조회 - https://www.tunasbarulampung.com/product-review/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62-21-5213383",
            "fax": "+62-21-5213332 / 92",
            "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
        }
    }
  ** [sub page] 조회 - https://www.tunasbarulampung.com/product-mix/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62-21-5213383",
            "fax": "+62-21-5213332 / 92",
            "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
        }
    }
  ** [sub page] 조회 - https://www.tunasbarulampung.com/plantation-profile/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": "+62-21-5213383",
        "fax": "+62-21-5213332 / 92",
        "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
    }
  ** [sub page] 조회 - https://www.tunasbarulampung.com/production-capacities/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "+62-21-5213383",
            "fax": "+62-21-5213332 / 92",
            "address": "Floor 8-9, Wisma Budi H.R. Rasuna Said Rd. Lot C-6\nJakarta, 12940 \u2013 Indonesia"
        }
    }
** [main page] 조회 - http://www.nazligida.com.tr/en


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [404] : 조회 실패
** [main page] 조회 - https://viterra.ca/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.viterra.ca', 'https://viterra.ca/myViterra-registration']
  ** [sub page] 조회 - https://www.viterra.ca
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://viterra.ca/myViterra-registration


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
** [main page] 조회 - https://alicommercialcorp.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": [
            "0092 42 35858171-2",
            "0092 321 7861119",
            "0092 300 8491745"
        ],
        "fax": [
            "0092\u00a042\u00a035861521"
        ],
        "address": [
            "69-A, New Muslim Town, Lahore Pakistan",
            "2-Km off G.T Road, Kala Shah KakuOpp. Ravi Rayon,\u00a0Pakistan"
        ]
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://alicommercialcorp.com/?page_id=74', 'https://alicommercialcorp.com/?page_id=72']
  ** [sub page] 조회 - https://alicommercialcorp.com/?page_id=74
  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": [
            "0092 42 35858171-2",
            "0092 321 7861119",
            "0092 300 8491745"
        ],
        "fax": [
            "0092\u00a042\u00a035861521"
        ],
        "address": [
            "69-A, New Muslim Town, Lahore Pakistan",
            "2-Km off G.T Road, Kala Shah KakuOpp. Ravi Rayon,\u00a0Pakistan"
        ]
    }
  ** [sub page] 조회 - https://alicommercialcorp.com/?page_id=72


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": [
            "0092 42 35858171-2",
            "0092 321 7861119",
            "0092 300 8491745"
        ],
        "fax": [
            "0092\u00a042\u00a035861521"
        ],
        "address": [
            "69-A, New Muslim Town, Lahore Pakistan",
            "2-Km off G.T Road, Kala Shah KakuOpp. Ravi Rayon,\u00a0Pakistan"
        ]
    }
** [main page] 조회 - https://www.instagram.com/isofoodco/?hl=en


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://kcof.trustpass.alibaba.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://acmonterreal.com/en/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [sub page] 조회 - https://acmonterreal.com/en/contact/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "957 17 60 00",
            "fax": null,
            "address": "Av. de Rafael Castro, 14640 Villa del R\u00edo, C\u00f3rdoba"
        }
    }
  ** [sub page] 조회 - https://acmonterreal.com/en/marca/monterreal-en/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://acmonterreal.com/en/marca/ebest-en/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://acmonterreal.com/en/marca/mundial-en/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://acmonterreal.com/en/marca/private-label/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": "ecoba@oleomonterreal.es",
            "phone": "+34 957 17 60 00",
            "fax": null,
            "address": "Avd. Rafael Castro S/N 14640 Villa del R\u00edo, C\u00f3rdoba (Spain)"
        }
    }
** [main page] 조회 - https://www.arkof.id


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.arkof.id/contact-us/', 'https://www.arkof.id/products/arcrema/', 'https://www.arkof.id/products/arfoamer/', 'https://www.arkof.id/products/arkoffie/']
  ** [sub page] 조회 - https://www.arkof.id/contact-us/
  ** [llm] contact 정보 & products 추출
    {
        "email": "info@arkof.id",
        "phone": "+62.24.76450088",
        "fax": null,
        "address": "Jl. Raya Semarang-Demak KM 12, Sayung,\nDemak 59563\nCentral Java, INDONESIA"
    }
** [main page] 조회 - https://abaliyag.com/en


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": "username@example.com",
            "phone": "0123456789",
            "fax": "0123456789",
            "address": "123 Main Street, Anytown, CA 12345"
        }
    }
** [main page] 조회 - https://ybarra.es/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://ybarra.es/atencion-al-consumidor/', 'https://ybarra.es/politica-de-privacidad/']
  ** [sub page] 조회 - https://ybarra.es/atencion-al-consumidor/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": [
                "900 905 342",
                "955 675 060"
            ],
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://ybarra.es/politica-de-privacidad/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": "lopd@ybarra.es",
            "phone": null,
            "fax": null,
            "address": "Avenida Rafael Ybarra, 1, 41703, Dos Hermanas, Sevilla (Espa\u00f1a)"
        }
    }
** [main page] 조회 - http://aegangroup.com https://aegangroup.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://aceitesabril.com/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://aceitesabril.com/contacto/', 'https://aceitesabril.com/base-de-oro/', 'https://aceitesabril.com/colleita-propia/', 'https://aceitesabril.com/cultura-formacion-aceite/', 'https://aceitesabril.com/aceites-del-dia-a-dia/', 'https://aceitesabril.com/aceite-aovex/', 'https://aceitesabril.com/aceite-gourmet/']
  ** [sub page] 조회 - https://aceitesabril.com/contacto/
    [404] : 조회 실패
  ** [sub page] 조회 - https://aceitesabril.com/base-de-oro/
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://aceitesabril.com/colleita-propia/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://aceitesabril.com/cultura-formacion-aceite/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://aceitesabril.com/aceites-del-dia-a-dia/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://aceitesabril.com/aceite-aovex/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://aceitesabril.com/aceite-gourmet/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://arabindia.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패
** [main page] 조회 - https://www.medoil.com.tn/en
    [500] : 조회 실패
** [main page] 조회 - https://www.icex.es
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "(+34) 913 497 100",
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.icex.es/es/quienes-somos/sala-de-prensa/sala-de-prensa/detalle.AniversarioODS.news000202309', 'https://www.icex.es/es/quienes-somos/sala-de-prensa/sala-de-prensa']
  ** [sub page] 조회 - https://www.icex.es/es/quienes-somos/sala-de-prensa/sala-de-prensa/detalle.AniversarioODS.news000202309
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "(+34) 913 497 100",
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://www.icex.es/es/quienes-somos/sala-de-prensa/sala-de-prensa


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": "(+34) 913 497 100",
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://bunge.com


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    ** page 조회 - retry
    [403] : 조회 실패
** [main page] 조회 - https://www.altas.com.tr/
  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.altas.com.tr/iletisim', 'https://www.altas.com.tr/yatirim/altas-yag', 'https://www.altas.com.tr/yatirim/altas-insaat', 'https://www.altas.com.tr/yatirim/altas-otomotiv', 'https://www.altas.com.tr/yatirim/altas-egitim', 'https://www.altas.com.tr/yatirim/altas-medya', 'https://www.altas.com.tr/yatirim/altas-balik']
  ** [sub page] 조회 - https://www.altas.com.tr/iletisim
  ** [llm] contact 정보 & products 추출
    {
        "email": "info@altas.com.tr",
        "phone": "0452 777 1 777",
        "fax": "0452 777 1 777",
        "address": "Durug\u00f6l Mh. Soya Cd. No:150 Merkez / Ordu - T\u00dcRK\u0130YE"
    }
** [main page] 조회 - https://moicommodities.com/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://moicommodities.com/moi-commodities', 'https://moicommodities.com/mewah-group']
  ** [sub page] 조회 - https://moicommodities.com/moi-commodities
  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://moicommodities.com/mewah-group


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - https://www.ajinomoto.com.tr/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.ajinomoto.com.tr/iletisim/', 'https://www.ajinomoto.com.tr/amino-asitlerin-sirri/', 'https://www.ajinomoto.com.tr/amino-asitlerin-gucu/']
  ** [sub page] 조회 - https://www.ajinomoto.com.tr/iletisim/
  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": [
            "+90 (232) 236 76 00",
            "+90 (222) 413 22 83"
        ],
        "fax": [
            "+90 232 236 73 50",
            "+90 (222) 413 22 87"
        ],
        "address": [
            "Maslak Mahallesi, AOS 55. Sokak, 42 Maslak B Blok sit. No:4 / 585 Sar\u0131yer \u2013 \u0130stanbul",
            "\u00c7aml\u0131 Mah. Seferihisar Cad. No:171 PK 35310 G\u00dcZELBAH\u00c7E / \u0130ZM\u0130R",
            "K\u00fctahya Yolu, 17. km Y\u00f6r\u00fck, Ak\u00e7ay\u0131r / ESK\u0130\u015eEH\u0130R"
        ]
    }
  ** [sub page] 조회 - https://www.ajinomoto.com.tr/amino-asitlerin-sirri/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** [sub page] 조회 - https://www.ajinomoto.com.tr/amino-asitlerin-gucu/


  result_df = result_df.append(sub_df, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
** [main page] 조회 - http://www.palmtopvegeoil.com.my


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "contact": {
            "email": null,
            "phone": null,
            "fax": null,
            "address": null
        }
    }
  ** a tag 추출
    [empty]
** [main page] 조회 - https://www.zade.com.tr/


  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


  ** [llm] contact 정보 & products 추출
    {
        "email": null,
        "phone": null,
        "fax": null,
        "address": null
    }
  ** a tag 추출
  ** [llm] 회사 정보와 관련있는 a tag 선별


  result_df = result_df.append(sub_df, ignore_index=True)


    ['https://www.zade.com.tr/bize-ulasin/', 'https://www.zade.com.tr/urunler/hindistan-cevizi-yagi/', 'https://www.zade.com.tr/urunler/aromali-zeytinyagi/', 'https://www.zade.com.tr/urunler/zeytinyagi/', 'https://www.zade.com.tr/urunler/aycicek-yaglari/', 'https://www.zade.com.tr/urunler/misir-yaglari/', 'https://www.zade.com.tr/urunler/fistik-yagi/', 'https://www.zade.com.tr/urunler/endustriyel-yag/', 'https://www.zade.com.tr/urunler/kanola-yagi/', 'https://www.zade.com.tr/urun/naturel-sizma-zeytinyagi-1lt-pet/', 'https://www.zade.com.tr/urun/naturel-sizma-zeytinyagi-2lt-pet/', 'https://www.zade.com.tr/urun/riviera-zeytinyagi-1-lt-pet/', 'https://www.zade.com.tr/urun/riviera-zeytinyagi-2-lt-pet/', 'https://www.zade.com.tr/urun/biberiye-aromali-naturel-sizma-zeytinyagi-250-ml-cam/', 'https://www.zade.com.tr/urun/limon-feslegen-aromali-naturel-sizma-zeytinyagi-250-ml-cam/', 'https://www.zade.com.tr/urun/nar-feslegen-limon-aromali-naturel-sizma-zeytinyagi-250-ml-cam/', 'https://www.zade

  result_df = result_df.append(sub_df, ignore_index=True)
  result_total_df = result_total_df.append(sub_result, ignore_index=True)


    [500] : 조회 실패


In [None]:
import pandas as pd

# 제공된 JSON 데이터
json_data = result
normalized_df = pd.json_normalize(json_data)

target_column_list = ['homepage', 'extract_url', 'company_name', 'mail', 'phone', 'address', 'fax', 'products', 'error']
total_column_list = list(normalized_df.columns)

merged_df = pd.DataFrame(columns=target_column_list)

for target_col in target_column_list:
    related_cols = [col for col in total_column_list if target_col in col]

    def merge_row(row):
        merged_data = []
        for item in row.dropna():
            if isinstance(item, dict):
                merged_data.append(json.dumps(item, ensure_ascii=False))
            elif isinstance(item, list):
                merged_data.extend([str(x) for x in item])
            elif not pd.isnull(item):
                merged_data.append(item)
        if merged_data:
            return ';\n'.join(merged_data)
        else:
            return ''

    if related_cols:
        merged_df[target_col] = normalized_df[related_cols].apply(merge_row, axis=1)

In [None]:
import os
file_path = './output/beautifulsoup_sample.csv'
merged_df.to_csv(file_path, index=False)

# Numbers 애플리케이션으로 CSV 파일 열기
os.system(f'open -a Numbers {file_path}')

여러개 csv 하나로 합쳐서 확인하기

In [28]:
import pandas as pd

total_df = None
total = 0
for i in range(1, 6):
    file_path = f"./output/gulfood_crawling_20240124_00{i}.csv"
    print(file_path)
    partial_df = pd.read_csv(file_path)
    total += partial_df.groupby('homepage').count()

    if i == 1:
        total_df = partial_df
    else:
        total_df = total_df.append(partial_df, ignore_index=True)

./output/gulfood_crawling_20240124_001.csv
./output/gulfood_crawling_20240124_002.csv
./output/gulfood_crawling_20240124_003.csv
./output/gulfood_crawling_20240124_004.csv
./output/gulfood_crawling_20240124_005.csv


  total_df = total_df.append(partial_df, ignore_index=True)
  total_df = total_df.append(partial_df, ignore_index=True)
  total_df = total_df.append(partial_df, ignore_index=True)
  total_df = total_df.append(partial_df, ignore_index=True)


In [29]:
def unique_list(series):
    # NaN 값을 제거하고, 중복을 제거한 후 리스트로 변환
    return series.dropna().drop_duplicates().tolist()


# 'homepage'를 기준으로 그룹화하고 각 컬럼에 unique_list 함수 적용
grouped_df = total_df.groupby('homepage').agg(unique_list).reset_index()


In [None]:
def convert_to_string(value):
    # 값이 리스트인 경우
    if isinstance(value, list):
        # 빈 리스트인 경우 None 반환
        if not value:
            return ''
        # 리스트의 요소들을 문자열로 결합
        return ";\n".join(str(item) for item in value)
    # 리스트가 아닌 경우, 값을 그대로 반환
    return value

# 'homepage' 컬럼을 제외한 모든 컬럼에 대해 함수 적용
for col in grouped_df.columns:
    if col != 'homepage':
        grouped_df[col] = grouped_df[col].apply(convert_to_string)

print(grouped_df)


In [30]:
import pandas as pd

# 가정: df는 이미 생성된 DataFrame이며 'mail' 컬럼을 포함하고 있음

# 'mail' 컬럼이 빈 리스트인 행만 필터링
mail_df = grouped_df[grouped_df['mail'].apply(lambda x: len(x) > 0)]



In [None]:
grouped_df.to_csv('./output/gulfood_20240123_1645.csv')