In [1]:
import vertexai
from google.cloud import aiplatform
from langchain.llms.vertexai import VertexAI
from langchain.output_parsers.boolean import BooleanOutputParser
from langchain.prompts import PromptTemplate
import pandas as pd

# Google Cloud target for every Vertex AI call in this notebook (Colab form fields).
PROJECT_ID = "grainscanner"  # @param {type:"string"}
REGION = "asia-northeast3"  # @param {type:"string"}

# Bind the Vertex AI SDK to the project and region chosen above.
vertexai.init(location=REGION, project=PROJECT_ID)

# verify_url() below receives the LangChain text model through its `llm` argument.


def verify_url(llm, name, url, meta):
    """Ask the model whether ``url`` is the right homepage for company ``name``.

    A YES/NO prompt is rendered from the company name, the URL and the
    homepage metadata, piped through ``llm`` and parsed with
    ``BooleanOutputParser``.

    Args:
        llm: LangChain runnable placed between the prompt and the parser.
        name: Company name substituted into the prompt.
        url: Candidate homepage URL substituted into the prompt.
        meta: Homepage metadata text substituted into the prompt.

    Returns:
        Whatever the chain produces (the parser's boolean verdict), or
        ``None`` when ``chain.invoke`` raises. The failure is printed rather
        than re-raised so a batch run can carry on with the next row.
    """
    boolean_parser = BooleanOutputParser()

    filter_template = """
        Goal: Make sure the URL is the correct URL
        Data given: Company name, URL, homepage metadata
        Process
        1. Verifying the URL by verifying that the company name and URL are related
        2. Verification by verifying that the company name and homepage metadata are correct

        Format instructions:
        If it's related, return it to YES, if not, return it to NO.

        Company name:{name}
        URL:{url}
        homepage metadata:{meta}

        output:
    """

    filter_prompt_template = PromptTemplate(
        input_variables=["name", "url", "meta"],
        template=filter_template,
        output_parser=boolean_parser,
    )

    chain = filter_prompt_template | llm | boolean_parser

    try:
        return chain.invoke({"name": name, "url": url, "meta": meta})
    except Exception as e:
        # Previously this fell through to `return <unbound local>`, replacing
        # the real error with an UnboundLocalError. Report it and signal
        # "no verdict" explicitly instead.
        print(f"error : {e}")
        return None

In [2]:


import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Input CSV with one company per row; the loop reads its "comp_name" and
# "homepage" columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
CSV_PATH = "/Users/yoonhae/Downloads/중국_20240202_0959.csv"
# Seconds to wait for a homepage response before giving up on that row.
REQUEST_TIMEOUT = 6

df = pd.read_csv(CSV_PATH)

# Build the model client once: its settings are identical for every row, so
# re-creating it inside the loop was pure overhead.
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0,
    top_p=0.8,
    top_k=40,
    verbose=True,
    location=REGION,
)

for i in range(len(df)):
    comp_name = df.loc[i, "comp_name"]
    homepage = df.loc[i, "homepage"]

    try:
        # Skip rows whose homepage is missing or whitespace-only. The check
        # stays inside the try so a non-string cell is reported, not fatal.
        if pd.isna(homepage) or not homepage.strip():
            continue

        response = requests.get(homepage, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")
        # Concatenate the attribute dict of every <meta> tag into one string
        # that is handed to the model as the page's metadata.
        metadata = "".join(str(meta.attrs) for meta in soup.find_all("meta"))

        is_url = verify_url(llm, comp_name, homepage, metadata)
        df.loc[i, "is_url"] = is_url
        print(comp_name, homepage, is_url)
    except requests.exceptions.MissingSchema:
        # Malformed URL (no scheme such as http://).
        print(f"잘못된 URL 형식: {homepage}")
        continue
    except Exception as e:
        # Any other failure: report it and move on to the next row.
        print(f"예외 발생: {e}")
        continue


  warn_deprecated(


VITERRA LTD https://www.viterra.com/ True
SHANNON AGENCY'S PTY LTD https://www.zoominfo.com/c/shannon-bros/371459344 False
SEMAPHORE CONTAINER SERVICES PTY LTD https://www.semaphorecs.com.au/ True
ROBINSON GRAIN TRADING CO PTY LTD https://www.robinsongrain.com.au/ True
QUEENSLAND BULK TERMINALS PTY LTD http://www.qldbulkterminals.com/ True
GRAINPAC PTY LTD http://www.grainpac.com.au/ True
GRAINLINK (NSW) PTY LTD https://abr.business.gov.au/ABN/View/22094464516 False
CANOWINDRA PRODUCE CO PTY LTD https://www.canowindraproduce.com.au/ True
INDEPENDENT GRAIN HANDLERS PTY LTD https://soilms.com.au/contact/ False
MOUNTAIN INDUSTRIES PTY LTD https://au.linkedin.com/company/mountain-industries-pty-ltd True
예외 발생: Failed to parse: 제공된 JSON 데이터에는 "JAMES DOUGLAS WALLACE, ANNETTE RUTH WALLACE, JUSTIN ERIC WALLACE, LORELEI WALLACE"라는 이름의 회사 홈페이지 URL이 포함되어 있지 않습니다. 따라서 빈 문자열("")을 반환합니다.
MOUNTAIN INDUSTRIES PTY LTD https://au.linkedin.com/company/mountain-industries-pty-ltd True
PREMIUM GRAIN HANDLE

In [3]:
# Display the frame to inspect the is_url column written by the loop above.
df

Unnamed: 0,no,comp_name,city,state,homepage,is_url
0,1008,AGRIGRAIN (COONAMBLE) PTY LTD,COONAMBLE,NSW,,
1,1038,VITERRA LTD,PORT ADELAIDE,SA,https://www.viterra.com/,True
2,1076,SHANNON AGENCY'S PTY LTD,BEULAH,VIC,https://www.zoominfo.com/c/shannon-bros/371459344,False
3,1078,SEMAPHORE CONTAINER SERVICES PTY LTD,RICHMOND,SA,https://www.semaphorecs.com.au/,True
4,1089,ROBINSON GRAIN TRADING CO PTY LTD,DUBBO,NSW,https://www.robinsongrain.com.au/,True
...,...,...,...,...,...,...
95,1334,ESPERANCE QUALITY GRAINS PTY LTD,ESPERANCE,WA,https://esperancequalitygrains.com/,True
96,2397,SEMAPHORE CONTAINER SERVICES PTY LTD,OSBORNE,SA,https://www.semaphorecs.com.au/,True
97,3905,CO-OPERATIVE BULK HANDLING LTD,FORRESTFIELD,WA,https://www.cbh.com.au/,True
98,1729,CONTINENTAL GRAIN HANDLING PTY LTD,BIBRA LAKE,WA,https://www.farmweekly.com.au/story/6939913/cg...,False
