In [1]:
import re

# Transliteration table used by replace_special_characters: each key is an
# accented / non-ASCII Latin letter and each value is its ASCII replacement.
# Some replacements are longer than one letter (e.g. 'ß' -> 'ss', 'Æ' -> 'AE').
# The plain 'I' and 'i' entries map to themselves, so they are no-ops.
replacement_mapping = {
    'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A', 'Ā': 'A', 
    'Æ': 'AE', 
    'Ç': 'C', 'Č': 'C', 
    'Ð': 'D', 
    'Đ': 'Dj', 
    'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'Ē': 'E', 
    'Ğ': 'G', 
    'I': 'I', 'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I', 'Ī': 'I', 'İ': 'I', 
    'Ł': 'L', 
    'Ñ': 'N', 'Ń': 'N', 
    'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O', 'Ø': 'O', 'Ō': 'O', 
    'Ś': 'S', 'Ş': 'S', 'Š': 'S', 
    'Þ': 'TH', 
    'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U', 'Ū': 'U', 
    'Ý': 'Y', 
    'Ź': 'Z', 'Ż': 'Z', 'Ž': 'Z', 
    'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a', 'ā': 'a', 
    'æ': 'ae', 
    'ç': 'c', 'č': 'c', 
    'ð': 'd', 
    'đ': 'dj', 
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ē': 'e', 
    'ğ': 'g', 
    'i': 'i', 'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'ī': 'i', 'ı': 'i', 
    'ł': 'l', 
    'ñ': 'n', 'ń': 'n', 
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o', 'ø': 'o', 'ō': 'o', 
    'ś': 's', 'ş': 's', 'š': 's', 
    'ß': 'ss', 
    'þ': 'th', 
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ū': 'u', 
    'ý': 'y', 'ÿ': 'y', 
    'ź': 'z', 'ż': 'z', 'ž': 'z'}

def replace_special_characters(text):
    """
    Return ``text`` with every character listed in ``replacement_mapping``
    swapped for its ASCII replacement.

    Characters that are not keys of the mapping are left untouched.
    """
    converted = text
    # One full-string replace per mapping entry, in the mapping's own order.
    for special in replacement_mapping:
        converted = converted.replace(special, replacement_mapping[special])
    return converted

def cleansing_name(name: str, remove_space=False) -> str:
    """Normalize a company name so that two spellings can be compared.

    Steps, in order:
      1. transliterate accented Latin letters with replace_special_characters,
      2. lowercase,
      3. replace '&' with 'and',
      4. delete legal-form suffixes ("ltd", "inc", "co ltd", ...) wherever they
         appear as whole words,
      5. delete every character that is not a word character (whitespace is
         kept unless ``remove_space`` is true) and strip the ends.

    Args:
        name: raw company name.
        remove_space: when True, whitespace is removed as well in step 5.

    Returns:
        The normalized name. Inner runs of spaces left behind by a removed
        suffix are not collapsed.
    """
    name = replace_special_characters(name)  # transliterate to plain Latin letters
    name = name.lower()
    name = name.replace('&', 'and')

    suffixes = [
        "ltd", "co", "co ltd", "coltd", "company limited", "companylimited",
        "private limited", "privatelimited", "limited", "llc", "sac", "sa", "inc",
        "co-op", "pty ltd", "ptyltd", "gmbh", "eirl", "sarl", "ltd sti", "ltdsti"
    ]

    # Regex alternation takes the first branch that matches, not the longest.
    # With "ltd"/"co" listed first, "ltd sti" and "co-op" could never match as
    # a whole (leaving "sti" / "op" behind), so the longest suffixes are tried
    # first. Each suffix must sit between word boundaries.
    longest_first = sorted(suffixes, key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(suffix) for suffix in longest_first) + r')\b'
    name = re.sub(pattern, '', name, flags=re.IGNORECASE)

    # Keep only word characters (letters, digits, underscore), plus whitespace
    # unless the caller asked for it to be removed too.
    if remove_space:
        pattern = r'[^\w]'
    else:
        pattern = r'[^\w\s]'
    name = re.sub(pattern, '', name, flags=re.UNICODE).strip()

    return name

# Quick manual check of cleansing_name on a mixed-script name and on a name
# carrying a legal-form suffix; prints both cleansed forms separated by "/".
target_name = 'Greenlabs 한글 きゆん。 Esto es español, Bu Türkçe.1'
search_name = 'Green Labs Llc'
print(cleansing_name(target_name), "/", cleansing_name(search_name))

# measure_text_similarity is a project helper; find_name_from_title below relies
# on this import having been executed.
from scraping_packers_homepage.tools.text_similarity import measure_text_similarity
# Similarity score between the two cleansed names.
print(measure_text_similarity(cleansing_name(target_name), cleansing_name(search_name)))

def find_name_from_title(title, cleansed_comp_name):
    """Pick the piece of a page title that best matches a company name.

    The title is split on ',', '|', '-' and '/'; every piece is stripped,
    cleansed and scored against the company name with
    measure_text_similarity. The piece with the highest score wins; on a tie
    the piece that appears last in the title wins.

    Args:
        title: page title string.
        cleansed_comp_name: company name to compare against. It is passed
            through cleansing_name again here before scoring.

    Returns:
        ``(score, piece)`` where ``score`` is the best value returned by
        measure_text_similarity and ``piece`` is the matching title piece,
        stripped but not cleansed.
    """
    # Raw-string character class: the previous non-raw ',|\||-|/' relied on the
    # invalid escape '\|', which triggers a warning on recent Python versions.
    pieces = re.split(r'[,|/-]', title)

    # Loop-invariant, so computed once instead of once per piece.
    reference_name = cleansing_name(cleansed_comp_name)

    best_score = None
    best_piece = None
    for piece in pieces:
        piece = piece.strip()
        # The piece is deliberately cleansed twice, as before: the second pass
        # is not a no-op (e.g. "s.a" -> "sa" -> ""), so dropping it would
        # change scores.
        candidate = cleansing_name(cleansing_name(piece))
        score = measure_text_similarity(reference_name, candidate)
        # '>=' keeps the historical tie-break: the last piece with the top score.
        if best_score is None or score >= best_score:
            best_score, best_piece = score, piece

    # re.split always yields at least one piece, so a result is always set.
    return best_score, best_piece

greenlabs 한글 きゆん esto es espanol bu turkce1 / green labs
34


In [8]:
import time, json
from google.cloud import bigquery
import scraping_packers_homepage.tools.bing_searcher as bing_searcher

# Select the rows of one source_id whose process_ts falls inside a fixed
# two-minute window; raw_data and processed_data are what the matching cell
# further down reads from `df`.
query = """
select uuid, data_type, raw_data, processed_data
from `greenlabs-data-farmmorning.content_analysis.gs_crawling_packer_info_processed`
where source_id = 'test_verity_20240222'
and (process_ts between '2024-02-23 03:51:05' and '2024-02-23 03:53:05')
"""

# NOTE(review): the client is created for project "grainscanner" while the
# table is addressed in "greenlabs-data-farmmorning" - confirm this is intended.
bigquery_client = bigquery.Client(project="grainscanner")
# Run the query and materialize the whole result as a pandas DataFrame.
df = bigquery_client.query(query).to_dataframe()



In [12]:
import json
import pandas as pd

source_id = "test_verity_20240222"
# Minimum similarity score at which the best non-exact candidate is still
# accepted as a partial match.
PARTIAL_MATCH_MIN_SCORE = 70
result_list = []

for index, row in df.iterrows():
    search_keyword = row["raw_data"]
    result = row["processed_data"]

    # raw_data looks like "site:<domain> <company name>": the first
    # space-separated token carries the domain, the rest is the company name.
    site_token = search_keyword.split(' ')[0]
    domain = site_token[len("site:"):]
    comp_name = search_keyword[len(site_token):].strip()

    cleansed_comp_name = cleansing_name(comp_name)

    # Only JSON objects are parsed; empty values and plain error strings are skipped.
    if not (result and result.startswith('{')):
        continue
    try:
        j_result = json.loads(result)
    except json.JSONDecodeError as e:
        # An error message can start with '{' without being valid JSON.
        print("  ** skip(invalid json) -", row["raw_data"], e)
        continue

    web_pages = j_result.get("webPages")
    if not (web_pages and web_pages.get("value")):
        continue

    success_list = []  # exact matches (score 100)
    fail_list = []     # everything else, kept for the partial-match fallback
    for item in web_pages["value"]:
        if domain not in item["url"]:
            print ("  ** skip(domain not matched) -", item)
            continue

        score, item_name = find_name_from_title(item["name"], cleansed_comp_name)
        cleansed_item_name = cleansing_name(item_name)
        # Names that are equal once spaces are ignored count as a perfect match.
        is_exact = cleansed_comp_name.replace(' ', '') == cleansed_item_name.replace(' ', '')
        score_item = {
            "score": 100 if is_exact else score,
            "title": item["name"],
            "domain": domain,
            "url": item["url"]
        }
        if score_item["score"] == 100:
            success_list.append(score_item)
        else:
            fail_list.append(score_item)

    # No exact match: fall back to the best-scoring candidate if it reaches the
    # partial-match threshold. fail_list is empty when every hit was skipped
    # for a domain mismatch, and max() on an empty list raises ValueError.
    if not success_list and fail_list:
        max_fail = max(fail_list, key=lambda r: r["score"])
        if max_fail["score"] >= PARTIAL_MATCH_MIN_SCORE:
            success_list.append(max_fail)

    if success_list:
        result_list.append({
            "source_id": source_id,
            "uuid": row["uuid"],
            "data_type": row["data_type"],
            "job_type": "detect_search_result",
            "job_detail": "bing",
            "raw_data": row["raw_data"],
            "processed_data": json.dumps(success_list, ensure_ascii=False)
        })


# Append the accepted matches to the processed table.
result_df = pd.DataFrame(result_list)
result_df.to_gbq("content_analysis.gs_crawling_packer_info_processed",
          "greenlabs-data-farmmorning",
          if_exists="append")

******* site:panjiva.com Montana Milling Inc [panjiva.com/Montana West Inc./montana west]
******* site:www.importgenius.co.kr Montana Milling Inc [www.importgenius.co.kr/Montana West Inc./montana west]
******* site:www.exportgenius.in Montana Milling Inc [www.exportgenius.in/Montana West Inc./montana west]
******* site:www.seair.co.in Montana Milling Inc [www.seair.co.in/Montana West Inc./montana west]
******* site:importkey.com Montana Milling Inc [importkey.com/Montana West Inc./montana west]
******* site:panjiva.com HI Plains Agronomy LLC [panjiva.com/HI Plains Silage Llc/hi plains silage]
******* site:www.importgenius.co.kr HI Plains Agronomy LLC [www.importgenius.co.kr/HI Plains Silage Llc/hi plains silage]
******* site:www.exportgenius.in HI Plains Agronomy LLC [www.exportgenius.in/HI Plains Silage Llc/hi plains silage]
******* site:www.seair.co.in HI Plains Agronomy LLC [www.seair.co.in/HI Plains Silage Llc/hi plains silage]
******* site:importkey.com HI Plains Agronomy LLC [imp

100%|██████████| 1/1 [00:00<00:00, 5405.03it/s]


[{'source_id': 'test_verity_20240222',
  'data_type': 'validation_import',
  'job_type': 'detect_search_result',
  'job_detail': 'bing',
  'raw_data': 'site:panjiva.com Montana Milling Inc',
  'processed_data': '[{"score": 100, "title": "Montana West Inc., 2606 Brenner Drive, Dallas, TX 75220, USA | Buyer Report — Panjiva", "url": "https://cn.panjiva.com/Montana-West-Inc/61340517"}, {"score": 100, "title": "Montana West Inc. - Panjiva", "url": "https://panjiva.com/Montana-West-Inc/5402821"}]'},
 {'source_id': 'test_verity_20240222',
  'data_type': 'validation_import',
  'job_type': 'detect_search_result',
  'job_detail': 'bing',
  'raw_data': 'site:panjiva.com HI Plains Agronomy LLC',
  'processed_data': '[{"score": 94, "title": "High Plains Silage Llc - Panjiva", "url": "https://panjiva.com/High-Plains-Silage-Llc/20918244"}]'}]

In [None]:
import time
from google.cloud import bigquery
import scraping_packers_homepage.tools.bing_searcher as bing_searcher

# Packers to look up: one row per packer of the given source_id, with the
# handled SKU columns folded into a JSON array.
query = """
select source_id
    , uuid
    , packer_name
    , country
    , JSON_STRIP_NULLS(json_Array(handled_sku1, handled_sku2, handled_sku3, handled_sku4, handled_sku5, handled_sku6)) as sku_list
from `greenlabs-data-farmmorning.content_analysis.gs_crawling_packer_info_raw`
where source_id = 'test_verity_20240222'
"""

bigquery_client = bigquery.Client(project="grainscanner")
query_job = bigquery_client.query(query)
query_job.result()  # block until the query has finished

# Trade-data sites each packer name is searched on.
site_list = ['panjiva.com',
             'www.importgenius.co.kr',
             'www.exportgenius.in',
             'www.seair.co.in',
             'importkey.com']

import json
import os
MAX_INSERT_PACKER = 2  # flush to BigQuery every N packers
PROCESSED_TABLE = "greenlabs-data-farmmorning.content_analysis.gs_crawling_packer_info_processed"
# The API key must not live in the notebook: it is read from the environment
# and the cell fails fast (KeyError) if BING_SEARCH_API_KEY is not set.
BING_API_KEY = os.environ["BING_SEARCH_API_KEY"]


def _flush_rows(rows):
    """Stream ``rows`` into the processed table and report per-row insert errors.

    insert_rows_json does not raise for rejected rows; it returns them, so the
    return value is checked instead of being discarded.
    """
    errors = bigquery_client.insert_rows_json(PROCESSED_TABLE, rows)
    if errors:
        print(f' ** insert errors - {errors}')
    return errors


result_list = []
for i, row in enumerate(query_job):

    packer_name = row.packer_name
    for site in site_list:
        search_query = f"site:{site} {packer_name}"
        try:
            params = { 'count': 50, 'responseFilter':'Webpages' }
            result = json.dumps(bing_searcher.search(search_query, BING_API_KEY, kwargs=params), ensure_ascii=False)
        except Exception as e:
            # Best effort: a failed search is stored as its error text so the
            # row is still recorded.
            result = str(e)

        result_list.append({
            "source_id": row.source_id,
            "uuid": row.uuid,
            "data_type": "validation_import",
            "job_type": "search",
            "job_detail": "bing",
            "raw_data": search_query,
            "processed_data": result
        })
        time.sleep(0.1)  # small pause between API calls
        # NOTE(review): this break limits the run to the first site only -
        # looks like a debugging leftover; remove it for a full run.
        break

    if (i%MAX_INSERT_PACKER) == (MAX_INSERT_PACKER-1):
        print(f' ** flush - {len(result_list)} rows')
        _flush_rows(result_list)
        result_list = []
    # NOTE(review): this break limits the run to the first packer only -
    # looks like a debugging leftover; remove it for a full run.
    break

# Flush whatever is left after the last full batch.
if result_list:
    _flush_rows(result_list)




In [124]:
# SQL only (this cell does not run it): per packer id/name, aggregate the
# distinct country names and distinct English SKU names. Packers without a
# country are dropped by the inner joins; SKUs are optional (left joins).
# Restricted to non-deleted packers with contact_stage 'PCS01' and
# verification_stage 'PVS01'.
query = """
SELECT p.id, p.name, array_agg(distinct cc.name ignore nulls) as country, array_agg(distinct sku.name_en ignore nulls) as sku
FROM `greenlabs-data-farmmorning.stream_ods_grainscanner.packer` as p
inner join `greenlabs-data-farmmorning.stream_ods_grainscanner.packer_country` as pc
  on p.id = pc.packer_id
inner join `greenlabs-data-farmmorning.stream_ods_grainscanner.common_country` as cc
  on pc.country_id = cc.id
left outer join `greenlabs-data-farmmorning.stream_ods_grainscanner.packer_sku` as ps
  on p.id = ps.packer_id
left outer join `greenlabs-data-farmmorning.stream_ods_grainscanner.sku` as sku
  on ps.sku_id = sku.id
WHERE p.is_deleted = 0 
  and p.contact_stage = 'PCS01' 
  and p.verification_stage = 'PVS01'
group by 1, 2
order by 1, 2
"""

'United States'