In [8]:
import pandas as pd
import numpy as np
import time
import requests
import json
import re
import os
import urllib.parse

from tqdm import tqdm
from bs4 import BeautifulSoup

In [18]:
# Collect patent metadata from Google Patents
def fetch_google_patent(patent_number, timeout=10):
    """Fetch the title, abstract and filing/grant dates of one patent page.

    The page ``https://patents.google.com/patent/<patent_number>/en`` is
    downloaded and the values are read from its ``<meta>`` tags.

    Parameters
    ----------
    patent_number : str
        Publication number exactly as it should appear in the page URL.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    dict or None
        A dict with the keys ``patent_number``, ``title``, ``abstract``,
        ``date_filed`` and ``date_granted`` (missing values are ``None``).
        ``None`` is returned, after printing a diagnostic, when the request
        fails, the status code is not 200, parsing fails, or neither a title
        nor an abstract is present.
    """
    url = f'https://patents.google.com/patent/{patent_number}/en'
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    # Network failures (DNS, refused connection, timeout, ...) are reported
    # like any other per-patent failure instead of propagating to the caller.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"[{patent_number}] ❌ Request Error: {e}")
        return None

    if response.status_code != 200:
        print(f"[{patent_number}] ❌ Status Code: {response.status_code}")
        return None

    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        def meta_content(attrs):
            # .get() so that a <meta> tag lacking a "content" attribute yields
            # None for that field rather than discarding the whole record.
            tag = soup.find('meta', attrs)
            return tag.get('content') if tag else None

        title = meta_content({'name': 'DC.title'})
        abstract = meta_content({'name': 'DC.description'})
        date_filed = meta_content({'scheme': 'dateFiled'})
        date_granted = meta_content({'scheme': 'dateIssued'})

        if not title and not abstract:
            print(f"[{patent_number}] ❌ Empty content. Skipped.")
            return None

        return {
            'patent_number': patent_number,
            'title': title,
            'abstract': abstract,
            'date_filed': date_filed,
            'date_granted': date_granted
        }

    except Exception as e:
        # Best-effort scraping: report the problem and let the crawl continue.
        print(f"[{patent_number}] ❌ Parsing Error: {e}")
        return None

In [19]:
# Crawl the patent page of every id listed in the input CSV
def _to_patent_number(raw_id, default_kind='B2'):
    """Turn a dash-separated id such as ``US-12223480-B1`` into ``US12223480B1``.

    Returns ``None`` for missing values and for ids that do not contain at
    least two dash-separated parts. ``default_kind`` is appended only when
    the id carries no third part.
    """
    if pd.isnull(raw_id):
        return None
    parts = [part.strip() for part in str(raw_id).split('-') if part.strip()]
    if len(parts) < 2:
        return None
    kind = parts[2] if len(parts) > 2 else default_kind
    return f"{parts[0]}{parts[1]}{kind}"


def crawling_patents(input_csv, output_csv, fetcher=None, delay=1):
    """Fetch metadata for each patent id in ``input_csv`` and save it as CSV.

    Parameters
    ----------
    input_csv : str
        Path of a CSV file whose first line is skipped and whose header row
        contains an ``id`` column with dash-separated ids.
    output_csv : str
        Destination path; written as UTF-8 with BOM, without the index.
    fetcher : callable, optional
        Called with one patent number; must return a dict (one output row)
        or a falsy value to skip it. Defaults to ``fetch_google_patent``.
    delay : float, optional
        Seconds to sleep after each request (polite crawling). ``0``
        disables the pause.

    Returns
    -------
    None
        Progress and results are reported with ``print``.

    Raises
    ------
    KeyError
        If the CSV has no ``id`` column.
    """
    if not os.path.exists(input_csv):
        print(f"❌ Input file not found: {input_csv}")
        return

    # skiprows=1: the header row is the second line of the file.
    df = pd.read_csv(input_csv, skiprows=1)
    if 'id' not in df.columns:
        raise KeyError(f"'id' column not found in {input_csv}; columns: {list(df.columns)}")

    if fetcher is None:
        # Resolved at call time so the default follows the notebook's current definition.
        fetcher = fetch_google_patent

    # Keep the kind code that is already part of the id instead of forcing
    # "B2": a number with the wrong kind code does not resolve to a page.
    candidates = (_to_patent_number(raw_id) for raw_id in df['id'])
    patent_numbers = [number for number in candidates if number is not None]

    results = []
    for pat in patent_numbers:
        data = fetcher(pat)
        if data:
            results.append(data)
        if delay:
            time.sleep(delay)  # polite crawling

    if results:
        df_out = pd.DataFrame(results)
        df_out.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"✅ Data saved to {output_csv}")
    else:
        print("⚠️ No data collected.")

In [None]:
# Crawl the search results covering 2023-01-01 ~ 2023-12-31 (one year).
crawling_patents(
    'Data/gp-search-20250611-235346.csv',   # input_csv
    'wearable_devices_patents_2023.csv',    # output_csv
)

[US12223480B2] ❌ Status Code: 404
[US12057026B2] ❌ Status Code: 404
[US12081834B2] ❌ Status Code: 404
[US11935384B2] ❌ Status Code: 404
[US11822732B2] ❌ Status Code: 404
[US12285084B2] ❌ Status Code: 404
[US11949466B2] ❌ Status Code: 404
[US11911181B2] ❌ Status Code: 404
[US11849379B2] ❌ Status Code: 404
[US11894126B2] ❌ Status Code: 404
[US12067100B2] ❌ Status Code: 404
[US12002579B2] ❌ Status Code: 404
[US11893162B2] ❌ Status Code: 404
[US12205453B2] ❌ Status Code: 404
[US11950933B2] ❌ Status Code: 404
[US11937917B2] ❌ Status Code: 404
[US12178745B2] ❌ Status Code: 404
[US12157002B2] ❌ Status Code: 404
[US12307020B2] ❌ Status Code: 404
[US12029434B2] ❌ Status Code: 404
[US11944168B2] ❌ Status Code: 404
[US12217230B2] ❌ Status Code: 404
[US12189846B2] ❌ Status Code: 404
[US12097376B2] ❌ Status Code: 404
[US11921935B2] ❌ Status Code: 404
[US12097011B2] ❌ Status Code: 404
[US11991614B2] ❌ Status Code: 404
[US11964154B2] ❌ Status Code: 404
[US12277570B2] ❌ Status Code: 404
[US12236416B2]

In [None]:
# Crawl the search results covering 2024-01-01 ~ 2024-12-31 (one year).
crawling_patents(
    'Data/gp-search-20250612-003902.csv',   # input_csv
    'wearable_devices_patents_2024.csv',    # output_csv
)

[US12232993B2] ❌ Status Code: 404
[US11969557B2] ❌ Status Code: 404
[US12207897B2] ❌ Status Code: 404
[US12238494B2] ❌ Status Code: 404
[US12214153B2] ❌ Status Code: 404
[US12315353B2] ❌ Status Code: 404
[US12260672B2] ❌ Status Code: 404
[US12203646B2] ❌ Status Code: 404
[US12127643B2] ❌ Status Code: 404
[US12124985B2] ❌ Status Code: 404
[US12282172B2] ❌ Status Code: 404
[US12208056B2] ❌ Status Code: 404
[US12161878B2] ❌ Status Code: 404
[US12307443B2] ❌ Status Code: 404
[US12316584B2] ❌ Status Code: 404
[US12189866B2] ❌ Status Code: 404
[US12198698B2] ❌ Status Code: 404
[US12103182B2] ❌ Status Code: 404
[US12235680B2] ❌ Status Code: 404
[US12282600B2] ❌ Status Code: 404
[US12070668B2] ❌ Status Code: 404
[US12290367B2] ❌ Status Code: 404
[US12207942B2] ❌ Status Code: 404
[US12213191B2] ❌ Status Code: 404
[US12257102B2] ❌ Status Code: 404
[US12289295B2] ❌ Status Code: 404
[US12182778B2] ❌ Status Code: 404
[US12263846B2] ❌ Status Code: 404
[US12140819B2] ❌ Status Code: 404
[US12280737B2]

In [None]:
# # USPTO crawling via the PatentsView API (disabled alternative implementation)
# def crawling_patents(input_csv, output_csv):
#     api_key = os.environ["PATENTSVIEW_API_KEY"]  # never hard-code credentials; revoke the key that was previously committed on this line

#     # Load the patent numbers
#     df = pd.read_csv(input_csv, encoding='utf-8', skiprows=1)
#     if 'id' not in df.columns:
#         raise ValueError("❌ 'id' 컬럼이 CSV 파일에 존재하지 않습니다.")
    
#     # Extract only the digits from each patent id
#     df['patent_number'] = df['id'].str.extract(r'(\d+)')
#     patent_numbers = df['patent_number'].dropna().unique().tolist()

#     # Fields to collect
#     fields = [
#         "patent_number",                        # Unique patent number
#         "patent_title",                         # Patent title
#         "app_date",                             # Application date - the date the patent was first filed
#         "patent_date",                          # Grant date - the date the patent was officially approved
#         "patent_abstract",                      # Patent abstract
#         "patent_type",                          # Patent type (e.g. utility, design, plant)
#         "assignees.assignee_id",                # Assignee ID (owner of the patent)
#         "assignees.country",                    # Assignee country code
#         "inventors.inventor_id",                # Inventor ID (there may be several)
#         "ipc.section",                          # IPC section code
#         "ipc.ipc_class",                        # IPC class code
#         "ipc.subclass",                         # IPC subclass code
#         "ipc.main_group",                       # Main group code
#         "ipc.subgroup",                         # Subgroup code
#         "cited_patents.cited_patent_number",    # Backward citation: number of an earlier patent cited by this patent
#         "cited_patents.cited_patent_date",      # Grant date of the cited patent
#         "citing_patents.citing_patent_number",  # Forward citation: number of a later patent that cites this patent
#         "citing_patents.citing_patent_date"     # Grant date of the citing patent
#     ]

#     base_url = "https://api.patentsview.org/patents/query?q="
#     results = []

#     for pn in tqdm(patent_numbers, desc="Fetching patent data"):
#         query = {
#             "patent_number": pn
#         }
#         options = {
#             "per_page": 1
#         }

#         query_url = base_url + urllib.parse.quote(json.dumps(query))
#         url = (
#             f"{query_url}"
#             f"&f={urllib.parse.quote(json.dumps(fields))}"
#             f"&o={urllib.parse.quote(json.dumps(options))}"
#         )

#         try:
#             response = requests.get(url, headers={"X-Api-Key": api_key})
#             if response.status_code == 200:
#                 data = response.json().get("patents", [])
#                 if data:
#                     results.append(data[0])
#             else:
#                 print(f"⚠️ [{pn}] Status Code: {response.status_code}")
#         except Exception as e:
#             print(f"❌ [{pn}] Error: {e}")
#             continue

#     # Save
#     if results:
#         df_result = pd.json_normalize(results)
#         df_result.to_csv(output_csv, index=False, encoding='utf-8-sig')
#         print(f"✅ Data saved to: {output_csv}")
#     else:
#         print("⚠️ 수집된 데이터가 없습니다.")

In [None]:
# # 2023-01-01 ~ 2023-12-31 (one year)
# crawling_patents(
#     input_csv='Data/gp-search-20250611-235346.csv',
#     output_csv='wearable_devices_patents_rest_2023.csv'
# )