In [8]:
import json
from pathlib import Path
from tqdm import tqdm
from rapidfuzz import fuzz


def normalize_name(name: str):
    """Return a canonical form of a club name for fuzzy comparison.

    The name is lower-cased and every character that is neither a regex
    word character (``\\w``) nor a Hangul syllable is removed.  Falsy input
    (``None`` or an empty string) yields an empty string.
    """
    import re

    if name:
        lowered = name.lower()
        # Drop everything except word characters and Hangul syllables.
        squeezed = re.sub(r"[^\w가-힣]", "", lowered)
        return squeezed.strip()
    return ""


def dedupe_clubs_smart(
    input_path,
    output_path,
    name_extract_fn,
    similarity_threshold=85,
):
    """Collapse near-duplicate club posts in a JSONL file.

    Each input line is parsed as a JSON object.  A club name is derived from
    the record's ``"all_text"`` field via ``name_extract_fn`` and normalised
    with ``normalize_name``.  Records whose normalised names are similar
    enough are grouped together, and only the record with the longest
    ``"all_text"`` in each group is written to the output file.

    Args:
        input_path: Path (str or ``Path``) of the source JSONL file.
        output_path: Path (str or ``Path``) of the JSONL file to write.
        name_extract_fn: Callable taking the post text (``str``) and
            returning the extracted club name.
        similarity_threshold: Minimum ``fuzz.partial_ratio`` score (0-100)
            for a record to join an existing group.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped.  A missing or non-string ``"all_text"`` is treated
    as an empty text.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    # 1) Load every record and derive its club name.
    data = []
    with input_path.open("r", encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: skip it, but let unrelated errors
                # (e.g. KeyboardInterrupt) propagate.
                continue
            if not isinstance(item, dict):
                # A bare list/number/string has no "all_text" to inspect.
                continue

            text = item.get("all_text")
            if not isinstance(text, str):
                # JSON null (or any other non-string) would break len() and
                # the extractor; treat it as "no text".
                text = ""
            club_name = name_extract_fn(text)
            club_name_norm = normalize_name(club_name)

            data.append({
                "raw": item,
                "club_name": club_name,
                "club_name_norm": club_name_norm,
                "text_len": len(text),
            })

    print(f"총 {len(data)}개 로드됨")

    # 2) Greedy clustering on name similarity.  Each record is compared only
    #    against the first member of every existing cluster and joins the
    #    first cluster that reaches the threshold.
    clusters = []
    for item in tqdm(data, desc="Clustering"):
        name = item["club_name_norm"]

        if not name:
            # Records without an extractable name each form their own cluster.
            clusters.append([item])
            continue

        matched = False
        for cluster in clusters:
            rep = cluster[0]["club_name_norm"]
            # NOTE(review): nameless clusters have rep == ""; this relies on
            # partial_ratio scoring an empty string below the threshold.
            sim = fuzz.partial_ratio(name, rep)  # 0~100

            if sim >= similarity_threshold:
                cluster.append(item)
                matched = True
                break

        if not matched:
            clusters.append([item])

    print(f"동아리 그룹 수: {len(clusters)}")

    # 3) Keep only the record with the longest text from each cluster.
    deduped = [
        max(cluster, key=lambda x: x["text_len"])["raw"]
        for cluster in clusters
    ]

    print(f"최종 남은 글 수: {len(deduped)}")

    # 4) Write the surviving records back out as JSONL.
    with output_path.open("w", encoding="utf-8") as fout:
        for item in deduped:
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")

    print("완료!")


# ==========================
# 실제 실행
# ==========================

# Run the de-duplication on the crawled club posts.
# NOTE: extract_club_name is defined further down in this file, so that
# definition must already have been executed when this call runs.
dedupe_clubs_smart(
    input_path=Path("/content/drive/MyDrive/Colab Notebooks/NLP/everytime_crawling_club.jsonl"),
    output_path=Path("/content/drive/MyDrive/Colab Notebooks/NLP/everytime_crawling_club_dedup_smart.jsonl"),
    name_extract_fn=extract_club_name,
)

총 1877개 로드됨


Clustering: 100%|██████████| 1877/1877 [00:04<00:00, 390.80it/s]


동아리 그룹 수: 537
최종 남은 글 수: 537
완료!


In [9]:
!pip install -q transformers accelerate tqdm

In [12]:
import json
import re
from pathlib import Path
from tqdm import tqdm
import torch
from transformers import pipeline

# ==============================
# 0. 입력/출력 파일 경로 설정
# ==============================
# De-duplicated posts to read, and the parsed records to write.
INPUT_JSONL = "/content/drive/MyDrive/Colab Notebooks/NLP/everytime_crawling_club_dedup_smart.jsonl"
OUTPUT_JSONL = "/content/drive/MyDrive/Colab Notebooks/NLP/everytime_club_parsed.jsonl"

# ==============================
# 1. Zero-shot classifier (mDeBERTa, GPU when available)
# ==============================
# Device index 0 when CUDA is available, otherwise -1
# (presumably CPU, per the transformers pipeline convention).
device = 0 if torch.cuda.is_available() else -1

clf = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    device=device,
)

# ==============================
# 2. 유틸 함수들
# ==============================

def clean_text(text: str) -> str:
    """Strip every line of ``text`` and collapse runs of blank lines.

    Each line loses its leading/trailing whitespace, and consecutive blank
    lines are reduced to a single blank line.  Falsy input (``None`` or an
    empty string) yields an empty string.
    """
    if not text:
        return ""

    kept = []
    previous_blank = False
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        is_blank = stripped == ""
        if is_blank and previous_blank:
            # Second (or later) blank line in a row: drop it.
            continue
        kept.append(stripped)
        previous_blank = is_blank
    return "\n".join(kept)


# ==============================
# 동아리 이름 추출
# ==============================

def extract_club_name(all_text: str) -> str:
    """Heuristically extract a club name from a recruitment post.

    Steps:
      1. Pick the first non-empty line containing "동아리" or "학회"; if there
         is none, pick the first line containing "모집".
      2. If that line contains bracketed text (``<>``, ``()``, ``[]``,
         ``〈〉`` or ``《》``), use the first bracketed span.
      3. Otherwise use the whole line with "동아리" and "학회" removed and
         everything from "모집" onward cut off.
      4. Keep only ASCII digits, ASCII letters and Hangul syllables.

    Args:
        all_text: Full text of the post.  ``None`` or an empty string is
            accepted and treated as "no text".

    Returns:
        The extracted name, or ``""`` when no candidate line is found.
    """
    if not all_text:
        # Guard against None / empty input instead of failing on splitlines().
        return ""

    lines = [ln.strip() for ln in all_text.splitlines() if ln.strip()]

    # 1) First line mentioning a club or an academic society.
    candidate = next(
        (ln for ln in lines if "동아리" in ln or "학회" in ln),
        None,
    )

    # 2) Fall back to the first line mentioning recruitment.
    if candidate is None:
        candidate = next((ln for ln in lines if "모집" in ln), None)

    if not candidate:
        return ""

    # 3) Prefer bracketed text; otherwise strip the trigger keywords.
    m = re.search(r"[〈《<\(\[](.+?)[〉》>\)\]]", candidate)
    if m:
        name = m.group(1).strip()
    else:
        name = candidate
        name = re.sub(r"동아리", "", name)
        name = re.sub(r"학회", "", name)
        name = re.sub(r"모집.*", "", name)

    # 4) Drop emoji, punctuation and whitespace.
    name = re.sub(r"[^0-9a-zA-Z가-힣]", "", name)

    return name.strip()


# ==============================
# 교내/연합 분류
# ==============================
def classify_club_type(text: str):
    """Classify a post as an on-campus ("교내") or inter-university ("연합") club.

    Only the first 2000 characters of ``text`` are sent to the module-level
    zero-shot classifier ``clf``.

    Returns:
        A ``(club_type, score)`` tuple, where ``club_type`` is "연합" or "교내"
        and ``score`` is the float at index 0 of the classifier's scores
        (presumably the top prediction; confirm against the pipeline docs).
    """
    labels = ["교내 동아리", "연합 동아리"]
    prediction = clf(text[:2000], labels)
    top_label = prediction["labels"][0]
    top_score = float(prediction["scores"][0])
    if "연합" in top_label:
        return "연합", top_score
    return "교내", top_score


# ==============================
# 동아리 종류 분류 (대분류 + 소분류)
# ==============================

# Candidate labels for the coarse (main) club category.
CATEGORY_MAIN = [
    "운동", "봉사", "음악", "밴드", "댄스", "사진", "영상", "예능/콘텐츠",
    "학술/스터디", "개발/IT", "기독교", "불교", "천주교",
    "친목/힐링", "요리/음식", "여행", "영화", "게임", "보드게임",
    "독서", "공예", "미술/드로잉", "동물", "환경", "기타"
]

# Candidate labels for the fine-grained (sub) club category.
CATEGORY_SUB = [
    # Sports
    "테니스", "축구", "풋살", "농구", "배드민턴", "볼링", "헬스", "러닝", "요가", "필라테스",
    # Music / band
    # NOTE(review): "기타" also appears in CATEGORY_MAIN; here it sits in the
    # music group, so presumably it means "guitar" rather than "other" - confirm.
    "밴드", "보컬", "기타", "드럼", "피아노", "합창단",
    # Photo / video / entertainment
    "필름카메라", "출사", "영상 촬영", "영상 편집", "유튜브", "리얼버라이어티",
    # Academic
    "스터디", "코딩", "인공지능", "경영", "경제", "문학", "철학",
    # Hobbies
    "보드게임", "방탈출", "독서토론", "요리", "공예", "그림", "여행 모임"
]


def classify_main_category(text: str):
    """Pick a coarse category for a post from ``CATEGORY_MAIN``.

    The first 1500 characters of ``text`` are passed to the module-level
    zero-shot classifier ``clf``.

    Returns:
        A ``(label, score)`` tuple built from index 0 of the classifier's
        ``"labels"`` and ``"scores"`` lists.
    """
    ranked = clf(text[:1500], CATEGORY_MAIN)
    best_label = ranked["labels"][0]
    best_score = float(ranked["scores"][0])
    return best_label, best_score


def classify_sub_category(text: str):
    """Pick a fine-grained category for a post from ``CATEGORY_SUB``.

    The first 1500 characters of ``text`` are passed to the module-level
    zero-shot classifier ``clf``.

    Returns:
        A ``(label, score)`` tuple built from index 0 of the classifier's
        ``"labels"`` and ``"scores"`` lists.
    """
    ranked = clf(text[:1500], CATEGORY_SUB)
    best_label = ranked["labels"][0]
    best_score = float(ranked["scores"][0])
    return best_label, best_score


# ==============================
# 기타 추출 함수들 (모집기간, 활동 등)
# ==============================
def extract_recruitment_period(text: str) -> str:
    """Return the recruitment period mentioned in ``text``, or ``""``.

    Lookup order:
      1. The first line (stripped) containing one of the recruitment
         period/schedule keywords.
      2. The first ``20YY.M.D ... 20YY.M.D`` date range on a single line.
      3. The first ``M/D ... M/D`` date range on a single line.
    """
    markers = ("모집기간", "모집 기간", "모집일정", "모집 일정")
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if any(marker in stripped for marker in markers):
            return stripped

    # Fall back to explicit date ranges, full dates before short ones.
    range_patterns = (
        r"(20\d{2}\.\d{1,2}\.\d{1,2}.*?20\d{2}\.\d{1,2}\.\d{1,2})",
        r"(\d{1,2}/\d{1,2}.*?\d{1,2}/\d{1,2})",
    )
    for pattern in range_patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1).strip()

    return ""


def extract_activity_block(text: str) -> str:
    """Return the paragraph that describes the club's activities.

    The block starts at the first line containing one of the activity
    keywords and continues through the following lines until a blank line
    is reached or seven follow-up lines have been collected.  Every line is
    stripped.  Returns ``""`` when no keyword line exists.
    """
    headings = [
        "활동 일정", "활동내역", "활동 내용", "활동안내",
        "정기 모임", "활동은 이렇게", "활동 내역"
    ]
    rows = text.splitlines()

    for start, row in enumerate(rows):
        if not any(heading in row for heading in headings):
            continue

        collected = [row.strip()]
        # At most seven lines after the heading line.
        for follower in rows[start + 1:start + 8]:
            trimmed = follower.strip()
            if not trimmed:
                break
            collected.append(trimmed)
        return "\n".join(collected).strip()

    return ""


def extract_activity_dates_from_block(block: str) -> str:
    """Return the lines of ``block`` that look like schedule information.

    A line is kept when it contains one of the day/frequency markers or an
    ``H:MM`` / ``HH:MM`` clock time.  Kept lines are stripped and joined
    with newlines; ``""`` is returned when nothing matches or ``block`` is
    empty.

    Note that the single-character weekday markers match as plain
    substrings, so any line containing one of those characters is kept.
    """
    if not block:
        return ""

    schedule_markers = (
        "월", "화", "수", "목", "금", "토", "일",
        "매주", "주 1회", "월 1회", "둘째", "셋째", "넷째", "매달",
    )
    clock_time = r"\d{1,2}:\d{2}"

    selected = [
        row.strip()
        for row in block.splitlines()
        if any(marker in row for marker in schedule_markers)
        or re.search(clock_time, row)
    ]
    return "\n".join(selected)


def extract_fee(text: str) -> str:
    """Return the fee information mentioned in ``text``, or ``""``.

    Every line (stripped) that contains a fee keyword ("회비", "가입비",
    "예치금", "참가비") together with "원" or "무료" is collected.  If no such
    line exists, the first won amount found anywhere in the text is used
    instead.  Multiple results are joined with ``" / "``.

    Args:
        text: Full text of the post; ``None`` or empty input yields ``""``.
    """
    if not text:
        return ""

    fee_keys = ("회비", "가입비", "예치금", "참가비")
    fees = [
        ln
        for ln in (raw.strip() for raw in text.splitlines())
        if any(key in ln for key in fee_keys) and ("원" in ln or "무료" in ln)
    ]

    if not fees:
        # Match the whole digit run so that amounts written without
        # thousands separators (e.g. "10000원") are not truncated to their
        # last three digits; comma-grouped amounts still match in full.
        m = re.search(r"\d+(?:,\d{3})*원", text)
        if m:
            fees.append(m.group(0))

    return " / ".join(fees)


# ==============================
# 3. 전체 파일 처리
# ==============================

def process_file(input_path, output_path, max_samples=None):
    """Parse every club post in a JSONL file into a structured record.

    For each input line the ``"all_text"`` field is cleaned and run through
    the name/period/activity/fee extractors and the three zero-shot
    classifiers; the resulting record is written as one JSON line.

    Args:
        input_path: Path (str or ``Path``) of the source JSONL file.
        output_path: Path (str or ``Path``) of the JSONL file to write.
        max_samples: If not ``None``, stop once this many input lines have
            been read.  Skipped lines count towards the limit, because the
            index comes from enumerating the raw lines.

    Lines that are not valid JSON (including blank lines) and JSON values
    that are not objects are skipped.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    with input_path.open("r", encoding="utf-8") as fin, \
         output_path.open("w", encoding="utf-8") as fout:

        for idx, line in enumerate(tqdm(fin, desc="Parsing clubs")):

            if max_samples is not None and idx >= max_samples:
                break

            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Malformed or blank line: skip it, but let unrelated errors
                # (e.g. KeyboardInterrupt during a long run) propagate.
                continue
            if not isinstance(obj, dict):
                # A bare list/number/string has no fields to read.
                continue

            text = clean_text(obj.get("all_text", ""))

            club_name = extract_club_name(text)
            club_type, score = classify_club_type(text)
            main_cat, main_score = classify_main_category(text)
            sub_cat, sub_score = classify_sub_category(text)
            recruitment_period = extract_recruitment_period(text)

            # Activity dates are searched only inside the activity block.
            block = extract_activity_block(text)
            dates = extract_activity_dates_from_block(block)
            fee = extract_fee(text)

            out = {
                "id": obj.get("id"),
                "url": obj.get("url"),
                "club_name": club_name,
                "club_type": club_type,
                "club_type_score": score,
                "category_main": main_cat,
                "category_main_score": main_score,
                "category_sub": sub_cat,
                "category_sub_score": sub_score,
                "recruitment_period": recruitment_period,
                "activity_dates": dates,
                "activity_description": block,
                "fee": fee,
                # Only the first 4000 characters of the cleaned text are kept.
                "raw_text": text[:4000],
            }

            fout.write(json.dumps(out, ensure_ascii=False) + "\n")

    print("Done:", output_path)


# ==============================
# 4. 실행
# ==============================
# Parse the whole de-duplicated file; max_samples=None disables the line cap.
process_file(INPUT_JSONL, OUTPUT_JSONL, max_samples=None)

Device set to use cuda:0
Parsing clubs: 537it [14:36,  1.63s/it]

Done: /content/drive/MyDrive/Colab Notebooks/NLP/everytime_club_parsed.jsonl



