# Neo4j 그래프 DB 구축 — Recursive Chunking 버전

GraphBuilder.ipynb와 동일한 흐름이지만 **recursive_chunk_text**로 청킹하여 DB에 적재합니다.

## GraphBuilder와의 차이

| 항목 | GraphBuilder | 이 노트북 |
|------|--------------|----------|
| 청킹 함수 | `chunk_text` (고정 500자) | `recursive_chunk_text` |
| 구분자 | 없음 (글자 수만) | `\n\n` → `\n` → `. ` → `。` → `, ` → ` ` 순 |
| 비교 섹션 | 없음 | 있음 (고정 vs Recursive) |

## 구분자 우선순위 (RecursiveCharacterTextSplitter 스타일)

1. `\n\n` (문단)
2. `\n` (줄)
3. `. ` (문장 끝)
4. `。` (전각 마침표 — 중국어·일본어 문장 끝; 한국어 문장은 3번의 `. `로 처리)
5. `, ` (쉼표)
6. ` ` (공백)

## 1. 데이터 로드

`data_scrapping.py`로 수집한 뉴스 기사 Excel 파일을 로드합니다. 상위 디렉터리의 `Articles_*.xlsx` 중 파일명 내림차순으로 가장 앞선(최신) 파일을 자동으로 선택합니다.

In [None]:
import os
import pandas as pd
import glob

# Collect the scraped exports from the parent directory. Sorting the names in
# descending order puts the lexicographically last file first, and that file
# is the one loaded below.
files = glob.glob(os.path.join('..', 'Articles_*.xlsx'))
files.sort(reverse=True)
if len(files) == 0:
    raise FileNotFoundError('Articles_*.xlsx 파일을 찾을 수 없습니다. data_scrapping.py를 먼저 실행하세요.')

input_file = files[0]
df = pd.read_excel(input_file)
print(f'Loaded: {input_file}, {len(df)} articles')
# Last expression of the cell: shows a preview of the loaded frame.
df.head()

## 2. Neo4j 연결

In [None]:
import neo4j
import os
import dotenv

# The .env file is looked up one level above the current working directory;
# override=True is passed so values from the file win over existing variables.
env_path = os.path.join(os.path.dirname(os.path.abspath("")), ".env")
dotenv.load_dotenv(dotenv_path=env_path, override=True)

# Each setting falls back to a local default when the variable is unset.
URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
neo4j_user = os.getenv("NEO4J_USERNAME", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD", "password")
AUTH = (neo4j_user, neo4j_password)
DB_NAME = os.getenv("NEO4J_DB", "neo4j")

# Create the driver and check the connection before any later cell uses it.
driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()
print(f'Connected to {URI}')
print(f'Database: {DB_NAME}')

## 3. 청킹 함수 정의

- **chunk_text**: 고정 500자 + overlap 100 (비교용)
- **recursive_chunk_text**: 구분자 우선순위 기반 Recursive 방식 (LangChain 없이 순수 Python)

In [None]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into fixed-size character windows that overlap.

    This is the fixed-size baseline used for comparison with
    ``recursive_chunk_text``; it ignores sentence and paragraph boundaries.

    Args:
        text: Value to chunk. Non-string values are converted with ``str``.
            Missing values (``None``/NaN) and the empty string yield no chunks.
        chunk_size: Maximum number of characters per window.
        overlap: Number of characters shared by two consecutive windows.

    Returns:
        The whitespace-stripped, non-blank windows in document order.

    Raises:
        ValueError: If ``overlap`` is not in ``[0, chunk_size)``. A step of
            zero or less would otherwise fail obscurely or silently return
            nothing.
    """
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    if pd.isna(text) or text == '':
        return []
    text = str(text)
    # Each window starts `step` characters after the previous one, so the last
    # `overlap` characters of a window are repeated at the start of the next.
    step = chunk_size - overlap
    windows = (
        text[start:start + chunk_size].strip()
        for start in range(0, len(text), step)
    )
    return [window for window in windows if window]


def recursive_chunk_text(text, chunk_size=500, overlap=100, separators=None):
    """Chunk text on progressively finer separators (pure Python, no LangChain).

    The text is split on the first separator and the pieces are greedily
    merged back into chunks of at most ``chunk_size`` characters. Any merged
    chunk that is still too long is split again with the next separator. Once
    the separators are exhausted, the remainder is cut into fixed-size windows.
    Every returned chunk is therefore at most ``chunk_size`` characters long.

    Args:
        text: Value to chunk. Non-string values are converted with ``str``.
            ``None``, NaN, other falsy values and blank text yield no chunks.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of trailing characters of a finished chunk that are
            carried over to the start of the next one. The tail is taken by
            character count, so it may begin in the middle of a word.
        separators: Separators in priority order. Defaults to paragraph, line,
            sentence end, full-width full stop, comma, space. An empty-string
            separator means "split into single characters".

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``overlap`` is not in ``[0, chunk_size)``. The
            fixed-size fallback could never advance with such values.
    """
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    if separators is None:
        separators = ["\n\n", "\n", ". ", "。", ", ", " "]
    # NaN is truthy and str(NaN) is "nan", so without this check a missing
    # cell would be stored as the literal chunk "nan". NaN is the only float
    # that is not equal to itself, which keeps this check dependency-free.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not text or not str(text).strip():
        return []
    text = str(text).strip()

    def _merge_splits(pieces, sep):
        """Greedily pack pieces (separator re-attached) into chunks."""
        if not pieces:
            return []
        if sep:
            # str.split drops the separator; put it back on all but the last.
            pieces = [p + sep for p in pieces[:-1]] + [pieces[-1]]
        chunks = []
        current = []
        current_len = 0  # running length of "".join(current)
        for piece in pieces:
            if current_len + len(piece) <= chunk_size:
                current.append(piece)
                current_len += len(piece)
                continue
            # The buffer is full: emit it and seed the next one with its tail.
            prev = "".join(current)
            if prev.strip():
                chunks.append(prev.strip())
            tail = prev[-overlap:] if overlap else ""
            if tail.strip():
                current = [tail, piece]
                current_len = len(tail) + len(piece)
            else:
                current = [piece]
                current_len = len(piece)
        last = "".join(current)
        if last.strip():
            chunks.append(last.strip())
        return chunks

    def _hard_split(t):
        """Cut separator-free text into fixed windows (iterative, no recursion)."""
        windows = []
        step = chunk_size - overlap
        # Everything at or after this index is whitespace, so a window that
        # would start there carries no content.
        content_end = len(t.rstrip())
        start = 0
        while start < content_end:
            if len(t) - start <= chunk_size:
                windows.append(t[start:])
                break
            windows.append(t[start:start + chunk_size])
            start += step
        return windows

    def _split_recursive(t, sep_idx):
        """Split ``t`` with separators[sep_idx], descending only when needed."""
        if not t or not t.strip():
            return []
        if sep_idx >= len(separators):
            return _hard_split(t)
        sep = separators[sep_idx]
        pieces = list(t) if sep == "" else t.split(sep)
        result = []
        for merged in _merge_splits(pieces, sep):
            if len(merged) <= chunk_size:
                result.append(merged)
            else:
                # Still too long: retry this chunk with the next separator.
                result.extend(_split_recursive(merged, sep_idx + 1))
        return result

    return _split_recursive(text, 0)

## 4. 청킹 비교 (고정 vs Recursive)

기사 1~2건에 대해 두 방식의 결과를 비교합니다.

In [None]:
# Print both chunkers' output side by side for (at most) the first two articles.
for idx in range(min(2, len(df))):
    row = df.iloc[idx]
    text = row.get('content', '')
    if pd.isna(text) or not str(text).strip():
        continue

    # A title cell may hold NaN (a float), which cannot be sliced. Convert it
    # to text before truncating it for the heading.
    raw_title = row.get('title', '')
    title = '' if pd.isna(raw_title) else str(raw_title)[:50]

    fixed = chunk_text(text, 500, 100)
    recursive = recursive_chunk_text(text, 500, 100)

    print(f'=== 기사 {idx+1}: {title}... ===')
    print(f'원문: {len(str(text))}자')
    print(f'고정: {len(fixed)}개 청크')
    for i, c in enumerate(fixed):
        print(f'  [{i}] {len(c)}자 | {c[:60]}...')
    print(f'Recursive: {len(recursive)}개 청크')
    for i, c in enumerate(recursive):
        print(f'  [{i}] {len(c)}자 | {c[:60]}...')
    print()

## 5. DB 초기화 및 제약조건

In [None]:
def clear_database(tx):
    """Delete every node in the database together with its relationships.

    Args:
        tx: Transaction-like object exposing ``run``.
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    tx.run(wipe_everything)

def create_constraints(tx):
    """Create the uniqueness constraints for the five node labels.

    Each statement uses ``IF NOT EXISTS``. The statements are issued in the
    order Article, Content, Media, Category, Author.

    Args:
        tx: Transaction-like object exposing ``run``.
    """
    # (Cypher variable, node label, property that must be unique)
    unique_keys = (
        ("a", "Article", "article_id"),
        ("c", "Content", "content_id"),
        ("m", "Media", "name"),
        ("cat", "Category", "name"),
        ("au", "Author", "name"),
    )
    for var, label, prop in unique_keys:
        tx.run(
            f"CREATE CONSTRAINT IF NOT EXISTS FOR ({var}:{label}) "
            f"REQUIRE {var}.{prop} IS UNIQUE"
        )

# Wipe existing data, then create the constraints. Each step runs in its own
# write transaction, in this order.
with driver.session(database=DB_NAME) as session:
    for setup_step in (clear_database, create_constraints):
        session.execute_write(setup_step)
print('DB 초기화 완료')

## 6. 노드 및 관계 생성 함수

In [None]:
def create_article_node(tx, article_data):
    """Create or update the Article node identified by ``article_id``.

    Args:
        tx: Transaction-like object exposing ``run``.
        article_data: Mapping expanded into query parameters. The query reads
            ``article_id``, ``title``, ``url`` and ``published_date``.
    """
    upsert_article = """
        MERGE (a:Article {article_id: $article_id})
        SET a.title = $title, a.url = $url, a.published_date = $published_date
    """
    tx.run(upsert_article, **article_data)

def create_content_nodes(tx, article_id, content_chunks, article_data):
    """Store each chunk as a Content node and link it to its Article.

    For every chunk, one query creates or updates the Content node keyed
    ``"<article_id>_chunk_<index>"``. A second query merges a ``HAS_CHUNK``
    relationship from the Article to that node.

    Args:
        tx: Transaction-like object exposing ``run``.
        article_id: Identifier of the owning article.
        content_chunks: Chunk strings in document order.
        article_data: Mapping providing ``title``, ``url`` and
            ``published_date``, which are copied onto every Content node.
    """
    upsert_chunk = """
            MERGE (c:Content {content_id: $content_id})
            SET c.chunk = $chunk, c.article_id = $article_id, c.title = $title,
                c.url = $url, c.published_date = $published_date, c.chunk_index = $chunk_index
        """
    link_chunk = """
            MATCH (a:Article {article_id: $article_id})
            MATCH (c:Content {content_id: $content_id})
            MERGE (a)-[:HAS_CHUNK]->(c)
        """
    for position, chunk_body in enumerate(content_chunks):
        chunk_key = f"{article_id}_chunk_{position}"
        # article_data is read inside the loop, so an empty chunk list never
        # touches it.
        tx.run(
            upsert_chunk,
            content_id=chunk_key,
            chunk=chunk_body,
            article_id=article_id,
            title=article_data['title'],
            url=article_data['url'],
            published_date=article_data['published_date'],
            chunk_index=position,
        )
        tx.run(link_chunk, article_id=article_id, content_id=chunk_key)

def create_media_node_and_relationship(tx, article_id, source):
    """Merge the Media node for ``source`` and link it to the article.

    Does nothing when ``source`` is missing (``pd.isna``) or the empty string.

    Args:
        tx: Transaction-like object exposing ``run``.
        article_id: Identifier of the article the media published.
        source: Media name.
    """
    has_source = not (pd.isna(source) or source == '')
    if has_source:
        merge_media = "MERGE (m:Media {name: $source})"
        link_media = """MATCH (a:Article {article_id: $article_id}) MATCH (m:Media {name: $source})
              MERGE (m)-[:PUBLISHED]->(a)"""
        tx.run(merge_media, source=source)
        tx.run(link_media, article_id=article_id, source=source)

def create_category_node_and_relationship(tx, article_id, category):
    """Merge the Category node for ``category`` and link the article to it.

    Does nothing when ``category`` is missing (``pd.isna``) or the empty string.

    Args:
        tx: Transaction-like object exposing ``run``.
        article_id: Identifier of the article that belongs to the category.
        category: Category name.
    """
    has_category = not (pd.isna(category) or category == '')
    if has_category:
        merge_category = "MERGE (cat:Category {name: $category})"
        link_category = """MATCH (a:Article {article_id: $article_id}) MATCH (cat:Category {name: $category})
              MERGE (a)-[:BELONGS_TO]->(cat)"""
        tx.run(merge_category, category=category)
        tx.run(link_category, article_id=article_id, category=category)

def create_author_node_and_relationship(tx, article_id, author):
    """Merge the Author node for ``author`` and link it to the article.

    Does nothing when ``author`` is missing (``pd.isna``) or the empty string.

    Args:
        tx: Transaction-like object exposing ``run``.
        article_id: Identifier of the article the author wrote.
        author: Author name.
    """
    has_author = not (pd.isna(author) or author == '')
    if has_author:
        merge_author = "MERGE (au:Author {name: $author})"
        link_author = """MATCH (a:Article {article_id: $article_id}) MATCH (au:Author {name: $author})
              MERGE (au)-[:WROTE]->(a)"""
        tx.run(merge_author, author=author)
        tx.run(link_author, article_id=article_id, author=author)

## 7. 그래프 빌드 (recursive_chunk_text 사용)

In [None]:
def build_graph_from_dataframe(df, chunk_size=500, overlap=100):
    """Load every article row into Neo4j, chunking content recursively.

    Each row is written in a single write transaction, so an article is
    stored completely or not at all. The transaction covers the Article node,
    its Content chunks (from ``recursive_chunk_text``) and the Media, Category
    and Author links. A failing row is reported and skipped so that the rest
    of the import continues.

    Args:
        df: DataFrame with one article per row. ``article_id`` is required per
            row. ``title``, ``url``, ``published_date``, ``content``,
            ``source``, ``category`` and ``author`` are used when present.
        chunk_size: Maximum characters per content chunk.
        overlap: Characters carried over between consecutive chunks.
    """
    total = len(df)
    with driver.session(database=DB_NAME) as session:
        # Progress is counted by row position; the index label (idx) is only
        # used in messages, because it need not be a 0-based integer.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            try:
                article_id = row.get('article_id', '')
                if pd.isna(article_id) or article_id == '':
                    # Without an id, all such rows would merge into one node.
                    print(f'기사 {idx} 건너뜀: article_id 없음')
                else:
                    article_data = {
                        'article_id': article_id,
                        'title': row.get('title', ''),
                        'url': row.get('url', ''),
                        'published_date': str(row.get('published_date', ''))
                    }
                    # Chunk outside the transaction: it is pure computation.
                    content_chunks = []
                    if 'content' in row and pd.notna(row['content']) and row['content'] != '':
                        content_chunks = recursive_chunk_text(row['content'], chunk_size, overlap)
                    session.execute_write(_write_article, row, article_data, content_chunks)
            except Exception as e:
                # Deliberate best-effort: report the row and keep importing.
                print(f'기사 {idx} 처리 중 오류: {e}')

            if position % 10 == 0:
                print(f'진행률: {position}/{total} ({(position / total * 100):.1f}%)')


def _write_article(tx, row, article_data, content_chunks):
    """Write one article and everything attached to it within ``tx``."""
    article_id = article_data['article_id']
    create_article_node(tx, article_data)
    if content_chunks:
        create_content_nodes(tx, article_id, content_chunks, article_data)
    if 'source' in row:
        create_media_node_and_relationship(tx, article_id, row['source'])
    if 'category' in row:
        create_category_node_and_relationship(tx, article_id, row['category'])
    if 'author' in row:
        create_author_node_and_relationship(tx, article_id, row['author'])

# Run the import with 500-character chunks and a 100-character overlap.
build_graph_from_dataframe(df, 500, 100)
print('\nGraph build complete! (recursive_chunk_text)')

## 8. 검증 — 그래프 요약

In [None]:
# Report node totals per first label, then relationship totals per type.
node_count_query = """
        MATCH (n) RETURN labels(n)[0] AS Label, count(n) AS Count
        ORDER BY Count DESC
    """
relationship_count_query = """
        MATCH ()-[r]->() RETURN type(r) AS Relationship, count(r) AS Count
        ORDER BY Count DESC
    """
with driver.session(database=DB_NAME) as session:
    # (query, heading printed before its rows, column holding the name)
    summaries = (
        (node_count_query, '=== Node Counts ===', 'Label'),
        (relationship_count_query, '\n=== Relationship Counts ===', 'Relationship'),
    )
    for query, heading, name_key in summaries:
        records = session.run(query)
        print(heading)
        for record in records:
            print(f'  {record[name_key]:12s}: {record["Count"]:>5d}')

In [None]:
# Count the articles linked to each category, largest category first.
category_query = """
        MATCH (a:Article)-[:BELONGS_TO]->(cat:Category)
        RETURN cat.name AS Category, count(a) AS Articles ORDER BY Articles DESC
    """
with driver.session(database=DB_NAME) as session:
    records = session.run(category_query)
    print('=== Articles per Category ===')
    for record in records:
        print(f'  {record["Category"]:12s}: {record["Articles"]}개')

In [None]:
# Show one article with its media, category and number of content chunks.
sample_query = """
        MATCH (a:Article) WITH a LIMIT 1
        OPTIONAL MATCH (m:Media)-[:PUBLISHED]->(a)
        OPTIONAL MATCH (a)-[:BELONGS_TO]->(cat:Category)
        OPTIONAL MATCH (a)-[:HAS_CHUNK]->(c:Content)
        RETURN a.article_id AS id, a.title AS title, m.name AS media,
               cat.name AS category, count(c) AS chunks
    """
# (label padded to a common width, result column)
sample_fields = (
    ('Article:  ', 'id'),
    ('Title:    ', 'title'),
    ('Media:    ', 'media'),
    ('Category: ', 'category'),
)
with driver.session(database=DB_NAME) as session:
    for record in session.run(sample_query):
        for label, column in sample_fields:
            print(f'{label}{record[column]}')
        print(f'Chunks:   {record["chunks"]}개')

## 9. 정리

In [None]:
# Close the Neo4j driver now that all queries in this notebook have run.
driver.close()
print('Neo4j connection closed.')