# In [None]:
# ===== 0. 必要なライブラリのインポート =====
print("Installing necessary libraries...")
# !pip install pdfminer.six bertopic spacy umap-learn hdbscan scikit-learn pandas networkx matplotlib plotly openpyxl markdown-it-py python-frontmatter sentence-transformers keybert
# print("Downloading spaCy model...")
# !python -m spacy download en_core_web_sm

import re
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
import umap
import hdbscan
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import itertools
import numpy as np
import os
import glob # ファイル検索用
from markdown_it import MarkdownIt # マークダウン解析用
import frontmatter # マークダウンのフロントマター解析用

# 追加する可能性のあるライブラリ
from gensim.models.phrases import Phrases, Phraser
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
# from textblob import TextBlob # 感情分析用 (オプション)

print("Libraries imported.")

# ===== A. Analysis toolset: feature flags and parameters =====
# Toggle these True/False to choose which tools the pipeline uses.
USE_MARKDOWN_INPUT = True # When True, Markdown files are loaded as well
USE_NGRAMS = True
NGRAM_MIN_COUNT = 5
NGRAM_THRESHOLD = 10
USE_POS_FILTERING = True
ALLOWED_POS = ['NOUN', 'PROPN', 'ADJ'] # e.g. nouns, proper nouns, adjectives
USE_NER_EXTRACTION = False # Set to True if needed
ALLOWED_NER_LABELS = ['ORG', 'PRODUCT', 'PERSON', 'WORK_OF_ART', 'EVENT'] # example set
USE_SENTENCE_BERT_EMBEDDINGS = True # Word Embeddings (Sentence-BERT)
USE_KEYBERT_EXTRACTION = True # Keyword extraction with KeyBERT
USE_WATCHLIST_ANALYSIS = True # Watchlist keyword analysis

# Sentence-BERT model name
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2' # or 'stsb-roberta-large', 'paraphrase-multilingual-mpnet-base-v2', etc.

# Watchlist keywords (either write them close to their preprocessed form, or run them through preprocessing too)
WATCHLIST_KEYWORDS_RAW = ["terraforming governance", "memoryfield feedback", "semantic divergence", "構造知性"] # example

# ===== 1. 関数定義 (PDF抽出、CV分割 - 既存) =====
# (extract_text_from_pdf_pdfminer, split_cv_by_company は既存のものをそのまま使用)
def extract_text_from_pdf_pdfminer(pdf_path):
    """Extract the text of every page of a PDF using pdfminer.six.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated text of all pages as a single string, or ``None``
        when the file does not exist or pdfminer fails to read it (an error
        message is printed in both cases).
    """
    if not os.path.exists(pdf_path):
        print(f"Error in extract_text: PDF file not found at {pdf_path}")
        return None

    text_buffer = StringIO()
    try:
        with open(pdf_path, 'rb') as pdf_stream:
            document = PDFDocument(PDFParser(pdf_stream))
            resource_manager = PDFResourceManager()
            layout_params = LAParams(line_margin=0.5, boxes_flow=0.5)
            # The converter writes the laid-out text of each processed page
            # into text_buffer.
            converter = TextConverter(resource_manager, text_buffer, laparams=layout_params)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for pdf_page in PDFPage.create_pages(document):
                page_interpreter.process_page(pdf_page)
    except FileNotFoundError:
        # The file can disappear between the existence check and open().
        print(f"Error in extract_text: PDF file not found (during open) at {pdf_path}")
        return None
    except Exception as err:
        print(f"Error reading PDF with pdfminer.six: {err}")
        return None

    return text_buffer.getvalue()

def split_cv_by_company(text):
    """Split CV text into one section per job entry.

    The function looks for the first line containing "professional
    experience" (case-insensitive, whitespace-collapsed), then treats every
    later line of the form ``<title>, <Mon YYYY[ - Mon YYYY|word]>`` or
    ``<title>, YYYY - YYYY`` as a job header. Each section starts at the
    nearest non-blank line above its header and runs up to, but not
    including, the nearest non-blank line above the next header; the last
    section runs to the end of the text.

    Args:
        text: Full CV text with ``\\n`` line separators.

    Returns:
        A list of stripped section strings. The list is empty when the input
        is empty, the "Professional Experience" header is missing, or no job
        header line matches.
    """
    print("\n--- DEBUG (split_cv): Entering function ---")
    if not text:
        print("--- DEBUG (split_cv): Input text is empty or None.")
        return []

    lines = text.split('\n')

    def rewind_over_blanks(pos, floor):
        # Walk upwards from pos while the line is blank, stopping at floor.
        while pos > floor and not lines[pos].strip():
            pos -= 1
        return pos

    print("--- DEBUG (split_cv): Searching for 'Professional Experience' header ---")
    body_start = -1
    for idx, raw_line in enumerate(lines):
        # Collapse whitespace runs so irregular spacing still matches.
        if "professional experience" in ' '.join(raw_line.lower().split()):
            print(f"--- DEBUG (split_cv): Found header at line index {idx}")
            body_start = idx + 1
            break
    if body_start == -1:
        print("--- DEBUG (split_cv): 'Professional Experience' header not found.")
        return []

    job_date_re = re.compile(
        r"^[A-Za-z\s,/()&]+,"
        r"\s*"
        r"(?:"
            r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}"
            r"(?:\s*[-–]\s*"
            r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|\w+)"
            r"(?:\s+\d{4})?)?"
        r"|"
            r"\d{4}\s*[-–]\s*\d{4}"
        r")$"
    )
    # Lines starting with a digit or a bullet are not reported as near-misses.
    digit_or_bullet_re = re.compile(r"^[\d•]")

    print(f"--- DEBUG (split_cv): Searching for job/date pattern from line index {body_start} ---")
    header_positions = []
    for idx, raw_line in enumerate(lines[body_start:], start=body_start):
        candidate = raw_line.strip()
        if not candidate:
            continue
        # Cheap pre-filter: a header needs a comma and at least one digit.
        if ',' not in candidate or not any(ch.isdigit() for ch in candidate):
            continue
        if job_date_re.match(candidate):
            print(f"DEBUG (split_cv) Line {idx}: MATCH! -> '{candidate}'")
            header_positions.append(idx)
        elif not digit_or_bullet_re.match(candidate):
            print(f"DEBUG (split_cv) Line {idx}: NO MATCH -> '{candidate}'")
    print(f"--- DEBUG (split_cv): Finished pattern search. Found {len(header_positions)} potential header lines.")
    if not header_positions:
        print("--- DEBUG (split_cv): No job/date lines matched the pattern.")
        return []

    sections = []
    previous_header = body_start - 1
    for position, header in enumerate(header_positions):
        # Start at the closest non-blank line above the header, but never
        # reach back past the previous header.
        start = max(previous_header + 1, rewind_over_blanks(header - 1, body_start - 1))
        if position + 1 < len(header_positions):
            # End just before the closest non-blank line above the next
            # header, because that line opens the next section.
            end = rewind_over_blanks(header_positions[position + 1] - 1, header)
        else:
            end = len(lines)
        chunk = lines[start:end]
        if any(entry.strip() for entry in chunk):
            sections.append("\n".join(chunk).strip())
        previous_header = header

    print(f"--- DEBUG (split_cv): Returning {len(sections)} sections. ---")
    return sections
# Progress marker: the PDF-extraction and CV-splitting helpers above are now defined.
print("Helper functions defined.")


# ===== 1.5. New function definitions (Markdown handling, preprocessing tools) =====
def clean_markdown_text(md_text):
    """Reduce Markdown (or HTML) text to plain text using simple regex rules.

    This is a lightweight approximation, not a full Markdown parser: it
    strips HTML tags, heading and list markers, emphasis markers, inline-code
    backticks, and replaces links/images with their visible text.

    Args:
        md_text: Markdown or HTML text. ``None`` or an empty string is
            accepted.

    Returns:
        The cleaned text with all whitespace runs collapsed to single
        spaces; ``""`` for empty or ``None`` input.
    """
    if not md_text:
        return ""

    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', ' ', md_text)
    # Remove Markdown-specific markers (headings, lists, emphasis, links, ...).
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)  # headings
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)  # list markers
    text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)  # bold
    text = re.sub(r'(\*|_)(.*?)(\*|_)', r'\2', text)  # italics
    text = re.sub(r'`(.*?)`', r'\1', text)  # inline code
    # Images must be handled before links: the link pattern also matches the
    # "[alt](url)" part of "![alt](url)" and would leave a stray "!" behind.
    text = re.sub(r'!\[(.*?)\]\(.*?\)', r'\1', text)  # images (keep alt text only)
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)  # links (keep link text only)
    # Collapse consecutive whitespace into a single space.
    return re.sub(r'\s+', ' ', text).strip()

def extract_text_from_markdown(md_file_path):
    """Extract plain text from a Markdown file, ignoring its front matter.

    The file body is rendered to HTML with markdown-it-py, the tags are
    stripped by ``clean_markdown_text``, and HTML entities produced by the
    renderer (``&amp;``, ``&lt;``, ``&quot;`` ...) are decoded back to the
    characters they stand for.

    Args:
        md_file_path: Path of the Markdown file (read as UTF-8).

    Returns:
        The extracted plain text, or ``None`` when the file does not exist
        or cannot be read/parsed (an error message is printed).
    """
    from html import unescape  # local import: only needed by this function

    if not os.path.exists(md_file_path):
        print(f"Error: Markdown file not found at {md_file_path}")
        return None
    try:
        with open(md_file_path, 'r', encoding='utf-8') as f:
            # frontmatter separates the YAML header from the body; only the
            # body is analysed.
            post = frontmatter.load(f)
            content = post.content
        # Render to HTML first, then strip tags and leftover Markdown markers.
        html_content = MarkdownIt().render(content)
        text_content = clean_markdown_text(html_content)
        # Decode entities only after tag stripping, so escaped text such as
        # "&lt;b&gt;" is kept as literal "<b>" instead of being removed as a
        # tag. Re-collapse whitespace because e.g. "&nbsp;" decodes to a
        # non-breaking space.
        return ' '.join(unescape(text_content).split())
    except Exception as e:
        print(f"Error reading Markdown file {md_file_path}: {e}")
        return None

def preprocess_text_with_spacy(text, nlp, custom_stop_words=None):
    """Tokenize and normalise text with a spaCy pipeline.

    Behaviour is controlled by the module-level flags ``USE_NER_EXTRACTION``
    / ``ALLOWED_NER_LABELS`` and ``USE_POS_FILTERING`` / ``ALLOWED_POS``.

    Args:
        text: Text to process. Anything falsy or not a ``str`` yields ``[]``.
        nlp: Callable spaCy pipeline; ``nlp(text)`` must return a document
            exposing ``ents`` and iterable tokens.
        custom_stop_words: Optional container of extra stop words, compared
            against lower-cased lemmas.

    Returns:
        A list of strings: first the lower-cased, stripped text of allowed
        named entities (only when NER extraction is enabled), then the
        lower-cased lemmas of the tokens that survive the filters.
    """
    if not (text and isinstance(text, str)):
        return []

    parsed = nlp(text)
    collected = []

    # Named entities are kept in their surface form (not lemmatised).
    if USE_NER_EXTRACTION:
        collected.extend(
            entity.text.lower().strip()
            for entity in parsed.ents
            if entity.label_ in ALLOWED_NER_LABELS
        )

    for tok in parsed:
        if tok.is_stop:
            continue
        lemma = tok.lemma_.lower()
        if custom_stop_words and lemma in custom_stop_words:
            continue
        # Drop punctuation and single-character tokens.
        if tok.is_punct or len(tok.text) <= 1:
            continue
        if USE_POS_FILTERING and tok.pos_ not in ALLOWED_POS:
            continue
        collected.append(lemma)
    return collected

def generate_ngrams(processed_token_list, min_count=5, threshold=10):
    """Merge frequent collocations into bigram/trigram tokens with gensim Phrases.

    Args:
        processed_token_list: List of documents, each a space-separated
            string of already preprocessed tokens. Entries that are not
            ``str`` are skipped, so the result can be shorter than the input
            if such entries are present.
        min_count: Minimum collected count for a phrase candidate; applied
            to both the bigram and the trigram pass.
        threshold: Phrases score threshold; applied to both passes.

    Returns:
        A list of space-separated strings in which detected phrases are
        joined by gensim's phrase delimiter. The input object is returned
        unchanged when it is empty, when ``USE_NGRAMS`` is false, when it
        contains no strings, or when phrase detection raises.
    """
    # Check the cheap local condition first, then the module-level switch.
    if not processed_token_list or not USE_NGRAMS:
        return processed_token_list

    # Phrases expects a list of token lists.
    docs_for_phrases = [doc.split() for doc in processed_token_list if isinstance(doc, str)]
    if not docs_for_phrases:
        return processed_token_list

    try:
        bigram_model = Phrases(docs_for_phrases, min_count=min_count, threshold=threshold)
        bigram_phraser = Phraser(bigram_model)
        # Materialise the bigram-transformed corpus once; it is used both to
        # train the trigram model and as the input of the final transform.
        bigram_docs = [bigram_phraser[doc_tokens] for doc_tokens in docs_for_phrases]

        # Phrases falls back to its own default min_count when none is
        # given, so the caller's value has to be forwarded explicitly.
        trigram_model = Phrases(bigram_docs, min_count=min_count, threshold=threshold)
        trigram_phraser = Phraser(trigram_model)

        return [" ".join(trigram_phraser[doc_tokens]) for doc_tokens in bigram_docs]
    except Exception as e:
        print(f"Error generating ngrams: {e}")
        return processed_token_list  # fall back to the original list on error


# ===== 2. Data loading and preparation =====
print("\n--- Loading Data Sources ---")
# --- CV data (both PDF and Markdown sources are considered) ---
cv_text_sources = [] # raw text sources are collected here as dicts with "source", "text" and "type" keys

# CV data from PDF (existing)
pdf_file_path = '/content/drive/MyDrive/Yasuyuki_Sakane_20250403.pdf' # ★ set the correct file path ★
print(f"Checking for CV PDF file at: {pdf_file_path}")
if os.path.exists(pdf_file_path):
    print("CV PDF file found. Attempting to extract text...")
    cv_text_pdf = extract_text_from_pdf_pdfminer(pdf_file_path)
    # A None/empty extraction result is silently skipped here.
    if cv_text_pdf:
        cv_text_sources.append({"source": pdf_file_path, "text": cv_text_pdf, "type": "pdf_cv"})
else:
    print(f"Warning: CV PDF file not found at {pdf_file_path}")

# CV data from Markdown (newly added)
if USE_MARKDOWN_INPUT:
    markdown_cv_folder = '/content/drive/MyDrive/CV_Markdown/' # ★ folder containing the Markdown CV files ★
    md_cv_files = glob.glob(os.path.join(markdown_cv_folder, "*.md"))
    print(f"Found {len(md_cv_files)} Markdown CV files in {markdown_cv_folder}")
    for md_file in md_cv_files:
        print(f"Reading Markdown CV from: {md_file}")
        md_text = extract_text_from_markdown(md_file)
        if md_text:
            cv_text_sources.append({"source": md_file, "text": md_text, "type": "md_cv"})

# Split the CV texts into sections (supports multiple CV sources)
cv_sections = []
for cv_source in cv_text_sources:
    if cv_source["text"]:
        print(f"\nAttempting to split text from {cv_source['source']} into sections...")
        # split_cv_by_company assumes the PDF layout, so it may not suit Markdown input.
        # For Markdown, either add structure-based splitting (e.g. per H2 heading) or treat the whole text as one section.
        if cv_source["type"] == "pdf_cv":
            sections = split_cv_by_company(cv_source["text"])
            if sections:
                cv_sections.extend(sections)
            else:
                print(f"Warning: PDF CV splitting resulted in 0 sections for {cv_source['source']}. Using full text.")
                cv_sections.append(cv_source["text"]) # if splitting fails, add the full text as a single section
        elif cv_source["type"] == "md_cv":
            # For Markdown, simply treat the full text as one section, or
            # add splitting logic based on the Markdown structure (e.g. headings of a given level) here.
            print(f"Info: For Markdown CV {cv_source['source']}, using full text as one section. Consider custom splitting if needed.")
            cv_sections.append(cv_source["text"])

print(f"Total CV sections collected: {len(cv_sections)}")
if not cv_sections:
    print("Warning: No CV sections were loaded or processed.")


# --- Journal data (existing) ---
journal_file_path = '/content/drive/MyDrive/Journal_02_20250504.csv'
# (existing journal-loading code)
# journal_docs stays an empty list when the CSV is missing, unreadable, or has fewer than three columns.
journal_docs = []
try:
    # The CSV is read without a header row, so columns are addressed by position.
    df_journal = pd.read_csv(journal_file_path, header=None)
    if df_journal.shape[1] > 2:
         # Third column (index 2): drop missing values and coerce the rest to str.
         journal_docs = df_journal.iloc[:, 2].dropna().astype(str).tolist()
         print(f"Loaded {len(journal_docs)} journal entries from CSV (Column C).")
    else: print(f"Error: Column C (index 2) not found in {journal_file_path}. Check CSV structure.")
except FileNotFoundError: print(f"Journal CSV file not found at {journal_file_path}")
except Exception as e: print(f"Error reading journal CSV: {e}")


# --- Term (buzzword) list (existing) ---
buzzword_file_path = '/content/drive/MyDrive/BuzzWords2024_Pillar.xlsx'
# (existing term-list loading code)
my_keywords_from_list_set = set()
# Defined before the try block so that later sections can always reference
# it, even when the workbook is missing or cannot be parsed.
my_keywords_from_list = []
# Maps each lower-cased term to a dict of metadata; 'pillar' collects the
# names of all sheets the term appears in.
keyword_metadata = {}
try:
    # sheet_name=None loads every sheet into a {sheet name: DataFrame} dict.
    all_sheets_df = pd.read_excel(buzzword_file_path, sheet_name=None)
    print(f"Read {len(all_sheets_df)} sheets from Buzzwords Excel.")
    for sheet_name, df_sheet in all_sheets_df.items():
        term_col_index = 1  # terms are expected in column B
        if df_sheet.shape[1] > term_col_index and not df_sheet.iloc[:, term_col_index].isnull().all():
            sheet_keywords = df_sheet.iloc[:, term_col_index].dropna().astype(str).str.lower().str.strip().tolist()
            my_keywords_from_list_set.update(sheet_keywords)
            col_indices = {'context': 2, 'cat_major': 3, 'cat_mid': 4, 'cat_minor': 5, 'sentiment': 7, 'topic': 8}
            # Only use metadata columns that actually exist in this sheet.
            available_meta_cols = {key: idx for key, idx in col_indices.items() if df_sheet.shape[1] > idx}
            for index, row in df_sheet.iterrows():
                term_raw = row.iloc[term_col_index]
                if pd.notna(term_raw):
                    term = str(term_raw).lower().strip()
                    if term:
                        if term not in keyword_metadata:
                            keyword_metadata[term] = {}
                        keyword_metadata[term]['pillar'] = keyword_metadata[term].get('pillar', []) + [sheet_name]
                        for key, idx in available_meta_cols.items():
                            # A later non-null value overwrites an earlier
                            # one; a null value only fills a missing key.
                            if key not in keyword_metadata[term] or pd.notna(row.iloc[idx]):
                                keyword_metadata[term][key] = row.iloc[idx]
    my_keywords_from_list = sorted(my_keywords_from_list_set)
    print(f"Loaded {len(my_keywords_from_list)} unique keywords from all Excel sheets.")
except FileNotFoundError:
    print(f"Buzzwords Excel file not found at {buzzword_file_path}")
except Exception as e:
    print(f"Error reading Buzzwords Excel: {e}")

print("\n--- Finished Loading Data Sources ---")


# ===== 3. Combine all data and preprocess it (toolset applied) =====
print("\n--- Combining and Preprocessing All Documents (with Toolset) ---")
all_docs_raw = []
source_info = [] # keeps the origin of each document, parallel to all_docs_raw
if cv_sections:
    all_docs_raw.extend(cv_sections)
    # Note: list multiplication repeats the same dict object for every entry.
    source_info.extend([{"source_type": "cv"}] * len(cv_sections))
else:
    print("Warning: Proceeding without CV sections.")
if journal_docs:
    all_docs_raw.extend(journal_docs)
    source_info.extend([{"source_type": "journal"}] * len(journal_docs))
else:
    print("Warning: Proceeding without journal entries.")

print(f"Total raw documents combined: {len(all_docs_raw)}")

# Load the spaCy model
print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm") # or a larger model such as "en_core_web_lg"
except OSError:
    print("spaCy model not found. Please download it: python -m spacy download en_core_web_sm")
    nlp = None # error handling: downstream code checks for a falsy nlp

# Preprocess the watchlist keywords (only if spaCy was loaded)
processed_watchlist_keywords = []
if USE_WATCHLIST_ANALYSIS and WATCHLIST_KEYWORDS_RAW and nlp:
    print("\nPreprocessing Watchlist Keywords...")
    for kw_raw in WATCHLIST_KEYWORDS_RAW:
        # preprocess_text_with_spacy returns a list of tokens
        processed_kws = preprocess_text_with_spacy(kw_raw, nlp)
        if processed_kws:
            processed_watchlist_keywords.append(" ".join(processed_kws)) # a keyword may consist of several tokens
    print(f"Processed Watchlist: {processed_watchlist_keywords}")


processed_docs_tokens = [] # list of token lists (before n-gram processing)
processed_indices_map = [] # for each retained document, its index in all_docs_raw
processed_doc_lengths_raw_tokens = []

if all_docs_raw and nlp:
    print(f"Preprocessing {len(all_docs_raw)} documents with spaCy (POS, NER)...")
    for i, doc_text in enumerate(all_docs_raw):
        tokens = preprocess_text_with_spacy(doc_text, nlp) # customised preprocessing function
        if tokens:
            # ★ Length filter (e.g. at least 3 tokens) - adjust here ★
            min_tokens_threshold = 3
            if len(tokens) >= min_tokens_threshold:
                processed_docs_tokens.append(tokens) # kept as a list of tokens
                processed_indices_map.append(i)
                processed_doc_lengths_raw_tokens.append(len(tokens))

    print(f"Initial spaCy preprocessing completed. {len(processed_docs_tokens)} documents retained.")
    if processed_doc_lengths_raw_tokens:
        print(f"Raw token counts per doc: Min={min(processed_doc_lengths_raw_tokens)}, Max={max(processed_doc_lengths_raw_tokens)}, Avg={sum(processed_doc_lengths_raw_tokens)/len(processed_doc_lengths_raw_tokens):.1f}")

    # N-gram generation (when USE_NGRAMS is True)
    # The token lists are joined back into space-separated strings for n-gram processing
    docs_for_ngram = [" ".join(tokens) for tokens in processed_docs_tokens]
    if USE_NGRAMS:
        print("\nGenerating N-grams...")
        processed_docs_ngram_strings = generate_ngrams(docs_for_ngram, NGRAM_MIN_COUNT, NGRAM_THRESHOLD)
        # Token lists could be rebuilt after n-gram processing, but BERTopic and TF-IDF take a list of strings, so the strings are used as-is
        # processed_docs = processed_docs_ngram_strings
        # For BERTopic etc., string list is fine.
        print(f"N-gram generation completed. {len(processed_docs_ngram_strings)} documents.")
        # processed_docs_final is a list of strings
        processed_docs_final = processed_docs_ngram_strings
    else:
        # processed_docs_final is a list of strings
        processed_docs_final = docs_for_ngram

    print(f"\nFinal preprocessing completed. Total documents for analysis: {len(processed_docs_final)}")
    if processed_docs_final: # make sure processed_docs_final is not empty
        final_doc_lengths = [len(doc.split()) for doc in processed_docs_final]
        if final_doc_lengths: # make sure final_doc_lengths is not empty
            print(f"Final token counts per doc (after ngrams if used): Min={min(final_doc_lengths)}, Max={max(final_doc_lengths)}, Avg={sum(final_doc_lengths)/len(final_doc_lengths):.1f}")
        else:
            print("No documents after final processing, cannot calculate token counts.")
    else:
        print("No documents available after final preprocessing.")

else:
    print("No documents or spaCy model not loaded, skipping preprocessing.")
    processed_docs_final = []


# ===== 3.5. Watchlist keyword analysis =====
# For every preprocessed watchlist keyword, record which retained documents
# contain it (plain substring match on the preprocessed document string).
if USE_WATCHLIST_ANALYSIS and processed_watchlist_keywords and processed_docs_final:
    print("\n--- Analyzing Watchlist Keywords ---")
    # Preprocess each raw keyword once up front; doing it inside the loop
    # below would re-run the spaCy pipeline for every (processed, raw) pair.
    watchlist_raw_to_processed = [
        (raw_kw, " ".join(preprocess_text_with_spacy(raw_kw, nlp)))
        for raw_kw in WATCHLIST_KEYWORDS_RAW
    ]
    watchlist_analysis_results = {}
    for kw_proc in processed_watchlist_keywords:
        # processed_indices_map[k] is the original index of processed_docs_final[k].
        containing_docs_indices = [
            original_idx for doc_idx, original_idx in enumerate(processed_indices_map)
            if kw_proc in processed_docs_final[doc_idx]
        ]
        watchlist_analysis_results[kw_proc] = {
            # Approximate reverse mapping: every raw keyword whose processed
            # form contains this processed keyword.
            "raw_keyword": [raw_kw for raw_kw, raw_processed in watchlist_raw_to_processed if kw_proc in raw_processed],
            "processed_keyword": kw_proc,
            "num_containing_docs": len(containing_docs_indices),
            "containing_doc_original_indices": containing_docs_indices
        }
        print(f"Keyword '{kw_proc}': Found in {len(containing_docs_indices)} processed documents.")
        if 0 < len(containing_docs_indices) < 5:  # for a small number of hits, show which documents
            print(f"  Original indices of containing docs: {containing_docs_indices}")

    # (Optional) further analysis: TF-IDF scores, BERTopic topics, etc.


# ===== 4. BERTopic analysis (uses processed_docs_final) =====
# These stay None when the analysis is skipped or fails before they are assigned.
topic_model = None; topics = None; probs = None; topic_info = None
if processed_docs_final and len(processed_docs_final) > 1: # switched to processed_docs_final
    print("\n--- Running BERTopic Analysis ---")
    # (existing BERTopic code; the input is processed_docs_final)
    try:
        num_docs = len(processed_docs_final)
        # Neighbourhood and cluster sizes grow with the corpus size, within the bounds below.
        n_neighbors_val = min(30, max(5, num_docs // 10 + 1))
        min_cluster_val = max(3, num_docs // 25 + 1)
        min_topic_val = min_cluster_val
        print(f"Using BERTopic params: n_neighbors={n_neighbors_val}, min_cluster_size={min_cluster_val}, min_topic_size={min_topic_val}")

        umap_model = umap.UMAP(n_neighbors=n_neighbors_val, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
        hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=min_cluster_val, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

        topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True,
                               min_topic_size=min_topic_val,
                               umap_model=umap_model, hdbscan_model=hdbscan_model)

        topics, probs = topic_model.fit_transform(processed_docs_final) # input changed to processed_docs_final

        if topic_model:
            topic_info = topic_model.get_topic_info()
            print("\nBERTopic Results:")
            print(topic_info)
            if topic_info is not None and len(topic_info) > 1:
                print("\nGenerating BERTopic visualizations...")
                # Each visualisation is attempted independently and written to its own HTML file.
                for viz_func_name in ["visualize_topics", "visualize_hierarchy", "visualize_barchart", "visualize_heatmap"]:
                     try:
                         func = getattr(topic_model, viz_func_name)
                         vis = func()
                         if vis: filename = f"combined_{viz_func_name}.html"; vis.write_html(filename); print(f"- {viz_func_name} saved to {filename}")
                     except Exception as e: print(f"- Could not generate {viz_func_name}: {e}")

                # Check which topic each watchlist keyword falls into (after BERTopic)
                if USE_WATCHLIST_ANALYSIS and processed_watchlist_keywords and topics is not None:
                    print("\n--- Watchlist Keywords in BERTopic Topics ---")
                    # BERTopic assigns topics per document; keywords themselves get no topic directly.
                    # One can analyse which topics dominate among the documents containing a keyword,
                    # or transform the keyword as if it were a short document.
                    try:
                        # Treat each keyword as a short document and predict its topic
                        watchlist_topics, _ = topic_model.transform(processed_watchlist_keywords)
                        for i, kw_proc in enumerate(processed_watchlist_keywords):
                            topic_id = watchlist_topics[i]
                            topic_words = topic_model.get_topic(topic_id) if topic_id != -1 else ["Outlier Topic"]
                            print(f"Watchlist Keyword '{kw_proc}': Assigned to Topic {topic_id} ({topic_words[:5] if topic_words else 'N/A'})")
                    except Exception as e:
                        print(f"Could not assign topics to watchlist keywords: {e}")

            else: print("\nNo significant topics found by BERTopic (only outliers or failed).")
        else: print("BERTopic model fitting failed.")
    except Exception as e: print(f"BERTopic failed: {e}")
else:
    # NOTE(review): the message mentions "BERTopic disabled", but no such flag is visible in this section.
    print("\nSkipping BERTopic: Not enough processed documents or BERTopic disabled.")

# ===== 5. TF-IDF analysis (uses processed_docs_final) =====
tfidf_top_keywords_per_doc_map = {} # original document index -> list of its top TF-IDF terms
feature_names_tfidf = [] # TF-IDF vocabulary
df_tfidf = None
if processed_docs_final and len(processed_docs_final) > 0: # switched to processed_docs_final
    print("\n--- Running TF-IDF Analysis ---")
    # (existing TF-IDF code; the input is processed_docs_final)
    try:
          vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, stop_words=None, token_pattern=r"(?u)\b\w[\w-]*\w\b") # token_pattern adjusted for n-gram tokens
          tfidf_matrix = vectorizer.fit_transform(processed_docs_final) # input changed to processed_docs_final
          feature_names_tfidf = vectorizer.get_feature_names_out()
          # The sparse matrix is converted to a dense array; rows are labelled with the original document indices.
          df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names_tfidf, index=processed_indices_map) # index keeps the original document indices
          print(f"TF-IDF matrix calculated ({df_tfidf.shape[0]} docs, {df_tfidf.shape[1]} features).")

          num_top_keywords = 7
          for original_idx in df_tfidf.index: # df_tfidf.index holds the values of processed_indices_map
               doc_tfidf_scores = df_tfidf.loc[original_idx]
               top_keywords_indices = doc_tfidf_scores.nlargest(num_top_keywords).index
               tfidf_top_keywords_per_doc_map[original_idx] = list(top_keywords_indices)
          print(f"Collected top {num_top_keywords} TF-IDF keywords for {len(tfidf_top_keywords_per_doc_map)} documents.")

          # Check the TF-IDF scores of the watchlist keywords
          if USE_WATCHLIST_ANALYSIS and processed_watchlist_keywords and df_tfidf is not None:
              print("\n--- Watchlist Keywords TF-IDF Scores ---")
              for kw_proc in processed_watchlist_keywords:
                  if kw_proc in feature_names_tfidf:
                      # Mean / max of the score over the documents that contain this keyword can be shown
                      kw_scores = df_tfidf[kw_proc][df_tfidf[kw_proc] > 0]
                      if not kw_scores.empty:
                          print(f"Keyword '{kw_proc}': Found in {len(kw_scores)} docs. Avg TF-IDF: {kw_scores.mean():.4f}, Max TF-IDF: {kw_scores.max():.4f}")
                      else:
                          print(f"Keyword '{kw_proc}': Found in TF-IDF vocab, but score is 0 in all processed docs.")
                  else:
                      print(f"Keyword '{kw_proc}': Not found in TF-IDF vocabulary.")

    except Exception as e: print(f"TF-IDF failed: {e}")
else:
    print("\nSkipping TF-IDF: Not enough processed documents.")


# ===== 5.5. Document embeddings (Sentence-BERT) and KeyBERT =====
# All result variables are initialised up front so later sections can test
# them directly, whether or not a step is enabled or succeeds.
doc_embeddings = None
sbert_model = None
watchlist_embeddings = None
if USE_SENTENCE_BERT_EMBEDDINGS and processed_docs_final:
    print(f"\n--- Generating Document Embeddings with Sentence-BERT ({SBERT_MODEL_NAME}) ---")
    try:
        sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
        doc_embeddings = sbert_model.encode(processed_docs_final, show_progress_bar=True)
        print(f"Generated {doc_embeddings.shape[0]} embeddings of dimension {doc_embeddings.shape[1]}.")

        # (Optional) also embed the watchlist keywords
        if USE_WATCHLIST_ANALYSIS and processed_watchlist_keywords:
            watchlist_embeddings = sbert_model.encode(processed_watchlist_keywords)
            # These can be used for similarity computations, e.g.:
            # from sklearn.metrics.pairwise import cosine_similarity
            # if doc_embeddings is not None and watchlist_embeddings is not None:
            #     sim_matrix = cosine_similarity(watchlist_embeddings, doc_embeddings)
            #     # print(f"Similarity matrix between watchlist and docs:\n{sim_matrix}")
    except Exception as e:
        print(f"Sentence-BERT embedding generation failed: {e}")

keybert_model = None
# original document index -> list of KeyBERT keywords; stays empty when
# KeyBERT is disabled or fails before extracting anything.
keybert_top_keywords_per_doc_map = {}
if USE_KEYBERT_EXTRACTION and processed_docs_final:
    print("\n--- Running KeyBERT Keyword Extraction ---")
    try:
        # KeyBERT uses a SentenceTransformer internally, so the model loaded
        # above is reused when available. Compare against None explicitly
        # instead of relying on the model object's truthiness.
        if sbert_model is not None:
            keybert_model = KeyBERT(model=sbert_model)
        else:
            keybert_model = KeyBERT(model=SentenceTransformer(SBERT_MODEL_NAME))  # load a fresh model

        for i, doc_text in enumerate(processed_docs_final):
            # keyphrase_ngram_range sets the n-gram sizes that may be
            # extracted; top_n is the number of keywords kept per document.
            keywords_with_scores = keybert_model.extract_keywords(
                doc_text,
                keyphrase_ngram_range=(1, 3 if USE_NGRAMS else 1),  # up to 3-grams when n-grams are enabled
                stop_words='english',  # or None / a custom list
                top_n=7,
                use_mmr=True, diversity=0.7  # MMR increases keyword diversity
            )
            # Every processed document gets an entry, possibly an empty list.
            keybert_top_keywords_per_doc_map[processed_indices_map[i]] = [
                kw[0] for kw in (keywords_with_scores or [])
            ]

        print(f"Collected top KeyBERT keywords for {len(keybert_top_keywords_per_doc_map)} documents.")
        # print("Sample KeyBERT keywords:", dict(list(keybert_top_keywords_per_doc_map.items())[:2]))

        # tfidf_top_keywords_per_doc_map is deliberately not overwritten with
        # the KeyBERT results: both are kept so the network analysis can
        # choose either source (or a combination of the two).

    except Exception as e:
        print(f"KeyBERT extraction failed: {e}")


# ===== 6. NetworkX co-occurrence network analysis =====
# Choose the keyword list the network is built from (TF-IDF based, KeyBERT based, or the Excel list)
network_target_keywords_source = "excel" # choose one of "tfidf", "keybert", "excel"

network_target_keywords = set()
feature_names_for_network = [] # vocabulary used when building the network (e.g. derived from TF-IDF)

# Preprocess the Excel keywords (keeps the existing logic while applying the spaCy preprocessing)
processed_excel_keywords_for_network = set()
if my_keywords_from_list and nlp:
    print("\nPreprocessing Excel keywords for NetworkX with spaCy...")
    for kw_raw in my_keywords_from_list:
        processed_kws_tokens = preprocess_text_with_spacy(str(kw_raw), nlp)
        if processed_kws_tokens:
            processed_excel_keywords_for_network.add(" ".join(processed_kws_tokens)) # intended for n-gram support: multi-token keywords are joined with a space
    print(f"{len(processed_excel_keywords_for_network)} unique processed keywords from Excel list.")


if network_target_keywords_source == "excel":
    # Align the Excel keywords with the form that actually occurs in the documents (after preprocessing)
    # by checking them against the vocabulary of processed_docs_final
    temp_vocab_from_processed_docs = set(itertools.chain.from_iterable(doc.split() for doc in processed_docs_final))
    # NOTE(review): the vocabulary consists of whitespace-split tokens, so a keyword containing a space can never pass this membership test.
    network_target_keywords.update([kw for kw in processed_excel_keywords_for_network if kw in temp_vocab_from_processed_docs])
    feature_names_for_network = list(temp_vocab_from_processed_docs) # the network vocabulary is the vocabulary of all documents
    print(f"Using {len(network_target_keywords)} keywords from processed Excel list (found in corpus) for NetworkX.")
elif network_target_keywords_source == "tfidf" and df_tfidf is not None:
    # Target the top TF-IDF terms (e.g. the N terms with the highest TF-IDF score summed over all documents)
    if not df_tfidf.empty: # make sure df_tfidf is not empty
        sum_tfidf = df_tfidf.sum().sort_values(ascending=False)
        network_target_keywords.update(sum_tfidf.head(100).index.tolist()) # top 100 terms
        feature_names_for_network = list(df_tfidf.columns)
        print(f"Using top {len(network_target_keywords)} TF-IDF keywords for NetworkX.")
    else:
        print("TF-IDF data is empty, cannot select keywords for network.")
elif network_target_keywords_source == "keybert" and 'keybert_top_keywords_per_doc_map' in locals() and keybert_top_keywords_per_doc_map:
    all_keybert_kws = set(itertools.chain.from_iterable(keybert_top_keywords_per_doc_map.values()))
    network_target_keywords.update(all_keybert_kws)
    # For KeyBERT the vocabulary has to be built from processed_docs_final
    feature_names_for_network = list(set(itertools.chain.from_iterable(doc.split() for doc in processed_docs_final)))
    print(f"Using {len(network_target_keywords)} unique KeyBERT keywords for NetworkX.")
else:
    print("Warning: Could not determine target keywords for NetworkX based on selection or data availability.")

# Optionally include the watchlist keywords among the network targets as well
if USE_WATCHLIST_ANALYSIS and processed_watchlist_keywords:
    print(f"Adding {len(processed_watchlist_keywords)} processed watchlist keywords to NetworkX targets.")
    network_target_keywords.update(processed_watchlist_keywords)
    print(f"Total target keywords for NetworkX (including watchlist): {len(network_target_keywords)}")

# Build a keyword co-occurrence graph from the preprocessed documents and draw
# it coloured by the Excel major category.
#
# Inputs (defined earlier in the file): processed_docs_final (whitespace-
# tokenised document strings), network_target_keywords (set of keywords) and
# network_target_keywords_source (used only in the plot title).
# Output: G, an undirected graph that is always assigned (possibly empty).

# Minimum number of documents in which a pair must co-occur to become an edge.
min_occurrence_network = 2
G = None  # Assigned on every path below so later sections can rely on it.
if network_target_keywords and processed_docs_final:
    print(f"\n--- Running NetworkX Analysis with {len(network_target_keywords)} target keywords ---")

    # Token-level frequency of each target keyword across the whole corpus
    # (only target keywords are counted, not the full vocabulary).
    total_word_counts = Counter(
        word
        for doc in processed_docs_final
        for word in doc.split()
        if word in network_target_keywords
    )

    # Each document contributes every unordered pair of distinct target
    # keywords it contains exactly once. Sorting the per-document keyword set
    # makes every pair a canonical (a, b) tuple with a < b and keeps the pair
    # (and therefore node) order reproducible between runs.
    co_occurrence_pairs = []
    for doc_text in processed_docs_final:
        keywords_in_doc = sorted(set(doc_text.split()).intersection(network_target_keywords))
        co_occurrence_pairs.extend(itertools.combinations(keywords_in_doc, 2))

    pair_counts = Counter(co_occurrence_pairs)
    G = nx.Graph()

    for (keyword1, keyword2), count in pair_counts.items():
        if count < min_occurrence_network:
            continue
        # Nodes are keyed by the preprocessed keyword; no Excel metadata is
        # attached here because that would need a processed -> raw keyword mapping.
        for keyword in (keyword1, keyword2):
            if not G.has_node(keyword):
                G.add_node(keyword, total_count=total_word_counts.get(keyword, 1))
        G.add_edge(keyword1, keyword2, weight=count)

    print(f"Network graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges (min_occurrence={min_occurrence_network}).")

    if G.number_of_nodes() > 1 and G.number_of_edges() > 0:
        print("Drawing network graphs...")
        default_cat = 'Other (from text)'

        # Same availability guard as the metadata section further down: the
        # Excel metadata may not have been loaded at all.
        metadata_lookup = keyword_metadata if 'keyword_metadata' in locals() and keyword_metadata else {}

        # Category per node, in node order. The lookup only hits when the node
        # name (processed keyword) equals a metadata key; otherwise the node
        # falls into the default category.
        node_colors_cat = []
        for node in G.nodes():
            cat = metadata_lookup.get(node, {}).get('cat_major', default_cat)
            if pd.isna(cat):
                cat = default_cat
            node_colors_cat.append(cat)
        categories = set(node_colors_cat)

        # key=str keeps the sort well-defined if categories mix strings and numbers.
        unique_categories = sorted(categories, key=str)

        # The graph has at least two nodes here, so there is always at least
        # one category. The colormap is resampled to one entry per category and
        # indexed with an integer, which selects the lookup-table entry exactly
        # (a float i / n can truncate to the previous entry).
        cmap = plt.get_cmap('viridis', len(unique_categories))
        color_map_dict = {category: cmap(i) for i, category in enumerate(unique_categories)}
        final_node_colors = [color_map_dict[cat] for cat in node_colors_cat]

        plt.figure(figsize=(24, 24))
        try:
            pos = nx.kamada_kawai_layout(G)
        except Exception as layout_error:
            # Best-effort fallback (e.g. a missing optional dependency), but
            # report the reason and do not swallow KeyboardInterrupt/SystemExit.
            print(f"kamada_kawai_layout failed ({layout_error}); falling back to spring_layout.")
            pos = nx.spring_layout(G, k=0.7, iterations=50, seed=42)

        # Node size scales with degree, edge width with co-occurrence count.
        node_sizes = [(G.degree(node) + 1) * 100 for node in G.nodes()]
        edge_widths = [G[u][v]['weight'] * 0.5 for u, v in G.edges()]
        nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=final_node_colors, alpha=0.8)
        nx.draw_networkx_edges(G, pos, width=edge_widths, edge_color='lightgray', alpha=0.5)
        nx.draw_networkx_labels(G, pos, font_size=9, font_family='sans-serif')

        legend_handles = [
            plt.Line2D([0], [0], marker='o', color='w', label=cat,
                       markerfacecolor=color_map_dict[cat], markersize=10)
            for cat in unique_categories
        ]
        plt.legend(handles=legend_handles, title="Categories (Major)", loc='best', fontsize='small')

        plt.title(f"Combined Data - Keyword Network by Category (Source: {network_target_keywords_source})", size=20)
        plt.axis('off')
        plt.tight_layout()
        plt.show()
        print("Graph drawing complete.")
    else:
        print("\nCould not draw network graph based on current settings (not enough nodes/edges or G is None).")
else:
    print("\nSkipping NetworkX: Not enough target keywords or processed documents.")
    G = nx.Graph()  # Empty graph so that G is never left as None.

# ===== 7. (Optional) Metadata usage examples =====
# Summarise the Excel keyword metadata. The graph-based part only runs when
# G exists and has nodes; the pillar summary does not depend on G.
if 'keyword_metadata' in locals() and keyword_metadata:
    print("\n--- Example: Analyzing Keyword Metadata from Excel ---")
    graph_has_nodes = G is not None and G.number_of_nodes() > 0
    if not graph_has_nodes:
        print("NetworkX graph (G) was not created or has no nodes. Skipping category distribution for graph.")
    else:
        # Node names are preprocessed keywords, while the original notes say
        # keyword_metadata is keyed by raw keywords, so this lookup is only
        # approximate: nodes without a direct match count as unknown.
        unknown_label = 'Unknown in Graph Meta'
        node_category_labels = []
        for graph_node in G.nodes():
            label = keyword_metadata.get(graph_node, {}).get('cat_major', unknown_label)
            node_category_labels.append(unknown_label if pd.isna(label) else label)
        # Plain dict (first-seen order) so the printed form stays a dict literal.
        categories_found_in_graph = dict(Counter(node_category_labels))
        print("Category distribution of NODES IN GRAPH (approximate due to key matching):", categories_found_in_graph)

    # Count keywords per pillar across all metadata entries whose 'pillar'
    # value is a list, ignoring missing (NaN/None) pillar values.
    pillar_values = [
        pillar
        for meta in keyword_metadata.values()
        if 'pillar' in meta and isinstance(meta['pillar'], list)
        for pillar in meta['pillar']
        if pd.notna(pillar)
    ]
    pillar_counts = dict(Counter(pillar_values))
    print("Keyword count per Pillar (from raw Excel list):", pillar_counts)
else:
    print("\nKeyword metadata not loaded. Skipping metadata analysis examples.")

print("\n--- End of Optional Metadata Analysis ---")

# Closing banner: remind the user to review the settings and list the optional packages.
closing_messages = (
    "\n--- Comprehensive Analysis Workflow (with Toolset) Ready ---",
    "Please review the code, adjust file paths, parameters, and run the analysis.",
    "Consider installing: markdown-it-py, python-frontmatter, sentence-transformers, keybert",
)
for closing_message in closing_messages:
    print(closing_message)