In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from itertools import combinations
from tqdm.auto import tqdm
from pathlib import Path

# --------------------------------------------------------
# 0) Load the data
# --------------------------------------------------------
DATA_DIR = Path("../../01_data")
# Change this path if the corpus parquet lives somewhere else.
df = pd.read_parquet(
    DATA_DIR / "predictive_model" / "df_auto_corpus_area_tech.parquet"
)

# Columns this script reads from the corpus:
#   - source_type      (compared against "paper" / "patent")
#   - auto_focus_area  (compared against AREA)
#   - seed_top1_sim    (thresholded, and sorted on for the preview)
#   - seed_top2_sim    (subtracted from seed_top1_sim to get the margin)
#   - text             (the document text that gets vectorised further down)
# NOTE(review): the *_sim columns are presumably similarities to the best and
# second-best seed; confirm against whatever produced the parquet.
REQUIRED_COLUMNS = [
    "source_type",
    "auto_focus_area",
    "seed_top1_sim",
    "seed_top2_sim",
    "text",
]

# Fail early with one message naming every missing column, instead of a bare
# pandas KeyError surfacing somewhere in the middle of the script.
_missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if _missing_columns:
    raise KeyError(
        f"Input parquet is missing required column(s): {_missing_columns}"
    )

# --------------------------------------------------------
# 1) AREA + high-confidence filter definition
# --------------------------------------------------------
AREA = "Cybersecurity_Safety_Governance"

# High-confidence thresholds: tune them here.
MIN_TOP1_SIM = 0.60          # minimum accepted seed_top1_sim
MIN_TOP1_TOP2_MARGIN = 0.1   # minimum accepted (seed_top1_sim - seed_top2_sim)

# Keep papers and patents only.
mask_pp = df["source_type"].isin(["paper", "patent"])

# Gap between the top-1 and top-2 similarity; stored on df as a new column.
df["margin_pp"] = df["seed_top1_sim"] - df["seed_top2_sim"]

# Rows whose similarity or margin is NaN compare as False and are dropped.
mask_conf = (
    (df["seed_top1_sim"] >= MIN_TOP1_SIM)
    & (df["margin_pp"] >= MIN_TOP1_TOP2_MARGIN)
)

mask_area = df["auto_focus_area"] == AREA

# .copy() so that later edits to df_area never write back into df.
df_area = df[mask_pp & mask_conf & mask_area].copy()

# --------------------------------------------------------
# EXTRA: preview the 5 rows of df_area with the highest seed_top1_sim
# --------------------------------------------------------
# Sort first, then narrow to the three columns worth showing.
top5 = df_area.sort_values("seed_top1_sim", ascending=False)
top5 = top5[["text", "seed_top1_sim", "auto_focus_area"]].head(5)

print("\n=== AREA için en yüksek similarity'ye sahip ilk 5 doküman ===")
print(top5.to_string(index=False))
print("============================================================\n")

# Documents to vectorise: missing text becomes "", every value is cast to str.
texts = df_area["text"].fillna("").astype(str).to_list()

print(f"AREA: {AREA}")
print("Kullanılan doküman sayısı:", len(texts))

# Nothing survived the filters, so there is nothing to count.
if not texts:
    raise ValueError("Bu filtrelerle hiç doküman yok, eşikleri/AREA'yı gevşet.")

# --------------------------------------------------------
# 2) UNIGRAM
# --------------------------------------------------------
# Single-word terms, English stop words removed; a term is kept only if it
# appears in at least 3 documents (min_df=3).
cv_uni = CountVectorizer(stop_words="english", min_df=3, ngram_range=(1, 1))
X_uni = cv_uni.fit_transform(texts)

# Summing the document-term matrix over its rows gives one total per term.
unigram_vocab = cv_uni.get_feature_names_out()
unigram_counts = np.asarray(X_uni.sum(axis=0)).reshape(-1)

# Most frequent terms first.
df_uni = pd.DataFrame({"term": unigram_vocab, "count": unigram_counts})
df_uni = df_uni.sort_values("count", ascending=False)

uni_path = f"word_frequency_unigram_{AREA}.csv"
df_uni.to_csv(uni_path, index=False)
print(f"✓ unigram kaydedildi -> {uni_path}")

# --------------------------------------------------------
# 3) BIGRAM
# --------------------------------------------------------
# Two-word terms, English stop words removed; a term is kept only if it
# appears in at least 3 documents (min_df=3).
cv_bi = CountVectorizer(stop_words="english", min_df=3, ngram_range=(2, 2))
X_bi = cv_bi.fit_transform(texts)

# Summing the document-term matrix over its rows gives one total per term.
bigram_vocab = cv_bi.get_feature_names_out()
bigram_counts = np.asarray(X_bi.sum(axis=0)).reshape(-1)

# Most frequent terms first.
df_bi = pd.DataFrame({"term": bigram_vocab, "count": bigram_counts})
df_bi = df_bi.sort_values("count", ascending=False)

bi_path = f"word_frequency_bigram_{AREA}.csv"
df_bi.to_csv(bi_path, index=False)
print(f"✓ bigram kaydedildi -> {bi_path}")

# --------------------------------------------------------
# 4) TRIGRAM (disable this section if you do not need it)
# --------------------------------------------------------
# Three-word terms, English stop words removed; a term is kept only if it
# appears in at least 3 documents (min_df=3).
cv_tri = CountVectorizer(stop_words="english", min_df=3, ngram_range=(3, 3))
X_tri = cv_tri.fit_transform(texts)

# Summing the document-term matrix over its rows gives one total per term.
trigram_vocab = cv_tri.get_feature_names_out()
trigram_counts = np.asarray(X_tri.sum(axis=0)).reshape(-1)

# Most frequent terms first.
df_tri = pd.DataFrame({"term": trigram_vocab, "count": trigram_counts})
df_tri = df_tri.sort_values("count", ascending=False)

tri_path = f"word_frequency_trigram_{AREA}.csv"
df_tri.to_csv(tri_path, index=False)
print(f"✓ trigram kaydedildi -> {tri_path}")




# Final summary: list the three CSV paths, one per line.
print("\nTAMAM ✔ AREA bazlı CSV'ler üretildi:")
for csv_path in (uni_path, bi_path, tri_path):
    print("-", csv_path)

