In [1]:
import os
import re
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [2]:
# === CONFIG ===
DOCS_PATH = "../1.CollectingDocuments/data_clean"
LINK_FILE = "../1.CollectingDocuments/extracted_urls.csv"

In [3]:
# === 1Ô∏è‚É£ ƒê·ªçc d·ªØ li·ªáu v√† ti·ªÅn x·ª≠ l√Ω ===
def clean_text(text):
    # Lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát, gi·ªØ l·∫°i ti·∫øng Vi·ªát c√≥ d·∫•u
    text = re.sub(r"http\S+", "", text)  # b·ªè URL
    text = re.sub(r"[^a-zA-Z√Ä-·ªπ0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

def load_and_preprocess_docs(folder_path):
    docs = {}
    for fname in os.listdir(folder_path):
        if fname.endswith(".txt"):
            with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
                raw = f.read()
            clean = clean_text(raw)
            tokens = ViTokenizer.tokenize(clean)
            docs[fname] = tokens
    return docs

print("üîÑ ƒêang t·∫£i v√† x·ª≠ l√Ω t√†i li·ªáu...")
docs = load_and_preprocess_docs(DOCS_PATH)
print(f"‚úÖ ƒê√£ x·ª≠ l√Ω {len(docs)} t√†i li·ªáu.\n")

üîÑ ƒêang t·∫£i v√† x·ª≠ l√Ω t√†i li·ªáu...
‚úÖ ƒê√£ x·ª≠ l√Ω 406 t√†i li·ªáu.



In [4]:
# === 2Ô∏è‚É£ Ph√¢n t√≠ch th·ªëng k√™ c∆° b·∫£n ===
doc_stats = []
for fname, text in docs.items():
    tokens = text.split()
    total_words = len(tokens)
    unique_words = len(set(tokens))
    top_words = [w for w, _ in Counter(tokens).most_common(10)]
    doc_stats.append({
        "document": fname,
        "total_words": total_words,
        "unique_words": unique_words,
        "top_words": ", ".join(top_words)
    })

df_stats = pd.DataFrame(doc_stats)
print("üìä Th·ªëng k√™ c∆° b·∫£n:\n", df_stats.head(), "\n")

# === 3Ô∏è‚É£ TF-IDF cho to√†n b·ªô t·∫≠p t√†i li·ªáu ===
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(docs.values())
feature_names = vectorizer.get_feature_names_out()

üìä Th·ªëng k√™ c∆° b·∫£n:
                                             document  total_words  \
0  15_ia_iem_du_lich_Buon_Ma_Thuot_ep_quen_loi_ve...         1807   
1  3_ngoi_lang_co_nep_minh_giua_long_cao_nguyen_a...          952   
2               ao_Binh_Ba__Wikipedia_tieng_Viet.txt          483   
3  ao_Hon_Khoai_o_Ca_Mau_-_vung_bien_va_dai_at_ph...         2553   
4                    a_Lat__Wikipedia_tieng_Viet.txt        15349   

   unique_words                                          top_words  
0           646  kh√°ch_s·∫°n, bu√¥n, du_l·ªãch, c·ªßa, l√†, thu·ªôt, ma, ...  
1           503  c·ªßa, l√†ng, l√¥, nh·ªØng, ng√¥i, ƒë√°, kh√°ch, v·ªõi, l√†...  
2           252  b√¨nh, ba, ƒë·∫£o, l√†, ng√†y, th√°ng, nƒÉm, 2013, b√£i...  
3           749  du_l·ªãch, ƒë·∫£o, ch√¢u, h√≤n, b√¨nh, tour, khoai, ni...  
4          2646  nƒÉm, v√†, l·∫°t, ƒë√†, th√°ng, ng√†y, c·ªßa, th√†nh_ph·ªë,...   



In [5]:
# Tr√≠ch 5 t·ª´ c√≥ TF-IDF cao nh·∫•t cho m·ªói doc
def top_tfidf_words(row, features, top_n=5):
    idx = row.nonzero()[1]
    scores = zip(idx, [row[0, i] for i in idx])
    sorted_words = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    return ", ".join([features[i] for i, _ in sorted_words])

top_tfidf = [top_tfidf_words(tfidf_matrix[i], feature_names) for i in range(len(docs))]
df_stats["top_tfidf"] = top_tfidf

In [6]:
df_stats.head()

Unnamed: 0,document,total_words,unique_words,top_words,top_tfidf
0,15_ia_iem_du_lich_Buon_Ma_Thuot_ep_quen_loi_ve...,1807,646,"kh√°ch_s·∫°n, bu√¥n, du_l·ªãch, c·ªßa, l√†, thu·ªôt, ma, ...","bu√¥n, st, thu·ªôt, ma, kh√°ch_s·∫°n"
1,3_ngoi_lang_co_nep_minh_giua_long_cao_nguyen_a...,952,503,"c·ªßa, l√†ng, l√¥, nh·ªØng, ng√¥i, ƒë√°, kh√°ch, v·ªõi, l√†...","l√¥, l√†ng, l≈©ng, c·ªï_k√≠nh, t∆∞·ªùng"
2,ao_Binh_Ba__Wikipedia_tieng_Viet.txt,483,252,"b√¨nh, ba, ƒë·∫£o, l√†, ng√†y, th√°ng, nƒÉm, 2013, b√£i...","2013, ƒë·∫£o, b√¨nh, ba, truy_c·∫≠p"
3,ao_Hon_Khoai_o_Ca_Mau_-_vung_bien_va_dai_at_ph...,2553,749,"du_l·ªãch, ƒë·∫£o, ch√¢u, h√≤n, b√¨nh, tour, khoai, ni...","khoai, ƒë·∫£o, h√≤n, du_l·ªãch, ch√¢u"
4,a_Lat__Wikipedia_tieng_Viet.txt,15349,2646,"nƒÉm, v√†, l·∫°t, ƒë√†, th√°ng, ng√†y, c·ªßa, th√†nh_ph·ªë,...","l·∫°t, tr·∫ßn_s·ªπ, ƒë√†, tr, nƒÉm"


In [None]:
# # === 4Ô∏è‚É£ Ph√¢n t√≠ch li√™n k·∫øt & t√≠nh PageRank ===
# if os.path.exists(LINK_FILE):
#     links = pd.read_csv(LINK_FILE)
#     G = nx.DiGraph()
#     for _, row in links.iterrows():
#         src, tgt = row["source_file"], row["target"]
#         if src in docs and tgt in docs:
#             G.add_edge(src, tgt)
    
#     # T√≠nh PageRank (ƒë·ªô ph·ªï bi·∫øn / ·∫£nh h∆∞·ªüng)
#     pagerank_scores = nx.pagerank(G, alpha=0.85)
#     df_stats["pagerank"] = df_stats["document"].map(pagerank_scores).fillna(0)
# else:
#     df_stats["pagerank"] = 0
#     print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file li√™n k·∫øt, b·ªè qua PageRank.\n")

KeyError: 'source'

In [None]:
# === 5Ô∏è‚É£ Chu·∫©n h√≥a d·ªØ li·ªáu t·ªïng h·ª£p ===
df_stats["avg_tfidf"] = tfidf_matrix.mean(axis=1).A1
# df_stats = df_stats.sort_values(ascending=False)

KeyError: 'pagerank'

In [9]:
print("üèÅ K·∫øt qu·∫£ t·ªïng h·ª£p:")
print(df_stats.head())

üèÅ K·∫øt qu·∫£ t·ªïng h·ª£p:
                                            document  total_words  \
0  15_ia_iem_du_lich_Buon_Ma_Thuot_ep_quen_loi_ve...         1807   
1  3_ngoi_lang_co_nep_minh_giua_long_cao_nguyen_a...          952   
2               ao_Binh_Ba__Wikipedia_tieng_Viet.txt          483   
3  ao_Hon_Khoai_o_Ca_Mau_-_vung_bien_va_dai_at_ph...         2553   
4                    a_Lat__Wikipedia_tieng_Viet.txt        15349   

   unique_words                                          top_words  \
0           646  kh√°ch_s·∫°n, bu√¥n, du_l·ªãch, c·ªßa, l√†, thu·ªôt, ma, ...   
1           503  c·ªßa, l√†ng, l√¥, nh·ªØng, ng√¥i, ƒë√°, kh√°ch, v·ªõi, l√†...   
2           252  b√¨nh, ba, ƒë·∫£o, l√†, ng√†y, th√°ng, nƒÉm, 2013, b√£i...   
3           749  du_l·ªãch, ƒë·∫£o, ch√¢u, h√≤n, b√¨nh, tour, khoai, ni...   
4          2646  nƒÉm, v√†, l·∫°t, ƒë√†, th√°ng, ng√†y, c·ªßa, th√†nh_ph·ªë,...   

                        top_tfidf  avg_tfidf  
0  bu√¥n, st, thu·ªôt, ma, kh√°c

In [10]:
# === 6Ô∏è‚É£ Xu·∫•t ra file t·ªïng h·ª£p ===
os.makedirs("output", exist_ok=True)
df_stats.to_csv("output/doc_analysis.csv", index=False, encoding="utf-8-sig")
print("\nüíæ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o output/doc_analysis.csv")


üíæ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o output/doc_analysis.csv
