<a href="https://colab.research.google.com/github/echung2/echung2/blob/master/mihaya_bert_tm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed against — consider pinning.
!pip install transformers sentence-transformers umap-learn scikit-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Import library for file uploading
from google.colab import files

In [4]:
# File upload
# Calls google.colab's `files.upload()` and keeps its return value; the loading
# cell reads the first key of this mapping as the name of the file to open.
uploaded = files.upload()

Saving H_Miyaha_OCR.txt to H_Miyaha_OCR.txt


In [5]:
# Load the uploaded file
# `uploaded` is keyed by file name. An empty mapping means nothing was uploaded
# (for example the dialog was dismissed), so fail with a clear message instead
# of the bare IndexError that indexing an empty key list would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select a text file.")

# Only the first uploaded file is used; any additional uploads are ignored.
file_name = next(iter(uploaded))
with open(file_name, 'r', encoding='utf-8') as file:
    text = file.read()

In [28]:
# Preprocess text: remove special characters and split into sentences
import re

# Clean into a separate name so the raw `text` read from the upload is never
# overwritten: the original document stays available and this cell always
# starts from the same input when it is re-run.
cleaned_text = re.sub(r'[^\w\s。！？]', '', text)  # Keep only word characters, whitespace and 。！？
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Collapse whitespace runs into single spaces

sentences = [
    candidate.strip()
    for candidate in re.split(r'[。！？]', cleaned_text)  # Split on Japanese sentence-ending markers
    if len(candidate.strip()) > 5  # Drop empty or very short fragments (5 characters or fewer)
]


In [31]:
# Top the corpus up to 10 sentences with placeholder text when the upload
# produced fewer than that, warning the user that dummy data is being mixed in.
missing = 10 - len(sentences)
if missing > 0:
    print(f"Warning: Not enough sentences for clustering. Found {len(sentences)} sentences. Adding dummy sentences.")
    dummy_sentences = [
        "これはテスト用の文章です。",
        "分析を実行するためのダミーデータを追加します。",
        "BERTモデルを使ったトピックモデリングの例です。",
        "日本語の文章を分割してトピックを抽出します。",
        "クラスタリングアルゴリズムを適用します。",
        "十分なデータが必要です。",
        "分散型表現を作成しています。",
        "トピックモデルの結果を確認します。",
        "分析に成功しました。",
        "結果を保存しています。"
    ]
    # Append only as many placeholders as are needed to reach 10 in total.
    sentences.extend(dummy_sentences[:missing])


In [32]:
# Load BERT model for embeddings
# Instantiates SentenceTransformer with the checkpoint named below; the next
# cell calls `model.encode` on the sentence list.
# NOTE(review): presumably fetches the weights on first use — confirm the
# runtime has network access.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [33]:
# Generate embeddings
# Encodes every entry of `sentences` with the loaded model, showing a progress bar.
# NOTE(review): presumably yields one vector per sentence (a 2-D array) — confirm.
embeddings = model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [34]:
# Dimensionality reduction using UMAP
# The 5-dimensional output is what the k-means cell clusters on, so this step is
# part of the pipeline rather than an optional visualization aid.
#
# n_neighbors has to be smaller than the number of samples. The notebook can run
# on as few as 10 sentences, where a fixed value of 15 is too large, so clamp it
# to the data size explicitly (never below 2) instead of relying on the library
# to adjust it.
n_neighbors = max(2, min(15, len(embeddings) - 1))

# A fixed random_state makes the projection, and therefore the clusters built on
# it, reproducible across runs.
reducer = umap.UMAP(
    n_neighbors=n_neighbors,
    n_components=5,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = reducer.fit_transform(embeddings)

  warn(


In [37]:
# Use 10 topics (clusters), or fewer when the corpus has fewer than 10
# sentences, so that n_clusters never exceeds n_samples.
num_topics = len(sentences) if len(sentences) < 10 else 10
print(f"Number of topics adjusted to: {num_topics}")

Number of topics adjusted to: 10


In [38]:
# Cluster the reduced embeddings into `num_topics` groups with k-means
# (seeded with 42) and keep each sentence's cluster id as its topic label.
kmeans = KMeans(n_clusters=num_topics, random_state=42).fit(reduced_embeddings)
labels = kmeans.labels_

In [39]:
# Build a two-column table pairing each sentence with its assigned topic label
topic_sentences = pd.DataFrame(data={'Sentence': sentences, 'Topic': labels})

In [40]:
# Extract top keywords for each topic
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# Custom stop words removed before keyword extraction: an OCR page marker plus
# common Japanese particles, demonstratives, conjunctions, adverbs and verbs.
#
# Every entry must be lowercase: the CountVectorizer these are passed to
# lowercases tokens by default before comparing them against this list, so a
# capitalised entry such as 'Page' could never match anything.
custom_stop_words = [
    'page', 'は', 'が', 'の', 'に', 'を', 'へ', 'と', 'で', 'や', 'も', 'から', 'まで',
    'だけ', 'より', 'しか', 'ながら', 'など', 'これ', 'それ', 'あれ', 'どれ',
    'ここ', 'そこ', 'あそこ', 'どこ', 'そして', 'しかし', 'だから', 'なぜ',
    'どう', '何', 'いつ', 'ただ', 'ようやく', 'やはり', 'さらに', 'すぐに',
    'いる', 'ある', 'する', 'なる', '言う', '思う', '分かる', '知る'
]


In [44]:
# Combine sentences by topic
# Groups the table by 'Topic' and joins each group's sentences with single
# spaces, giving one combined string per topic id.
topic_groups = topic_sentences.groupby('Topic')['Sentence'].apply(' '.join)

In [46]:
# Vectorize text for keyword extraction
# NOTE(review): the vectorizer uses its default tokenization, and the recorded
# output shows whole Japanese sentences coming back as single "keywords"; a
# Japanese-aware tokenizer is probably needed here — confirm.
vectorizer = CountVectorizer(max_features=20, stop_words=custom_stop_words)
keywords_per_topic = {}

# The loop variable is deliberately not called `text`: that name holds the
# corpus loaded earlier, and rebinding it here would make a later re-run of the
# preprocessing cell operate on a single topic's text instead of the document.
for topic, topic_text in topic_groups.items():
    if not topic_text.strip():  # Ensure text is not empty
        print(f"Warning: Topic {topic} has empty text and will be skipped.")
        keywords_per_topic[topic] = ["No keywords available"]
        continue

    try:
        term_counts = vectorizer.fit_transform([topic_text]).toarray()[0]
    except ValueError:
        # Raised when no term survives tokenization and stop-word removal
        # (empty vocabulary); treat it like an empty topic instead of crashing.
        print(f"Warning: Topic {topic} has no usable terms and will be skipped.")
        keywords_per_topic[topic] = ["No keywords available"]
        continue

    terms = vectorizer.get_feature_names_out()
    # The feature names come back in alphabetical order; rank them by count so
    # the most frequent terms are listed first. sorted() is stable, so terms
    # with equal counts keep their alphabetical order.
    ranked = sorted(zip(terms, term_counts), key=lambda pair: -pair[1])
    keywords_per_topic[topic] = [str(term) for term, _count in ranked]

In [47]:
# Print one line per topic: the topic id followed by its comma-separated keywords
for topic in keywords_per_topic:
    keywords = keywords_per_topic[topic]
    print(f"Topic {topic}: {', '.join(keywords)}")

Topic 0: bertモデルを使ったトピックモデリングの例です
Topic 1: 分析を実行するためのダミーデータを追加します
Topic 2: これはテスト用の文章です
Topic 3: 分散型表現を作成しています
Topic 4: 日本語の文章を分割してトピックを抽出します
Topic 5: トピックモデルの結果を確認します
Topic 6: クラスタリングアルゴリズムを適用します
Topic 7: 十分なデータが必要です
Topic 8: bertモデルを使ったトピックモデリングの例です
Topic 9: 分析に成功しました


In [50]:
# Write the sentence/topic table to a CSV file (no index column) and report the path.
# The 'utf-8-sig' codec writes a UTF-8 byte-order mark at the start of the file.
output_file = "Mihaya_bert_topic_modeling_results.csv"
topic_sentences.to_csv(path_or_buf=output_file, encoding='utf-8-sig', index=False)
print(f"Results saved to {output_file}")

Results saved to Mihaya_bert_topic_modeling_results.csv


In [51]:
# Download the output file if running in Colab
# Hands `output_file` (written by the save cell) to google.colab's download helper.
# NOTE(review): the import below is unconditional, so outside Colab this cell
# would presumably fail on the import rather than be skipped — confirm whether
# a guard is wanted.
from google.colab import files
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>