<a href="https://colab.research.google.com/github/echung2/echung2/blob/master/Hanna_Ren_Miyaha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install -U ginza ja_ginza
# Install GiNZA, an open Source Japanese NLP library, which is not pre-installed on Google Colab
# It may take several minutes




In [14]:
# Import library for processing Japanese texts
import spacy
import ginza

# Import library for topic modeling
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel

# Import library for uploading files
from google.colab import files

# Import library for reading files
import glob

# Import library for manipulating data more conveniently
import pandas as pd
import numpy as np

In [25]:
import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from google.colab import files
import re

# **Part 2: Read data**

# Upload the data on Google Colab

you can upload the file by clicking the folder-shaped icon ("Files") to open the file browser on the left side of the notebook. Click the upload button that appears at the top of the file browser (the up arrow button).

In [26]:
uploaded = files.upload()

Saving H_Miyaha_OCR.txt to H_Miyaha_OCR (1).txt


# **Part 3: Tokenize article text**

In [29]:
# Load text from uploaded file
file_name = list(uploaded.keys())[0]
with open(file_name, 'r', encoding='utf-8') as file:
    text = file.read()

In [30]:
# Preprocess text: remove special characters
text = re.sub(r'[^\w\s]', '', text)
text = re.sub(r'\s+', ' ', text).strip()

In [53]:
# Define Japanese stopwords
stopwords = set([
    'は', 'が', 'の', 'に', 'を', 'へ', 'と', 'で', 'や', 'も', 'から', 'まで', 'だけ', 'より', 'しか', 'ながら', 'など',
    '私', '僕', '俺', 'あなた', '君', '彼', '彼女', '私たち', 'あなたたち', '皆', 'これ', 'それ', 'あれ', 'どれ', 'ここ',
    'そこ', 'あそこ', 'どこ', 'とても', 'かなり', '少し', 'ちょっと', '大体', 'ほとんど', 'また', 'なぜ', 'どう', '何', 'いつ',
    'どこ', 'ただ', 'ようやく', 'やはり', 'さらに', 'すぐに', 'いる', 'ある', 'する', 'なる', '行く', '来る', '見る', '聞く',
    '言う', '思う', '感じる', '分かる', '知る', '良い', '悪い', '大きい', '小さい', '高い', '低い', '多い', '少ない', '簡単だ',
    '難しい', '長い', '短い', 'そして', 'しかし', 'それでも', 'だから', 'または', 'それに', 'つまり', 'なぜなら', 'ああ', 'うん',
    'ええ', 'はい', 'いいえ', 'おお', 'あれ', 'まあ', 'えっ', 'あっ', 'ふうん', 'こと', 'もの', 'ところ', 'よう', '例え',
    '例えば', '場合', 'ような', '様子', 'する', 'ある', 'いる', 'なる', 'れる', 'の', 'も', 'た', 'が', 'に', 'は', 'を', 'で', 'Page'
])

In [54]:
# Load the "ja_ginza" SpaCy pipeline
try:
    nlp = spacy.load("ja_ginza")
except Exception as e:
    print(f"Error loading ja_ginza: {e}")

In [55]:
# Split text into chunks if necessary
chunk_size = 5000
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

In [56]:
# Tokenize and filter words
tokens = []
for chunk in chunks:
    doc = nlp(chunk)
    for token in doc:
        if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
            tokens.append(token.text)

In [58]:
# Display extracted tokens for verification (optional)
print(f"Filtered Tokens: {tokens[:50]}")

Filtered Tokens: ['Page', '羽', '贈る', '拳銃', 'Page', 'Page', '羽', '贈る', '拳銃', '手', '中', '銃', 'ある', 'つめたく', '光る', '黒色', '銃', 'きみ', '小さな', '手のひら', '慣れ', '真', '黒', '重み', '戸惑い', '汗', 'いる', '真夜中', '忍び込ん', '書斎', '無数', '傷', '刻ま', '年', '代物', '机', '色褪せ', '木目', '怪物', '眼球', '怯え', '机', '引き出し', '刃先', '錆び', 'IMRI', '用', '接続', 'プラグ', 'レントゲン']


In [59]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]

In [61]:
# Apply LDA for topic modeling with additional parameters
num_topics = 20  # Specify the number of topics
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,  # Use the dictionary created above
    passes=20,  # Increase passes for better convergence
    iterations=400,  # Increase iterations for better optimization
    random_state=50  # For reproducibility
)

# Display topics
for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
    print(f"Topic {idx + 1}: {topic}")

Topic 1: 0.000*"い" + 0.000*"継" + 0.000*"実" + 0.000*"羽" + 0.000*"こと" + 0.000*"いる" + 0.000*"神" + 0.000*"し" + 0.000*"脳" + 0.000*"インプラント"
Topic 2: 0.017*"い" + 0.017*"こと" + 0.017*"羽" + 0.015*"継" + 0.014*"実" + 0.013*"いる" + 0.010*"神" + 0.009*"脳" + 0.009*"し" + 0.007*"自分"
Topic 3: 0.000*"羽" + 0.000*"いる" + 0.000*"こと" + 0.000*"実" + 0.000*"い" + 0.000*"神" + 0.000*"継" + 0.000*"脳" + 0.000*"し" + 0.000*"インプラント"
Topic 4: 0.000*"漆黒" + 0.000*"滾々" + 0.000*"演出" + 0.000*"演じ" + 0.000*"漏れ聞こえ" + 0.000*"漏らす" + 0.000*"漏らし" + 0.000*"潤し" + 0.000*"滞り" + 0.000*"滞っ"
Topic 5: 0.000*"漆黒" + 0.000*"滾々" + 0.000*"演出" + 0.000*"演じ" + 0.000*"漏れ聞こえ" + 0.000*"漏らす" + 0.000*"漏らし" + 0.000*"潤し" + 0.000*"滞り" + 0.000*"滞っ"
Topic 6: 0.000*"い" + 0.000*"羽" + 0.000*"いる" + 0.000*"継" + 0.000*"こと" + 0.000*"実" + 0.000*"脳" + 0.000*"神" + 0.000*"し" + 0.000*"自分"
Topic 7: 0.000*"漆黒" + 0.000*"滾々" + 0.000*"演出" + 0.000*"演じ" + 0.000*"漏れ聞こえ" + 0.000*"漏らす" + 0.000*"漏らし" + 0.000*"潤し" + 0.000*"滞り" + 0.000*"滞っ"
Topic 8: 0.000*"こと" + 0.000*"羽" + 0.000*"い" + 

In [62]:
# Apply LDA for topic modeling
#** num_topics = 10  # Specify the number of topics
#** lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=50)

For this process, you'll use SpaCy, a natural language processing library, and GiNZA, a Japanese NLP library that works on spaCy.

이 과정에서는 자연어 처리 라이브러리인 SpaCy와 spaCy에서 작동하는 일본어 NLP 라이브러리인 GiNZA를 사용하게 됩니다.

In [63]:
import csv

# Extract topics and words with probabilities
topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)



In [64]:
# Prepare data for CSV
csv_data = []
for topic_idx, topic_words in topics:
    topic_row = [f"Topic {topic_idx + 1}"]
    for word, prob in topic_words:
        topic_row.append(f"{word} ({prob:.4f})")
    csv_data.append(topic_row)


In [69]:
import csv

# Extract topics and words with probabilities
topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)



In [70]:
# Prepare data for CSV
csv_data = []
for topic_idx, topic_words in topics:
    topic_row = [f"Topic {topic_idx + 1}"]
    for word, prob in topic_words:
        topic_row.append(f"{word} ({prob:.4f})")
    csv_data.append(topic_row)



In [71]:
# Save the topics to a CSV file with UTF-8 with BOM encoding
output_file = "output-1.csv"
with open(output_file, mode='w', encoding='utf-8-sig', newline='') as file:  # 'utf-8-sig' to add BOM
    writer = csv.writer(file)
    # Write header
    writer.writerow(["Topic", "Words with Probabilities"])
    # Write topics and words
    for row in csv_data:
        writer.writerow(row)

print(f"Results saved to {output_file}")

Results saved to output-1.csv
