# 빈도분석

In [None]:
 !pip install konlpy
 !apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
import os
# Set JAVA_HOME to the OpenJDK 8 location before konlpy is imported.
# NOTE(review): presumably konlpy's Okt tagger needs this to find the JVM
# installed by the setup cell — confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import pandas as pd
import re
from konlpy.tag import Okt
from collections import Counter
from gensim import corpora, models

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used in later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. 전처리 함수
def preprocess_text(text):
    text = re.sub(r'\.co\.kr.*|\.com.*|@.*', '', text)  # 특정 도메인 이후 텍스트 제거
    text = re.sub(r'[a-zA-Z]+', '', text)  # 영어 제거
    return text.strip()

# 2. Frequency analysis function
def frequency_analysis(texts, top_n=30):
    """Count how often each noun occurs across ``texts``.

    Args:
        texts: Iterable of strings to analyse.
        top_n: Number of most frequent nouns to return. Defaults to 30,
            the value that was previously hard-coded.

    Returns:
        A list of ``(noun, count)`` tuples as produced by
        ``Counter.most_common(top_n)``.
    """
    okt = Okt()
    counts = Counter(
        noun
        for text in texts
        for noun in okt.nouns(text)
        if len(noun) > 1  # ignore single-character nouns
    )
    return counts.most_common(top_n)

# 3. Topic modeling function
def topic_modeling(texts, num_topics=5, passes=15, num_words=5, random_state=None):
    """Fit an LDA model on the nouns of ``texts`` and describe its topics.

    Args:
        texts: Iterable of strings; each string is one document.
        num_topics: Number of LDA topics to fit (default 5).
        passes: Number of training passes over the corpus (default 15).
        num_words: Number of words shown per topic (default 5).
        random_state: Optional seed forwarded to ``LdaModel`` so repeated
            runs can give the same topics. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        The list returned by ``LdaModel.print_topics``, or an empty list
        when no noun longer than one character is found in ``texts``.
    """
    okt = Okt()
    # Tokenise each document into nouns, dropping single-character ones.
    tokenized = [
        [word for word in okt.nouns(text) if len(word) > 1]
        for text in texts
    ]
    dictionary = corpora.Dictionary(tokenized)
    # LdaModel refuses to train on an empty vocabulary; report "no topics"
    # instead of failing.
    if len(dictionary) == 0:
        return []
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    # Pass num_topics explicitly so every fitted topic is listed even when
    # more topics are requested than print_topics shows by default.
    return lda_model.print_topics(num_topics=num_topics, num_words=num_words)

# 1. MS

In [None]:
# Load the crawled CSV, then store the cleaned text in a new column
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
)
raw_sentences = df['text_sentence']
df['processed_text'] = raw_sentences.apply(preprocess_text)

# Paths noted alongside this cell (only the first one is read above):
#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned texts
processed_texts = df['processed_text'].tolist()
frequencies = frequency_analysis(processed_texts)
topics = topic_modeling(processed_texts)

# Print both results
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

In [None]:
# Convert the results into DataFrames
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files
frequency_csv_path = '/content/drive/MyDrive/Colab Notebooks/frequency_analysis_MS.csv'
topics_csv_path = '/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_MS.csv'
frequencies_df.to_csv(frequency_csv_path, index=False)
topics_df.to_csv(topics_csv_path, index=False)

# Report the paths that were actually written; the previous message named
# files without the "_MS" suffix, which did not match the saved files.
print(f"파일 저장 완료: '{frequency_csv_path}' 와 '{topics_csv_path}'")