<a href="https://colab.research.google.com/github/flrain2/Study/blob/main/%EB%B9%88%EB%8F%84%EB%B6%84%EC%84%9D_%EA%B2%BD%EC%98%81%ED%95%99%ED%9A%8C_240427.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 빈도분석

In [None]:
 # Install the KoNLPy package (pulls in JPype1 as a dependency).
 !pip install konlpy
 # Install a headless OpenJDK 8 quietly; stdout is discarded.
 # JAVA_HOME is set to this JDK's path in the import cell.
 !apt-get install openjdk-8-jdk-headless -qq > /dev/null

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import pandas as pd
import re
from konlpy.tag import Okt
from collections import Counter
from gensim import corpora, models

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used in the cells
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. Preprocessing function
def preprocess_text(text):
    """Strip e-mail/domain tails and ASCII letters from one sentence.

    Args:
        text: Raw sentence. Non-string values (for example the float NaN
            or None that pandas yields for an empty CSV cell) are treated
            as empty text instead of raising ``TypeError`` in ``re.sub``.

    Returns:
        The cleaned string with surrounding whitespace removed; ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''
    # Drop everything from ".co.kr", ".com" or "@" up to the end of the
    # line ("." does not match a newline, so following lines are kept).
    text = re.sub(r'\.co\.kr.*|\.com.*|@.*', '', text)
    # Remove runs of ASCII letters (English words).
    text = re.sub(r'[a-zA-Z]+', '', text)
    return text.strip()

# 2. Frequency analysis function
def frequency_analysis(texts, top_n=30):
    """Count the nouns found in all texts and return the most frequent ones.

    Args:
        texts: Iterable of strings; each one is passed to ``Okt.nouns``.
        top_n: Number of (noun, count) pairs to return. Defaults to 30,
            the value that used to be hard-coded.

    Returns:
        List of ``(noun, count)`` tuples, most frequent first.
    """
    okt = Okt()
    # Nouns of a single character are ignored.
    count = Counter(
        noun
        for text in texts
        for noun in okt.nouns(text)
        if len(noun) > 1
    )
    return count.most_common(top_n)

# 3. Topic modeling function
def topic_modeling(texts, num_topics=5, num_words=5, passes=15, random_state=None):
    """Fit an LDA model on the nouns of the texts and return its topics.

    Args:
        texts: Iterable of strings; each one is passed to ``Okt.nouns``.
        num_topics: Number of LDA topics to fit (default 5).
        num_words: Number of words shown per topic (default 5).
        passes: Number of training passes over the corpus (default 15).
        random_state: Optional seed forwarded to ``LdaModel``. With the
            default ``None`` results vary between runs, as before.

    Returns:
        The result of ``LdaModel.print_topics``: a list of
        ``(topic_id, 'weight*"word" + ...')`` pairs. An empty list is
        returned when no text yields a noun longer than one character.
    """
    okt = Okt()
    # Tokenize into nouns and drop single-character nouns in one step.
    tokenized = [
        [word for word in okt.nouns(text) if len(word) > 1]
        for text in texts
    ]
    # Without a single term there is nothing to train on; gensim's LdaModel
    # rejects an empty vocabulary, so return early instead.
    if not any(tokenized):
        return []
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    # Pass num_topics explicitly so every fitted topic is listed even when
    # more topics are requested than print_topics shows by default.
    return lda_model.print_topics(num_topics=num_topics, num_words=num_words)

# 1. MS

In [None]:
# Read the MS crawl CSV from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MS_craw.csv')
# Clean each entry of 'text_sentence' with preprocess_text and keep the
# result in a new 'processed_text' column.
df['processed_text'] = df['text_sentence'].map(preprocess_text)

#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned sentences.
frequencies = frequency_analysis(df['processed_text'].tolist())
topics = topic_modeling(df['processed_text'].tolist())

# Print the frequency list under its heading, then one line per topic.
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

빈도분석 결과:
[('기업', 1857), ('서비스', 1520), ('투자', 1494), ('데이터', 1356), ('미국', 1324), ('클라우드', 1251), ('시장', 1181), ('기술', 1140), ('구글', 1020), ('오픈', 1020), ('개발', 904), ('지능', 875), ('반도체', 857), ('마이크로소프트', 844), ('모델', 832), ('인공', 813), ('센터', 796), ('엔비디아', 741), ('기자', 741), ('지난해', 670), ('성형', 664), ('위해', 642), ('대한', 639), ('애플', 630), ('메타', 622), ('중국', 611), ('통해', 604), ('제공', 604), ('전자', 591), ('일본', 584)]

토픽 모델링 결과:
Topic 1: (0, '0.020*"클라우드" + 0.016*"서비스" + 0.014*"기업" + 0.014*"메타" + 0.013*"모델"')
Topic 2: (1, '0.014*"반도체" + 0.011*"엔비디아" + 0.010*"전자" + 0.010*"시장" + 0.010*"기업"')
Topic 3: (2, '0.016*"투자" + 0.011*"기업" + 0.009*"클라우드" + 0.008*"아마존" + 0.007*"서비스"')
Topic 4: (3, '0.012*"오픈" + 0.011*"투자" + 0.009*"서비스" + 0.009*"데이터" + 0.009*"기술"')
Topic 5: (4, '0.012*"미국" + 0.012*"언론사" + 0.010*"일본" + 0.008*"구독" + 0.008*"데이터"')


In [None]:
# Convert the results to DataFrames.
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files on the mounted Drive (no index column).
frequencies_df.to_csv('/content/drive/MyDrive/Colab Notebooks/frequency_analysis_MS.csv', index=False)
topics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_MS.csv', index=False)

# Report the file names that were actually written, including the _MS suffix.
print("파일 저장 완료: 'frequency_analysis_MS.csv' 와 'topic_modeling_results_MS.csv'")

파일 저장 완료: 'frequency_analysis.csv' 와 'topic_modeling_results.csv'


# 2. APPLE

In [None]:
# Read the apple crawl CSV from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/apple_craw.csv')
# Clean each entry of 'text_sentence' with preprocess_text and keep the
# result in a new 'processed_text' column.
df['processed_text'] = df['text_sentence'].map(preprocess_text)

#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned sentences.
frequencies = frequency_analysis(df['processed_text'].tolist())
topics = topic_modeling(df['processed_text'].tolist())

# Print the frequency list under its heading, then one line per topic.
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

빈도분석 결과:
[('애플', 3161), ('시장', 1637), ('미국', 1592), ('기업', 1330), ('서비스', 1235), ('기술', 1139), ('투자', 1074), ('중국', 1074), ('반도체', 957), ('구글', 956), ('전자', 951), ('엔비디아', 903), ('지능', 790), ('올해', 785), ('삼성', 761), ('인공', 759), ('세계', 739), ('개발', 729), ('기자', 706), ('대한', 647), ('위해', 611), ('모델', 607), ('출시', 598), ('아이폰', 587), ('글로벌', 586), ('게임', 585), ('경쟁', 581), ('지난', 577), ('주가', 576), ('지난해', 567)]

토픽 모델링 결과:
Topic 1: (0, '0.013*"미국" + 0.012*"금리" + 0.011*"상승" + 0.010*"지수" + 0.010*"시장"')
Topic 2: (1, '0.012*"전자" + 0.009*"시장" + 0.008*"삼성" + 0.008*"사업" + 0.008*"서비스"')
Topic 3: (2, '0.026*"애플" + 0.011*"반도체" + 0.010*"구글" + 0.009*"기업" + 0.009*"시장"')
Topic 4: (3, '0.023*"애플" + 0.009*"게임" + 0.009*"기업" + 0.008*"미국" + 0.007*"서비스"')
Topic 5: (4, '0.017*"언론사" + 0.014*"구독" + 0.010*"통신" + 0.010*"서비스" + 0.009*"원금"')


In [None]:
# Convert the results to DataFrames.
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files on the mounted Drive (no index column).
frequencies_df.to_csv('/content/drive/MyDrive/Colab Notebooks/frequency_analysis_apple.csv', index=False)
topics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_apple.csv', index=False)

# Report the file names that were actually written, including the _apple suffix.
print("파일 저장 완료: 'frequency_analysis_apple.csv' 와 'topic_modeling_results_apple.csv'")

파일 저장 완료: 'frequency_analysis.csv' 와 'topic_modeling_results.csv'


# 3. aramco

In [None]:
# Read the aramco crawl CSV from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv')
# Clean each entry of 'text_sentence' with preprocess_text and keep the
# result in a new 'processed_text' column.
df['processed_text'] = df['text_sentence'].map(preprocess_text)

#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned sentences.
frequencies = frequency_analysis(df['processed_text'].tolist())
topics = topic_modeling(df['processed_text'].tolist())

# Print the frequency list under its heading, then one line per topic.
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

빈도분석 결과:
[('사우디', 1702), ('협력', 1449), ('측은', 1264), ('기업', 1185), ('사업', 1131), ('분야', 1025), ('한국', 908), ('기술', 878), ('투자', 765), ('양국', 758), ('그룹', 655), ('네이버', 623), ('글로벌', 572), ('세계', 560), ('위해', 559), ('시장', 553), ('서비스', 546), ('엔비디아', 543), ('산업', 507), ('아람', 503), ('현대', 479), ('디지털', 475), ('미래', 471), ('미국', 461), ('중공업', 440), ('평가', 437), ('개발', 437), ('사우디아라비아', 435), ('수소', 428), ('국제', 423)]

토픽 모델링 결과:
Topic 1: (0, '0.037*"측은" + 0.036*"협력" + 0.032*"사우디" + 0.024*"분야" + 0.022*"양국"')
Topic 2: (1, '0.017*"네이버" + 0.015*"엔비디아" + 0.014*"기업" + 0.013*"기술" + 0.011*"디지털"')
Topic 3: (2, '0.019*"사업" + 0.018*"그룹" + 0.016*"현대" + 0.015*"중공업" + 0.011*"사장"')
Topic 4: (3, '0.011*"기업" + 0.010*"기술" + 0.010*"세계" + 0.010*"플랫폼" + 0.009*"경제"')
Topic 5: (4, '0.011*"사업" + 0.009*"기업" + 0.007*"중국" + 0.007*"미국" + 0.006*"투자"')


In [None]:
# Convert the results to DataFrames.
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files on the mounted Drive (no index column).
frequencies_df.to_csv('/content/drive/MyDrive/Colab Notebooks/frequency_analysis_aramco.csv', index=False)
topics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_aramco.csv', index=False)

# Report the file names that were actually written, including the _aramco suffix.
print("파일 저장 완료: 'frequency_analysis_aramco.csv' 와 'topic_modeling_results_aramco.csv'")

파일 저장 완료: 'frequency_analysis.csv' 와 'topic_modeling_results.csv'


# 4. NVIDIA (envidia)

In [None]:
# Read the envidia crawl CSV from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv')
# Clean each entry of 'text_sentence' with preprocess_text and keep the
# result in a new 'processed_text' column.
df['processed_text'] = df['text_sentence'].map(preprocess_text)

#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned sentences.
frequencies = frequency_analysis(df['processed_text'].tolist())
topics = topic_modeling(df['processed_text'].tolist())

# Print the frequency list under its heading, then one line per topic.
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

빈도분석 결과:
[('반도체', 2044), ('엔비디아', 1771), ('기업', 1603), ('미국', 1565), ('투자', 1313), ('시장', 1247), ('개발', 1133), ('기술', 1121), ('서비스', 1046), ('인텔', 946), ('전자', 876), ('데이터', 845), ('클라우드', 827), ('지능', 789), ('인공', 742), ('네이버', 727), ('기자', 668), ('구글', 659), ('모델', 653), ('센터', 593), ('고객', 580), ('위해', 574), ('올해', 568), ('대한', 558), ('삼성', 555), ('상승', 552), ('오픈', 541), ('지수', 540), ('중국', 535), ('생산', 534)]

토픽 모델링 결과:
Topic 1: (0, '0.010*"바이오" + 0.010*"기업" + 0.009*"기술" + 0.009*"투자" + 0.008*"지금"')
Topic 2: (1, '0.014*"투자" + 0.011*"반도체" + 0.011*"엔비디아" + 0.009*"기업" + 0.009*"미국"')
Topic 3: (2, '0.021*"반도체" + 0.014*"엔비디아" + 0.012*"인텔" + 0.012*"기업" + 0.011*"개발"')
Topic 4: (3, '0.019*"언론사" + 0.015*"구독" + 0.009*"뉴스" + 0.008*"기사" + 0.008*"분류"')
Topic 5: (4, '0.014*"지수" + 0.014*"미국" + 0.013*"상승" + 0.010*"하락" + 0.009*"시장"')


In [None]:
# Convert the results to DataFrames.
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files on the mounted Drive (no index column).
frequencies_df.to_csv('/content/drive/MyDrive/Colab Notebooks/frequency_analysis_envidia.csv', index=False)
topics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_envidia.csv', index=False)

# Report the file names that were actually written, including the _envidia suffix.
print("파일 저장 완료: 'frequency_analysis_envidia.csv' 와 'topic_modeling_results_envidia.csv'")

파일 저장 완료: 'frequency_analysis.csv' 와 'topic_modeling_results.csv'


# 5. amazon

In [None]:
# Read the amazon crawl CSV from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv')
# Clean each entry of 'text_sentence' with preprocess_text and keep the
# result in a new 'processed_text' column.
df['processed_text'] = df['text_sentence'].map(preprocess_text)

#'/content/drive/MyDrive/Colab Notebooks/MS_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/apple_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/aramco_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/envidia_craw.csv'
#'/content/drive/MyDrive/Colab Notebooks/amazon_craw.csv'

In [None]:
# Run the frequency analysis and the topic modeling on the cleaned sentences.
frequencies = frequency_analysis(df['processed_text'].tolist())
topics = topic_modeling(df['processed_text'].tolist())

# Print the frequency list under its heading, then one line per topic.
print("빈도분석 결과:", frequencies, sep="\n")
print("\n토픽 모델링 결과:")
for idx, topic in enumerate(topics):
    print(f"Topic {idx+1}: {topic}")

빈도분석 결과:
[('기업', 2020), ('투자', 1597), ('클라우드', 1516), ('시장', 1454), ('미국', 1389), ('서비스', 1384), ('기술', 1319), ('데이터', 1276), ('엔비디아', 1135), ('아마존', 1124), ('개발', 1043), ('사업', 960), ('센터', 953), ('글로벌', 948), ('구글', 901), ('반도체', 890), ('지능', 872), ('애플', 865), ('인공', 778), ('모델', 769), ('지난해', 736), ('세계', 699), ('위해', 698), ('오픈', 685), ('대표', 683), ('로봇', 680), ('기자', 679), ('산업', 666), ('국내', 660), ('제공', 645)]

토픽 모델링 결과:
Topic 1: (0, '0.023*"투자" + 0.014*"구글" + 0.013*"아마존" + 0.012*"오픈" + 0.009*"기업"')
Topic 2: (1, '0.015*"애플" + 0.012*"미국" + 0.009*"시장" + 0.007*"게임" + 0.007*"기업"')
Topic 3: (2, '0.021*"로봇" + 0.019*"엔비디아" + 0.015*"데이터" + 0.015*"센터" + 0.010*"기술"')
Topic 4: (3, '0.014*"언론사" + 0.011*"구독" + 0.008*"기업" + 0.007*"뉴스" + 0.006*"기사"')
Topic 5: (4, '0.024*"클라우드" + 0.016*"기업" + 0.013*"서비스" + 0.011*"사업" + 0.010*"시장"')


In [None]:
# Convert the results to DataFrames.
frequencies_df = pd.DataFrame(frequencies, columns=['Word', 'Frequency'])
topics_df = pd.DataFrame(topics, columns=['Topic', 'Words'])

# Save both tables as CSV files on the mounted Drive (no index column).
frequencies_df.to_csv('/content/drive/MyDrive/Colab Notebooks/frequency_analysis_amazon.csv', index=False)
topics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/topic_modeling_results_amazon.csv', index=False)

# Report the file names that were actually written, including the _amazon suffix.
print("파일 저장 완료: 'frequency_analysis_amazon.csv' 와 'topic_modeling_results_amazon.csv'")

파일 저장 완료: 'frequency_analysis.csv' 와 'topic_modeling_results.csv'
