# 뉴스 데이터셋 LSA

## 1.데이터 로드

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# CSV 파일을 불러와 DataFrame으로 변환
news_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/news.csv')  # 파일 경로에 맞게 변경

# 상위 1000개의 리뷰만 사용
news_df = news_df.head(1000)

# 첫 번째 리뷰 출력
print("첫 번째 리뷰:")
print(news_df['Description'][0])  # 첫 번째 리뷰 출력
print('리뷰 개수:', len(news_df))

첫 번째 리뷰:
Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
리뷰 개수: 1000


In [3]:
news_df[['Title', 'Description']]

Unnamed: 0,Title,Description
0,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...
995,U.S. Stocks Rebound as Oil Prices Ease,NEW YORK (Reuters) - U.S. stocks rebounded on...
996,Dollar Rises Vs Euro After Asset Data,NEW YORK (Reuters) - The dollar gained agains...
997,Bikes Bring Internet to Indian Villagers (AP),"AP - For 12-year-old Anju Sharma, hope for a b..."
998,Celebrity Chefs Are Everywhere in Vegas,By ADAM GOLDMAN LAS VEGAS (AP) -- The waite...


## 2.전처리

In [8]:
news_df['Description'] = news_df['Description'].str.replace('[^a-zA-z]',' ', regex=True)
news_df['clean'] = news_df['Description'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean'] = news_df['clean'].apply(lambda x: x.lower())
news_df['clean']

Unnamed: 0,clean
0,reuters short sellers wall street dwindling\ba...
1,reuters private investment firm carlyle group ...
2,reuters soaring crude prices plus worries\abou...
3,reuters authorities have halted export\flows f...
4,tearaway world prices toppling records straini...
...,...
995,york reuters stocks rebounded monday prices re...
996,york reuters dollar gained against euro monday...
997,year anju sharma hope better life arrives poor...
998,adam goldman vegas waiter appears tableside co...


## 3.TF-IDF 벡터화

In [72]:
tfidf = TfidfVectorizer(stop_words='english',
                        max_features = 500,
                        max_df= 0.5,
                        smooth_idf=True)

X = tfidf.fit_transform(news_df['clean'])

X.shape

(1000, 500)

In [85]:
df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
df

Unnamed: 0,according,added,administration,afghanistan,agency,ahead,america,american,americans,angeles,...,work,workers,working,world,worries,year,years,yesterday,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.295362,0.000000,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.341306,0.000000,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.357440,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.352552,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.210328,0.0,0.0,0.000000,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0


## 4.LSA 적용

In [None]:
encoder = OneHotEncoder()
encoder.fit(x_train)

In [86]:
svd_model = TruncatedSVD(n_components = 20,
                         algorithm = 'randomized',
                         n_iter =10,
                         random_state =42
                         )

svd_model.fit(X)

print(np.shape(svd_model.components_))
print(f'문서별 단어 수 : {len(svd_model.components_[0])}')
print(f'토픽 수 : {len(svd_model.components_)}')

(20, 500)
문서별 단어 수 : 500
토픽 수 : 20


In [73]:
svd_model.components_[19]

array([-6.16204531e-03, -1.94790659e-02,  3.90765671e-03,  1.23065243e-02,
       -5.68898131e-02,  3.71405089e-03,  3.64211223e-02,  3.08705972e-03,
        9.91242541e-03,  6.69521993e-03, -1.94931372e-02, -4.86269471e-02,
       -3.50929151e-02, -2.30063486e-02,  3.93407978e-04, -1.23121958e-02,
       -9.06102513e-03,  1.56624827e-02,  4.16114702e-03, -1.40789054e-03,
       -1.23059877e-02, -1.64556638e-02, -1.79134290e-02,  2.00713021e-02,
       -1.64930068e-02,  1.80393678e-04, -4.20854278e-03,  5.92312092e-04,
        7.67562830e-04, -5.11365454e-03,  2.00724156e-02, -2.58692261e-02,
        6.49184148e-03,  9.64686036e-02, -3.59435617e-02,  2.73545176e-02,
        2.14111751e-03,  5.57654236e-02,  6.25308650e-03, -2.84026683e-02,
       -2.14375953e-03, -3.86706489e-03,  1.93364738e-04,  1.37399512e-02,
        4.42277879e-02,  1.01147417e-03, -3.96436362e-03, -2.33895957e-02,
       -5.52536767e-02, -2.48526863e-02,  8.90016825e-03, -8.06854451e-03,
       -1.48502086e-02, -

## 5.토픽 확인
- 토픽별 단어 확인
- 문서별 토픽 비율 확인

In [89]:
# 각 문서의 토픽 비율 오름차순 정렬
svd_model.components_.argsort()

array([[244, 220,  70, ..., 378, 273, 370],
       [378,  74, 495, ..., 440, 297,  19],
       [ 74, 411, 495, ..., 273, 370, 332],
       ...,
       [464, 334, 496, ..., 240,  74, 387],
       [493, 411, 161, ..., 265, 153, 314],
       [445, 493, 391, ...,  74, 453, 240]])

In [51]:
#[::-1]로 내림차순 정렬, [:5] 상위 5개의 인덱스 출력
svd_model.components_.argsort()[0][::-1][:5]

array([741, 554, 768, 868,  43])

In [54]:
# 상위 단어와 토픽 비율 확인
print(f'상위 단어 : {tfidf.get_feature_names_out()[741]}' )
print(f'상위 단어 토픽 비율 : {svd_model.components_[0][741].round(5)}')

상위 단어 : reuters
상위 단어 토픽 비율 : 0.36078


In [67]:
terms = tfidf.get_feature_names_out()
components = svd_model.components_

def get_topic(term, components, n=5):
    for idx, component  in enumerate(components):
        sorted_component = component.argsort()[::-1][:5]
        print(f'Topic {idx+1}', [(term[i],component[i]) for i in sorted_component])

get_topic(terms, components)

Topic 1 [('reuters', 0.36078464838632784), ('monday', 0.2858024309590395), ('said', 0.25854543760451965), ('sunday', 0.22912959251379716), ('athens', 0.1700873213037877)]
Topic 2 [('athens', 0.4113994338807743), ('olympic', 0.30550951499650636), ('sunday', 0.26808364695290926), ('team', 0.2341358365628493), ('phelps', 0.16976309961267036)]
Topic 3 [('president', 0.286280740418453), ('chavez', 0.2341695129499811), ('hugo', 0.2341695129499811), ('referendum', 0.22615817488026188), ('venezuela', 0.22481251193909224)]
Topic 4 [('monday', 0.29149868817734914), ('reuters', 0.2498172922661257), ('profit', 0.14037748205423242), ('sales', 0.11385385724078605), ('billion', 0.11015951832450437)]
Topic 5 [('space', 0.2823486764453481), ('said', 0.24432051140159172), ('people', 0.11672632492225068), ('troops', 0.115515363832914), ('national', 0.11017528299319841)]
Topic 6 [('space', 0.5116607528191993), ('nasa', 0.1650330843449567), ('earth', 0.15355494044358384), ('google', 0.15305639734314871), (

In [91]:
def get_topic(term, components, n=5):
    for component in components:
        sorted_component = component.argsort()[::-1][:5]

        topic = []
        for i in sorted_component:
            topic.append((term[i],component[i]))
        print(topic)

get_topic(terms, components)

[('reuters', 0.36078464838632784), ('monday', 0.2858024309590395), ('said', 0.25854543760451965), ('sunday', 0.22912959251379716), ('athens', 0.1700873213037877)]
[('athens', 0.4113994338807743), ('olympic', 0.30550951499650636), ('sunday', 0.26808364695290926), ('team', 0.2341358365628493), ('phelps', 0.16976309961267036)]
[('president', 0.286280740418453), ('chavez', 0.2341695129499811), ('hugo', 0.2341695129499811), ('referendum', 0.22615817488026188), ('venezuela', 0.22481251193909224)]
[('monday', 0.29149868817734914), ('reuters', 0.2498172922661257), ('profit', 0.14037748205423242), ('sales', 0.11385385724078605), ('billion', 0.11015951832450437)]
[('space', 0.2823486764453481), ('said', 0.24432051140159172), ('people', 0.11672632492225068), ('troops', 0.115515363832914), ('national', 0.11017528299319841)]
[('space', 0.5116607528191993), ('nasa', 0.1650330843449567), ('earth', 0.15355494044358384), ('google', 0.15305639734314871), ('telescope', 0.13605921408350344)]
[('google', 0

### 문서별 관련 토픽 찾기

In [114]:
# 문서마다 각 토픽의 중요도를 담은 매트릭스
lsa_matrix = svd_model.transform(X)

print(dict(zip(('문서 수', '토픽 수'),lsa_matrix.shape)))
print()

lsa_matrix

{'문서 수': 1000, '토픽 수': 20}



array([[ 0.14343675, -0.01759823,  0.09317176, ...,  0.02020201,
        -0.01825019,  0.00925559],
       [ 0.15964832, -0.04767284, -0.0057305 , ...,  0.04243396,
        -0.08485973, -0.03880619],
       [ 0.19606586, -0.11973428,  0.09078609, ...,  0.09270937,
        -0.0762865 , -0.02177904],
       ...,
       [ 0.12808891, -0.11170004, -0.12824925, ..., -0.09916582,
        -0.05432146,  0.04090212],
       [ 0.02545917, -0.00460551,  0.00720685, ...,  0.01354914,
        -0.00724491, -0.00886323],
       [ 0.12471805, -0.15495603, -0.21009882, ..., -0.00106841,
         0.12861671, -0.02016896]])

In [131]:
# 문서별 중요 토픽
def get_doc_topic(lsa_matrix, n=5):
    # 각 문서 for문
    for idx, doc in enumerate(lsa_matrix):
        print(f'Doc {idx+1}', [(f'topic{i+1}', doc[i]) for i in doc.argsort()[::-1][:5]])
        if idx==10:
            break

get_doc_topic(lsa_matrix, n=5)

Doc 1 [('topic1', 0.14343674860255257), ('topic4', 0.1070776052501462), ('topic3', 0.09317175850338036), ('topic16', 0.08693080662679549), ('topic8', 0.05821059379341596)]
Doc 2 [('topic1', 0.1596483233201855), ('topic4', 0.07778063370386779), ('topic15', 0.07544464259447874), ('topic13', 0.06179269600788939), ('topic18', 0.04243395884324738)]
Doc 3 [('topic14', 0.2997926799791099), ('topic1', 0.19606585598543808), ('topic8', 0.19162411762560766), ('topic4', 0.18988584540136705), ('topic11', 0.155181076912483)]
Doc 4 [('topic1', 0.2279140360177577), ('topic5', 0.15314721972304848), ('topic9', 0.107352467019582), ('topic13', 0.10287819295352979), ('topic6', 0.07400436976579013)]
Doc 5 [('topic11', 0.12739823981304182), ('topic1', 0.10148176147737148), ('topic15', 0.08063253488680121), ('topic7', 0.0646838713529491), ('topic14', 0.06406716890412167)]
Doc 6 [('topic1', 0.1768991879073473), ('topic8', 0.1438014982811088), ('topic11', 0.10395049379654883), ('topic7', 0.08593273010557334), (