# 1.DTM 예제

## 1.1.sklearn

In [92]:
# 필요한 라이브러리 불러오기
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from IPython.core import display as ICD

# Sample corpus: five short sentences that deliberately share some words
documents = [
    "오늘 날씨는 매우 좋다. 나는 기분이 좋다.",
    "오늘 커피를 마셨다. 커피는 정말 좋다.",
    "주말에 가족과 산책을 했다. 오늘은 날씨가 좋다.",
    "나는 책을 좋아한다. 독서는 매우 좋다.",
    "오늘 저녁에 영화를 볼 예정이다. 나는 영화를 좋아한다."
]

# Vectorizer with default settings.
# NOTE: the default token pattern only keeps tokens of two or more
# characters, so the one-character word "볼" gets no column in the result.
count_vect = CountVectorizer()

# Learn the vocabulary and count each term per document (sparse matrix)
TDM = count_vect.fit_transform(documents)

# Dense counts as a DataFrame: one row per document, one column per term
DTM = pd.DataFrame(
    data=TDM.toarray(),
    columns=count_vect.get_feature_names_out(),
)

# Display the document-term matrix
DTM

Unnamed: 0,가족과,기분이,나는,날씨가,날씨는,독서는,마셨다,매우,산책을,영화를,...,오늘은,저녁에,정말,좋다,좋아한다,주말에,책을,커피는,커피를,했다
0,0,1,1,0,1,0,0,1,0,0,...,0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,1,0
2,1,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
3,0,0,1,0,0,1,0,1,0,0,...,0,0,0,1,1,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,2,...,0,1,0,0,1,0,0,0,0,0


In [95]:
# Densify the sparse count matrix: one row per document, one column per vocabulary term
TDM.toarray()

array([[0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]])

## 1.2.gensim

In [47]:
# 필요한 라이브러리 불러오기
from gensim import corpora, matutils
import pandas as pd
from IPython.core import display as ICD

# Sample corpus: a flat list of raw sentence strings (not yet tokenized)
documents = [
    "오늘 날씨는 매우 좋다. 나는 기분이 좋다.",
    "오늘 커피를 마셨다. 커피는 정말 좋다.",
    "주말에 가족과 산책을 했다. 오늘은 날씨가 좋다.",
    "나는 책을 좋아한다. 독서는 매우 좋다.",
    "오늘 저녁에 영화를 볼 예정이다. 나는 영화를 좋아한다."
]

# Tokenize each document on whitespace.
# NOTE: punctuation stays attached to the word, so the vocabulary holds
# "좋다." (with the period) rather than "좋다".
tokens = [doc.split() for doc in documents]

# Build the gensim Dictionary that assigns an integer id to every token
id2words = corpora.Dictionary(tokens)

# Bag-of-words for each document: a list of (token id, count) pairs
BoW = [id2words.doc2bow(token) for token in tokens]

# Convert the sparse bag-of-words corpus into a dense array.
# corpus2dense lays the result out as (num_terms, num_docs): row i holds
# the counts of the token whose id is i.
DTM = matutils.corpus2dense(BoW, num_terms=len(id2words))

# Transpose so that rows are documents, and label the columns in explicit
# token-id order so each label is guaranteed to sit on its own count column
# instead of depending on the Dictionary's iteration order.
DTM_df = pd.DataFrame(
    DTM.T,
    columns=[id2words[term_id] for term_id in range(len(id2words))],
)

# Display the document-term matrix
DTM_df

Unnamed: 0,기분이,나는,날씨는,매우,오늘,좋다.,마셨다.,정말,커피는,커피를,...,오늘은,주말에,했다.,독서는,좋아한다.,책을,볼,영화를,예정이다.,저녁에
0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0,1.0


# 2.TF-IDF 예제

## 2.1.sklearn

In [97]:
# 필요한 라이브러리 불러오기
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from IPython.core import display as ICD

# A new sample corpus for the TF-IDF example
documents = [
    "AI 기술은 날로 발전하고 있다. 많은 회사들이 AI 연구에 투자하고 있다.",
    "기후 변화는 심각한 문제로 떠오르고 있다. 우리는 모두 지구를 보호해야 한다.",
    "AI 기술과 기후 변화는 현대 사회에서 중요한 이슈이다.",
    "건강한 식습관은 우리의 삶의 질을 향상시킬 수 있다.",
    "AI와 기후 변화, 그리고 건강한 식습관 모두 중요한 문제들이다."
]

# Vectorizer with default settings.
# NOTE: by default the text is lowercased ("AI" becomes the column "ai") and
# only tokens of two or more characters are kept.
tfidf = TfidfVectorizer()

# Learn the vocabulary and compute the TF-IDF weights (sparse matrix)
tf_array = tfidf.fit_transform(documents)

# Dense weights as a DataFrame: one row per document, one column per term
df = pd.DataFrame(
    data=tf_array.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# Display the TF-IDF table
df

Unnamed: 0,ai,ai와,건강한,그리고,기술과,기술은,기후,날로,떠오르고,많은,...,이슈이다,있다,중요한,지구를,질을,투자하고,한다,향상시킬,현대,회사들이
0,0.477951,0.0,0.0,0.0,0.0,0.296204,0.0,0.296204,0.0,0.296204,...,0.0,0.396743,0.0,0.0,0.0,0.296204,0.0,0.0,0.0,0.296204
1,0.0,0.0,0.0,0.0,0.0,0.0,0.220811,0.0,0.329711,0.0,...,0.0,0.220811,0.0,0.329711,0.0,0.0,0.329711,0.0,0.0,0.0
2,0.318882,0.0,0.0,0.0,0.395246,0.0,0.264701,0.0,0.0,0.0,...,0.395246,0.0,0.318882,0.0,0.0,0.0,0.0,0.0,0.395246,0.0
3,0.0,0.0,0.326676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.271171,0.0,0.0,0.404907,0.0,0.0,0.404907,0.0,0.0
4,0.0,0.367576,0.296558,0.367576,0.0,0.0,0.24617,0.0,0.0,0.0,...,0.0,0.0,0.296558,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.2.gensim

In [98]:
# 필요한 라이브러리 불러오기
from gensim import corpora, models, matutils
import pandas as pd
from IPython.core import display as ICD

# A new sample corpus for the TF-IDF example
documents = [
    "AI 기술은 날로 발전하고 있다. 많은 회사들이 AI 연구에 투자하고 있다.",
    "기후 변화는 심각한 문제로 떠오르고 있다. 우리는 모두 지구를 보호해야 한다.",
    "AI 기술과 기후 변화는 현대 사회에서 중요한 이슈이다.",
    "건강한 식습관은 우리의 삶의 질을 향상시킬 수 있다.",
    "AI와 기후 변화, 그리고 건강한 식습관 모두 중요한 문제들이다."
]

# Tokenize each document on whitespace (punctuation stays attached,
# e.g. "있다." and "변화," are kept as-is)
tokens = [doc.split() for doc in documents]

# Build the token <-> id Dictionary.
# NOTE: despite its name, `corpus` holds the gensim Dictionary, not the
# documents; the name is kept so code that refers to it keeps working.
corpus = corpora.Dictionary(tokens)

# Bag-of-words for each document: a list of (token id, count) pairs
BoW = [corpus.doc2bow(token) for token in tokens]

# Fit the TF-IDF model on the bag-of-words corpus
model = models.TfidfModel(BoW)

# Apply the model: a sparse corpus of (token id, tf-idf weight) pairs.
# It gets its own name so that TDM is bound only once, to the dense array.
tfidf_bow = model[BoW]

# Convert the sparse TF-IDF corpus into a dense array.
# corpus2dense lays the result out as (num_terms, num_docs): row i holds
# the weights of the token whose id is i.
TDM = matutils.corpus2dense(tfidf_bow, num_terms=len(corpus))

# Transpose so that rows are documents, and label the columns in explicit
# token-id order so each label is guaranteed to sit on its own weight column
# instead of depending on the Dictionary's iteration order.
tfidf_df = pd.DataFrame(
    TDM.T,
    columns=[corpus[term_id] for term_id in range(len(corpus))],
)

# Display the TF-IDF table
tfidf_df

Unnamed: 0,AI,기술은,날로,많은,발전하고,연구에,있다.,투자하고,회사들이,기후,...,수,식습관은,우리의,질을,향상시킬,AI와,그리고,문제들이다.,"변화,",식습관
0,0.386049,0.339042,0.339042,0.339042,0.339042,0.339042,0.21522,0.339042,0.339042,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.113285,0.0,0.0,0.113285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.252768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140916,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.125218,0.0,0.0,0.0,...,0.394519,0.394519,0.394519,0.394519,0.394519,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128793,...,0.0,0.0,0.0,0.0,0.0,0.405783,0.405783,0.405783,0.405783,0.405783
