In [24]:
# Replace the ssl module's default HTTPS context factory with the unverified
# one, so HTTPS requests that use the default context skip certificate checks.
# NOTE(review): this turns off TLS verification for everything run after it in
# the same process, and the only download call visible in this cell
# (fetch_20newsgroups) is commented out -- presumably a leftover download
# workaround; confirm nothing else needs it and remove.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import Bunch

# Load the data.
# The real corpus download below is disabled; a four-sentence toy corpus wrapped
# in a Bunch stands in for it so the rest of the cell can keep reading `.data`.
# newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
data = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
newsgroups_train = Bunch(data=data)

# Initialise CountVectorizer and TfidfVectorizer to turn the text into features.
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# BoW (Bag-of-Words) feature extraction.
X_count = count_vectorizer.fit_transform(newsgroups_train.data)

# TF-IDF (Term Frequency-Inverse Document Frequency) feature extraction.
X_tfidf = tfidf_vectorizer.fit_transform(newsgroups_train.data)

# Check the dimensions of the extracted feature matrices.
print("BoW 피처 shape:", X_count.shape)
print("TF-IDF 피처 shape:", X_tfidf.shape)

# Fetch the vocabulary learned by the fitted CountVectorizer.
feature_names = count_vectorizer.get_feature_names_out()

# Print the first 100 words; the slice bound matches the count stated in the
# printed label ("first 100 words"). Shorter vocabularies are printed in full.
print("처음 100개의 단어:", feature_names[:100])




BoW 피처 shape: (4, 9)
TF-IDF 피처 shape: (4, 9)
처음 100개의 단어: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus used to demonstrate frequency-based vocabulary filtering.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Minimum total count a word needs in order to be kept.
min_frequency = 2

# Fit a plain bag-of-words model on the corpus.
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(corpus)

# Column sums of the document-term matrix give each word's total count.
word_frequencies = X.sum(axis=0)

# Pair every vocabulary word with its total count; the sum is taken as the
# first (and only) row of its list form.
word_freq_dict = {
    word: freq
    for word, freq in zip(
        count_vectorizer.get_feature_names_out(),
        word_frequencies.tolist()[0],
    )
}
print(word_freq_dict)

# Keep only the words whose total count reaches the threshold.
filtered_words = [
    word for word in word_freq_dict if word_freq_dict[word] >= min_frequency
]

# Build a second vectorizer restricted to the surviving words and re-encode
# the corpus with it.
filtered_count_vectorizer = CountVectorizer(vocabulary=filtered_words)
X_filtered = filtered_count_vectorizer.fit_transform(corpus)

# Show the retained vocabulary, then the filtered document-term matrix.
print("Filtered Words:", filtered_count_vectorizer.get_feature_names_out())
print("Filtered Features:", X_filtered.toarray(), sep="\n")


{'and': 1, 'document': 4, 'first': 2, 'is': 4, 'one': 1, 'second': 1, 'the': 4, 'third': 1, 'this': 4}
Filtered Words: ['document' 'first' 'is' 'the' 'this']
Filtered Features:
[[1 1 1 1 1]
 [2 0 1 1 1]
 [0 0 1 1 1]
 [1 1 1 1 1]]
