# 코퍼스 불러오기

In [24]:
# Load the sample corpus: one document per line, with surrounding
# whitespace (including the trailing newline) removed from each line.
with open('sample_corpus.txt', encoding='utf-8') as f:
    corpus = [raw.strip() for raw in f]

corpus

['코로나 거리두기와 코로나 상생지원금 문의입니다.',
 '지하철 운행시간과 지하철 요금 문의입니다.',
 '지하철 승강장 문의입니다.',
 '코로나 선별진료소 문의입니다.',
 '버스 운행시간 문의입니다.',
 '버스 터미널 위치 안내입니다.',
 '코로나 거리두기 안내입니다.',
 '택시 승강장 문의입니다.']

# BoW(Bag of Words)
- scikit-learn 라이브러리 설치 (`pip install scikit-learn`)
- BoW를 만들 수 있는 CountVectorizer 객체 생성

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings, used below to build the bag-of-words counts.
cvect = CountVectorizer()

- 코퍼스의 모든 토큰으로 어휘 사전을 학습하고, 각 문서를 단어 빈도수가 들어 있는 DTM(Document Term Matrix)으로 변환

In [26]:
# DTM (Document-Term Matrix): learn the vocabulary from the corpus and
# count each token per document in a single step.
dtm = cvect.fit_transform(corpus)

# Show the matrix as a plain array (one row per document, one column per term).
dtm.toarray()

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [27]:
# Vocabulary learned by the fitted vectorizer; used below as the DTM column labels.
vocab = cvect.get_feature_names_out()
vocab

array(['거리두기', '거리두기와', '문의입니다', '버스', '상생지원금', '선별진료소', '승강장', '안내입니다',
       '요금', '운행시간', '운행시간과', '위치', '지하철', '코로나', '택시', '터미널'],
      dtype=object)

# DTM을 데이터프레임으로 만들어 보기

In [28]:
import pandas as pd

# Wrap the DTM in a DataFrame so each column is labelled with its token.
df_dtm = pd.DataFrame(dtm.toarray(), columns=vocab)
df_dtm

Unnamed: 0,거리두기,거리두기와,문의입니다,버스,상생지원금,선별진료소,승강장,안내입니다,요금,운행시간,운행시간과,위치,지하철,코로나,택시,터미널
0,0,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0
1,0,0,1,0,0,0,0,0,1,0,1,0,2,0,0,0
2,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0
3,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1
6,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
7,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0


In [29]:
# Check how many times each token appears in total across all documents
# (column-wise sum of the DTM).

df_dtm.sum()

거리두기     1
거리두기와    1
문의입니다    6
버스       2
상생지원금    1
선별진료소    1
승강장      2
안내입니다    2
요금       1
운행시간     1
운행시간과    1
위치       1
지하철      3
코로나      4
택시       1
터미널      1
dtype: int64

# DTM 함수 만들기

In [41]:
def display_dtm(cvect, corpus):
    """Fit a vectorizer on a corpus and return the resulting DTM as a DataFrame.

    Args:
        cvect: Vectorizer object providing ``fit_transform`` and
            ``get_feature_names_out`` (for example a CountVectorizer or
            TfidfVectorizer instance). It is fitted by this call.
        corpus: Collection of documents passed to ``cvect.fit_transform``.

    Returns:
        A pandas DataFrame built from the dense form of the fitted matrix,
        with the vectorizer's feature names as column labels.
    """
    matrix = cvect.fit_transform(corpus)
    return pd.DataFrame(
        data=matrix.toarray(),
        columns=cvect.get_feature_names_out(),
    )


# N-grams
- 연속된 토큰을 몇 개씩 묶어 하나의 피처로 사용할지 지정하여 DTM 생성
- ngram_range=(min, max): 묶을 토큰 개수의 최솟값과 최댓값 설정 (양 끝 값 포함, inclusive)

In [43]:
# Use sequences of 2 to 3 consecutive tokens (bigrams and trigrams) as features.
cvect = CountVectorizer(ngram_range=(2,3))
display_dtm(cvect, corpus)

Unnamed: 0,거리두기 안내입니다,거리두기와 코로나,거리두기와 코로나 상생지원금,버스 운행시간,버스 운행시간 문의입니다,버스 터미널,버스 터미널 위치,상생지원금 문의입니다,선별진료소 문의입니다,승강장 문의입니다,...,코로나 거리두기와,코로나 거리두기와 코로나,코로나 상생지원금,코로나 상생지원금 문의입니다,코로나 선별진료소,코로나 선별진료소 문의입니다,택시 승강장,택시 승강장 문의입니다,터미널 위치,터미널 위치 안내입니다
0,0,1,1,0,0,0,0,1,0,0,...,1,1,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
4,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0


In [44]:

# min_df=2: keep only tokens that occur in at least 2 documents.
cvect = CountVectorizer(min_df=2)
display_dtm(cvect, corpus)

Unnamed: 0,문의입니다,버스,승강장,안내입니다,지하철,코로나
0,1,0,0,0,0,2
1,1,0,0,0,2,0
2,1,0,1,0,1,0
3,1,0,0,0,0,1
4,1,1,0,0,0,0
5,0,1,0,1,0,0
6,0,0,0,1,0,1
7,1,0,1,0,0,0



# TF-IDF
- TF-IDF를 만들 수 있는 TfidfVectorizer 객체 생성
- 먼저 기본 설정으로 TF-IDF 행렬을 확인한 뒤, word 단위로 1~2개씩 묶고(ngram_range=(1,2)) 토큰을 최대 7개로 제한(max_features=7)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
tfidfvect = TfidfVectorizer()

In [47]:
# Reuse display_dtm to show the TF-IDF weights as a DataFrame.
display_dtm(tfidfvect, corpus)

Unnamed: 0,거리두기,거리두기와,문의입니다,버스,상생지원금,선별진료소,승강장,안내입니다,요금,운행시간,운행시간과,위치,지하철,코로나,택시,터미널
0,0.0,0.479919,0.239821,0.0,0.479919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.694148,0.0,0.0
1,0.0,0.0,0.222166,0.0,0.0,0.0,0.0,0.0,0.444589,0.0,0.444589,0.0,0.7452,0.0,0.0,0.0
2,0.0,0.0,0.3885,0.0,0.0,0.0,0.651563,0.0,0.0,0.0,0.0,0.0,0.651563,0.0,0.0,0.0
3,0.0,0.0,0.375318,0.0,0.0,0.75107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.543168,0.0,0.0
4,0.0,0.0,0.357659,0.599839,0.0,0.0,0.0,0.0,0.0,0.715732,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.454195,0.0,0.0,0.0,0.454195,0.0,0.0,0.0,0.541948,0.0,0.0,0.0,0.541948
6,0.670344,0.0,0.0,0.0,0.0,0.0,0.0,0.561801,0.0,0.0,0.0,0.0,0.0,0.484788,0.0,0.0
7,0.0,0.0,0.357659,0.0,0.0,0.0,0.599839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.715732,0.0


In [48]:
# Word-level tokens, unigrams and bigrams, vocabulary capped at 7 features.
tfidfvect = TfidfVectorizer(analyzer='word',
                            ngram_range=(1,2),
                            max_features=7)

display_dtm(tfidfvect, corpus)

Unnamed: 0,문의입니다,버스,승강장,승강장 문의입니다,안내입니다,지하철,코로나
0,0.32655,0.0,0.0,0.0,0.0,0.0,0.94518
1,0.285703,0.0,0.0,0.0,0.0,0.958318,0.0
2,0.325502,0.0,0.545908,0.545908,0.0,0.545908,0.0
3,0.568471,0.0,0.0,0.0,0.0,0.0,0.822704
4,0.512131,0.858908,0.0,0.0,0.0,0.0,0.0
5,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0
6,0.0,0.0,0.0,0.0,0.757092,0.0,0.653308
7,0.3885,0.0,0.651563,0.651563,0.0,0.0,0.0
