출처: https://blog.breezymind.com/2018/03/02/sklearn-feature_extraction-text-2/

In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
np.random.seed(0)

from konlpy.tag import Mecab
mecab = Mecab()


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# tokenizer : 문장에서 색인어 추출을 위해 명사,동사,알파벳,숫자 정도의 단어만 뽑아서 normalization, stemming 처리하도록 함
def tokenizer(raw, pos=["NNG","NNP"], stopword=['수','퀄리티','도시','분','전문','스타','년','원',\
                       '월','화','수','목','금','시','앤','일','그램','문'] ):
    return [
        word for word, tag in mecab.pos(raw)
            if len(word) > 1 and tag in pos and word not in stopword
        ]

# 테스트 문장
df = pd.read_csv("word2vec_wrangling.csv")
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [4]:
import re

def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', text)
    # 특수문자 제거
    # 특수문자나 이모티콘 등은 때로는 의미를 갖기도 하지만 여기에서는 제거했습니다.
    # text = re.sub('[?.,;:|\)*~`’!^\-_+<>@\#$%&-=#}※]', '', text)
    # 한글, 영문, 숫자만 남기고 모두 제거하도록 합니다.
    # text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    # 한글, 영문만 남기고 모두 제거하도록 합니다.
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', text)
    return text

In [5]:
%time rawdata = df['Content_txt'].apply(preprocessing)

CPU times: user 961 ms, sys: 26.8 ms, total: 988 ms
Wall time: 1.16 s


In [6]:
vectorize = CountVectorizer(
    tokenizer=tokenizer, 
    min_df=10    # 예제로 보기 좋게 1번 정도만 노출되는 단어들은 무시하기로 했다
                # min_df = 0.01 : 문서의 1% 미만으로 나타나는 단어 무시
                # min_df = 10 : 문서에 10개 미만으로 나타나는 단어 무시
                # max_df = 0.80 : 문서의 80% 이상에 나타나는 단어 무시
                # max_df = 10 : 10개 이상의 문서에 나타나는 단어 무시
)
 
# 문장에서 노출되는 feature(특징이 될만한 단어) 수를 합한 Document Term Matrix(이하 DTM) 을 리턴한다
X = vectorize.fit_transform(rawdata)
 
print(
    'fit_transform, (sentence {}, feature {})'.format(X.shape[0], X.shape[1])
)
# fit_transform, (sentence 5, feature 7)
 
print(type(X))
# <class 'scipy.sparse.csr.csr_matrix'>
 
print(X.toarray())

# [[0, 1, 2, 0, 0, 0, 1],
# [0, 1, 1, 0, 0, 0, 2],
# [1, 0, 0, 2, 1, 1, 0],
# [1, 0, 0, 1, 0, 0, 0],
# [0, 0, 0, 3, 1, 1, 0]]
 
# 문장에서 뽑아낸 feature 들의 배열
features = vectorize.get_feature_names()

fit_transform, (sentence 61, feature 5157)
<class 'scipy.sparse.csr.csr_matrix'>
[[  0   3   0 ...   1   0   0]
 [  1   1   2 ...   7   0   2]
 [  6  21   1 ...  15   0   0]
 ...
 [  4  37   0 ... 158   1   0]
 [  5  26   0 ...   8   0   0]
 [  0   7   0 ...  35   0 195]]


In [7]:
# 박조은 튜토리얼 이어나가려고 함
vectorizer = vectorize
feature_vector = X

In [8]:
feature_vector.shape

(61, 5157)

In [9]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

5157


['가게', '가격', '가구', '가능', '가동', '가드', '가디건', '가람', '가량', '가로']

In [10]:
# 각 리뷰마다 등장하는 단어에 빈도수가 표현됩니다. 0 은 등장하지 않음을 의미합니다.
pd.DataFrame(feature_vector[:10].toarray(), columns=vocab).head()

Unnamed: 0,가게,가격,가구,가능,가동,가드,가디건,가람,가량,가로,...,희생,희열,흰색,히말라야,히어로,히트,히트맨,힐링,힐스,힙합
0,0,3,0,24,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,2,11,0,4,0,1,0,0,...,0,0,2,0,0,0,0,7,0,2
2,6,21,1,124,5,0,0,6,0,3,...,0,2,0,0,0,0,0,15,0,0
3,0,0,0,106,0,0,0,0,0,0,...,1,0,0,0,0,0,0,15,0,0
4,1,47,3,47,0,10,0,0,0,0,...,0,0,4,0,1,1,0,9,0,1


In [29]:
# 위에서 구한 단어벡터를 더하면 단어가 전체에서 등장하는 횟수를 알 수 있습니다.
# 벡터화 된 피처를 확인해 봄
# Bag of words 에 몇 개의 단어가 들어있는지 확인
dist = np.sum(feature_vector, axis=0)
    
df_freq = pd.DataFrame(dist, columns=vocab)
df_freq

Unnamed: 0,가게,가격,가구,가능,가동,가드,가디건,가람,가량,가로,...,희생,희열,흰색,히말라야,히어로,히트,히트맨,힐링,힐스,힙합
0,160,591,67,4726,123,176,26,34,28,33,...,16,19,32,51,67,141,13,2174,26,1254


In [62]:
# 행과 열의 축을 T로 바꿔주고 빈도수로 정렬
df_too_much = df_freq.T.sort_values(by=0, ascending=False).head(50)
df_too_much

Unnamed: 0,0
운동,50774
요가,45629
필라테스,38788
댄스,26248
번지,20921
다이어트,18939
복싱,13744
발레,11933
니스,11664
수업,11573


In [63]:
df_too_much.T

Unnamed: 0,운동,요가,필라테스,댄스,번지,다이어트,복싱,발레,니스,수업,...,개인,부산,스튜디오,시작,기구,자이로,학원,상담,센터,감사
0,50774,45629,38788,26248,20921,18939,13744,11933,11664,11573,...,4148,4110,4072,4026,4023,3968,3950,3899,3763,3743


In [37]:
too_much_list = list(df_too_much.T.columns.values)
print(too_much_list)

['운동', '요가', '필라테스', '댄스', '번지', '다이어트', '복싱', '발레', '니스', '수업', '플라잉', '헬스', '살사', '여자', '피트', '시간', '강사', '일상', '점핑', '취미', '바디', '교육', '오늘', '레슨', '남자', '키즈', '회원', '보드', '맛집', '가능', '교정', '라틴', '여행', '건강', '피닝', '진행', '서핑', '그룹', '플라이', '잠실', '개인', '부산', '스튜디오', '시작', '기구', '자이로', '학원', '상담', '센터', '감사', '피지', '소통', '지도자', '과정', '수련', '무용', '대구', '자격증', '클럽', '수영', '홍대', '볼링', '일산', '트레이닝', '동요', '동호회', '토닉', '사진', '영상', '크로스', '오픈', '사람', '데일리', '방송', '광주', '선생', '자세', '마음', '체형', '문의', '행복', '사랑', '카페', '근력', '동작', '탕가', '클래스', '드럼', '친구', '카톡', '무료', '전화', '스포츠', '공연', '야사', '할인', '하루', '에어로빅', '등록', '준비']


In [38]:
# df_freq_T = df_freq.T.reset_index()
# df_freq_T.columns = ["course", "freq"]
# # 강의명을 토큰 3개로 중복제거하기 위해, 강좌명에서 지식공유자의 이름을 빈문자열로 변경
# df_freq_T["course_find"] = df_freq_T["course"].str.replace("홍정모의", "")
# df_freq_T["course_find"] = df_freq_T["course_find"].apply(lambda x : " ". join(x.split()[:4]))
# df_freq_T.sort_values(["course_find", "freq"], ascending=False).head(10)

TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [15]:
%%time 
feature_tfidf = transformer.fit_transform(feature_vector)
feature_tfidf.shape

CPU times: user 12.7 ms, sys: 1.1 ms, total: 13.8 ms
Wall time: 15 ms


(61, 5157)

In [16]:
feature_tfidf

<61x5157 sparse matrix of type '<class 'numpy.float64'>'
	with 122368 stored elements in Compressed Sparse Row format>

In [17]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector에 TF-IDF 가중치 반영
tfidf_freq = pd.DataFrame(feature_tfidf.toarray(), columns=vocab)
tfidf_freq.head() 

Unnamed: 0,가게,가격,가구,가능,가동,가드,가디건,가람,가량,가로,...,희생,희열,흰색,히말라야,히어로,히트,히트맨,힐링,힐스,힙합
0,0.0,0.00808,0.0,0.058562,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002522,0.0,0.0
1,0.000301,0.000213,0.000922,0.002121,0.0,0.001442,0.0,0.000506,0.0,0.0,...,0.0,0.0,0.00095,0.0,0.0,0.0,0.0,0.001395,0.0,0.000691
2,0.001784,0.004412,0.000455,0.023604,0.001487,0.0,0.0,0.002999,0.0,0.001197,...,0.0,0.000911,0.0,0.0,0.0,0.0,0.0,0.002951,0.0,0.0
3,0.0,0.0,0.0,0.01743,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000418,0.0,0.0,0.0,0.0,0.0,0.0,0.002549,0.0,0.0
4,0.000235,0.007801,0.001079,0.007068,0.0,0.002813,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001482,0.0,0.000315,0.00034,0.0,0.001399,0.0,0.00027


In [39]:
df_tfidf = pd.DataFrame(tfidf_freq.sum())
df_tfidf_top = df_tfidf.sort_values(by=0, ascending=False)
df_tfidf_top.head(50)

Unnamed: 0,0
운동,13.310169
요가,9.245517
필라테스,8.166625
댄스,5.697942
다이어트,4.375272
헬스,3.710771
발레,3.169627
번지,2.869569
플라잉,2.778633
살사,2.659052


In [19]:
df_tfidf_bottom = df_tfidf.sort_values(by=0, ascending=True)
df_tfidf_bottom.head(10)

Unnamed: 0,0
중시,0.004372
태가,0.005114
경쟁력,0.005843
카키,0.0059
단비,0.006057
나뭇잎,0.006114
부영,0.006157
사단법인,0.006307
불빛,0.006309
요양,0.006372


In [64]:
# 중간에 생략되는 단어를 자세히 보고자 할 때
# 제일 쓸모 없는 단어들 뽑아내기

top_list = []
for t in df_tfidf_top.index[:50]:
    top_list.append(t)
print(top_list)

['요가','필라테스','다이어트','']

['운동', '요가', '필라테스', '댄스', '다이어트', '헬스', '발레', '번지', '플라잉', '살사', '니스', '복싱', '수업', '여자', '패들', '아쿠아', '일상', '시간', '보드', '서핑', '수영', '피트', '강사', '바디', '자이로', '오늘', '피닝', '남자', '레슨', '취미', '점핑', '여행', '회원', '건강', '맛집', '라틴', '키즈', '교육', '가능', '크로스', '소통', '바이크', '감사', '기구', '부산', '토닉', '교정', '스포츠', '그룹', '후프']


In [65]:
list_too_much1 = ['운동', '요가', '필라테스', '댄스', '다이어트', '헬스', '발레', '번지', '플라잉', '살사', '니스', '복싱', '수업', '여자', '패들', '아쿠아', '일상', '시간', '보드', '서핑', '수영', '피트', '강사', '바디', '자이로', '오늘', '피닝', '남자', '레슨', '취미', '점핑', '여행', '회원', '건강', '맛집', '라틴', '키즈', '교육', '가능', '크로스', '소통', '바이크', '감사', '기구', '부산', '토닉', '교정', '스포츠', '그룹', '후프', '진행', '시작', '검도', '볼링', '등산', '수련', '클라이밍', '클럽', '개인', '학원', '사진', '서프', '데일리', '트레이닝', '센터', '스튜디오', '상담', '수영장', '에어로빅', '무용', '드럼', '동요', '피티', '트램', '탕가', '조깅', '공연', '파운드', '야사', '폴린', '광주', '수중', '피지', '동호회', '사람', '킥복싱', '휘트', '대구', '카페', '지도자', '선수', '핫요가', '플라이', '타요', '기부', '하루', '행복', '일산', '사이클', '체형']
list_too_much_tf_idf = ['운동', '요가', '필라테스', '댄스', '번지', '다이어트', '복싱', '발레', '니스', '수업', '플라잉', '헬스', '살사', '여자', '피트', '시간', '강사', '일상', '점핑', '취미', '바디', '교육', '오늘', '레슨', '남자', '키즈', '회원', '보드', '맛집', '가능', '교정', '라틴', '여행', '건강', '피닝', '진행', '서핑', '그룹', '플라이', '잠실', '개인', '부산', '스튜디오', '시작', '기구', '자이로', '학원', '상담', '센터', '감사', '피지', '소통', '지도자', '과정', '수련', '무용', '대구', '자격증', '클럽', '수영', '홍대', '볼링', '일산', '트레이닝', '동요', '동호회', '토닉', '사진', '영상', '크로스', '오픈', '사람', '데일리', '방송', '광주', '선생', '자세', '마음', '체형', '문의', '행복', '사랑', '카페', '근력', '동작', '탕가', '클래스', '드럼', '친구', '카톡', '무료', '전화', '스포츠', '공연', '야사', '할인', '하루', '에어로빅', '등록', '준비']
list_too_much_200 = list_too_much1 + list_too_much_tf_idf
bulyongeo_list = list(set(list_too_much_200))
print(bulyongeo_list)

['점핑', '동작', '광주', '가능', '사진', '데일리', '공연', '하루', '카페', '영상', '바디', '일산', '검도', '살사', '대구', '사람', '볼링', '동요', '조깅', '등산', '체형', '클럽', '피지', '휘트', '무료', '자이로', '드럼', '발레', '수업', '폴린', '자격증', '피트', '트램', '홍대', '등록', '준비', '수영장', '센터', '댄스', '패들', '토닉', '요가', '일상', '클라이밍', '필라테스', '회원', '잠실', '여행', '취미', '여자', '킥복싱', '번지', '후프', '학원', '교육', '니스', '사랑', '부산', '스튜디오', '피티', '선생', '플라이', '레슨', '남자', '선수', '강사', '친구', '자세', '서핑', '교정', '클래스', '무용', '행복', '그룹', '수련', '트레이닝', '방송', '플라잉', '바이크', '수영', '키즈', '동호회', '지도자', '오늘', '타요', '개인', '헬스', '할인', '에어로빅', '기부', '상담', '라틴', '시작', '맛집', '아쿠아', '다이어트', '파운드', '카톡', '건강', '오픈', '마음', '수중', '탕가', '감사', '진행', '운동', '복싱', '소통', '문의', '전화', '사이클', '서프', '야사', '크로스', '근력', '기구', '시간', '핫요가', '피닝', '과정', '스포츠', '보드']
