In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from kiwipiepy import Kiwi

tqdm.pandas()

In [2]:
def cleansing(body, mode=None):
    """Strip Yonhap-news markup and boilerplate from an article body.

    Parameters
    ----------
    body : str or numpy.ndarray
        Raw article text.  An ``ndarray`` is not treated as text: its values
        are rounded to 5 decimals and returned as a plain list.
    mode : {None, 'meta', 'sum', 'quot'}, optional
        * ``'meta'`` - only remove YNAOBJECT/table markup and return the text
          starting at the ``(...연합뉴스) ... =`` byline (the text is returned
          as-is when no byline is found).
        * ``'sum'``  - cut everything up to and including the byline, then
          run the full clean-up.
        * ``'quot'`` - run the full clean-up and return the list of cleaned
          paragraphs.
        * any other value - run the full clean-up and return the paragraphs
          joined by single spaces.

    Returns
    -------
    list or str
        A list for ``ndarray`` input and for ``mode='quot'``; otherwise a str.

    Notes
    -----
    After the clean-up only spaces, ASCII letters, Hangul syllables and ``+``
    survive.  Paragraphs are delimited by ``'\\r\\n'``; a trailing paragraph
    containing ``' 기자'`` is treated as a reporter signature and dropped,
    unless it is the only paragraph.
    """
    if isinstance(body, np.ndarray):
        # Round via a Python list so the rounding happens in float64.
        return np.round(body.tolist(), 5).tolist()

    flags = re.I | re.S

    body = re.sub(r'<YNAOBJECT.*?/YNAOBJECT>', '', body, flags=flags)  # YNAOBJECT tags
    body = re.sub(r'<table.*?/table>', '', body, flags=flags)          # table tags

    byline_pattern = r"\(.*?연합뉴스\).*?="
    if mode == 'meta':
        byline = re.search(byline_pattern, body)
        return body[byline.start():] if byline else body
    if mode == 'sum':
        byline = re.search(byline_pattern, body)
        if byline:
            body = body[byline.end():]

    # Applied strictly in this order.  (Removal of monetary amounts such as
    # "3억5천만원" was tried in an earlier revision and is intentionally absent.)
    substitutions = (
        (r'\([^)]*\)', ''),                # parenthesised text
        (r'\[.*\]|\s-\s.*', ''),           # bracketed text / everything after " - "
        (r'\<.*\>|\s-\s.*', ''),           # angle-bracketed text / everything after " - "
        ('·', ' '),
        ('-', ' '),
        # NOTE(review): the two byline patterns below cannot match any more,
        # because every "(...)" pair was already removed above.
        (r'\(.*?연합뉴스\).*?=', ''),      # reporter name and affiliation
        (r'\(서울.*?연합뉴스\)', ''),      # reporter affiliation
        # E-mail addresses go first: stripping the domain beforehand would
        # leave the reporter's mailbox name ("abc@") behind.
        (r'([a-zA-Z0-9\_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', ''),
        (r'yna\.co\.kr', ''),              # Yonhap URL
        (r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', ''),  # URLs
        (r'전문보기.*?\n', ''),            # "read full text" lines
        (r'&lt;.*?&gt;&gt;', ''),          # notices closed by a double &gt;
        (r'&lt;.*?&gt;', ''),              # notices closed by a single &gt;
        (r'&apos;', "'"),
        (r'&amp;', "&"),
        (r'-.*?_.*?\(끝\)', ''),           # stop words specific to attribute 9001887
        # NOTE(review): the character class closes after "☎", so this only
        # matches one of those symbols followed by the literal tail.  Left
        # unchanged; the final filter below blanks these symbols anyway.
        ('[#@*※|》■▲▼◀▶◇☎]-<>=+·↑', ''),
        ('…', ''),
        (r'\(끝\)', ''),                   # end-of-article tag
        ('[""]', ''),
        # Keep letters, Hangul and "+"; line breaks must survive this step so
        # the paragraph split below still has something to split on.
        (r'[^ A-Za-z가-힣+\r\n]', ' '),
    )
    for pattern, replacement in substitutions:
        body = re.sub(pattern, replacement, body, flags=flags)

    paragraphs = body.split('\r\n')
    # Drop a trailing reporter signature, but never wipe a one-paragraph text.
    if len(paragraphs) > 1 and ' 기자' in paragraphs[-1]:
        del paragraphs[-1]

    cleaned = []
    for paragraph in paragraphs:
        paragraph = re.sub(r'\[.+?\]', '', paragraph, flags=flags)
        paragraph = ' '.join(paragraph.split())  # collapse runs of whitespace
        if paragraph:
            cleaned.append(paragraph)

    if mode == 'quot':
        return cleaned
    return ' '.join(cleaned)

In [3]:
def kiwi_extract(data, kiwi):
    """Run Kiwi's new-word extraction over ``data`` and return it as a table.

    Every item of ``data`` is written with ``str()`` - without any separator -
    to ``new_dict.txt`` in the working directory; the file is then read back
    line by line and those lines are passed to ``kiwi.extract_words``.

    Parameters
    ----------
    data : iterable
        Items to write; line boundaries come only from newlines inside them.
    kiwi : object
        Analyzer exposing ``extract_words(inputs, min_cnt, max_word_len,
        min_score)``.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built directly from the ``extract_words`` result.
    """
    corpus_path = 'new_dict.txt'

    # Context managers close both handles even when writing or reading fails.
    with open(corpus_path, 'w', encoding='utf-8-sig') as corpus:
        corpus.writelines(str(item) for item in data)

    with open(corpus_path, encoding='utf-8-sig') as corpus:
        inputs = corpus.readlines()

    result = kiwi.extract_words(inputs, min_cnt=10, max_word_len=10, min_score=0.25)
    return pd.DataFrame(result)

In [4]:
def kiwi_add(data, kiwi, tag='NNP'):
    """Register every row of ``data`` as a user word in ``kiwi``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'word'`` and ``'finscore'``.  The frame is
        only read; no column is added to it.
    kiwi : object
        Analyzer exposing ``add_user_word(word, tag, score)``.
    tag : str, optional
        Part-of-speech tag given to every word (default ``'NNP'``).

    Returns
    -------
    list
        The return value of ``add_user_word`` for each row, in row order.
    """
    # Iterate by position so frames with a filtered or non-default index work.
    return [
        kiwi.add_user_word(word, tag, score)
        for word, score in zip(data['word'], data['finscore'])
    ]

In [5]:
def addword(data, kiwi):
    """Cleanse a text, extract candidate words and register them in ``kiwi``.

    Steps: ``cleansing(data)`` -> ``kiwi_extract(..., kiwi)`` -> save the
    extracted table to ``extract.txt`` (CSV, UTF-8 with BOM) ->
    ``kiwi_add(..., kiwi)``.

    Parameters
    ----------
    data : str
        Raw text handed to ``cleansing``.
    kiwi : object
        Analyzer forwarded to ``kiwi_extract`` and ``kiwi_add``.

    Returns
    -------
    pandas.DataFrame
        The extracted words with the columns ``word``, ``finscore``,
        ``count``, ``NNPscore`` plus ``added``, the per-word result of
        ``kiwi_add``.
    """
    # Same names this notebook gives to the four extraction columns elsewhere.
    columns = ['word', 'finscore', 'count', 'NNPscore']

    cleaned = cleansing(data)
    words = kiwi_extract(cleaned, kiwi)
    # rename() labels the positional columns; reindex() guarantees the named
    # columns exist even when nothing was extracted.
    words = words.rename(columns=dict(enumerate(columns))).reindex(columns=columns)

    words.to_csv('extract.txt', index=False, encoding='utf-8-sig')

    # Pass a copy so the returned table holds only the extraction columns,
    # whatever kiwi_add does to the frame it receives.
    added = kiwi_add(words.copy(), kiwi)
    return words.assign(added=added)

In [6]:
# Analyzer instance shared by every later cell.
kiwi = Kiwi(
    num_workers=0,
    model_type='sbg',
)

In [7]:
# Word table to be registered as user words (UTF-8 with BOM).
df = pd.read_csv(
    'result_day_TB_7.csv',
    encoding='utf-8-sig',
)

In [8]:
# Replace the positional CSV headers with descriptive column names.
df = df.rename(
    columns={
        '0': 'word',
        '1': 'finscore',
        '2': 'count',
        '3': 'NNPscore',
    },
)

In [9]:
# Register every row of the word table; the cell shows one result flag per row.
kiwi_add(data=df, kiwi=kiwi)

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [10]:
# Finalize the analyzer after the user words were added.
# NOTE(review): recent kiwipiepy releases may no longer need this call - confirm against the installed version.
kiwi.prepare()

In [11]:
# Sanity check: tokenize a sample sentence and inspect the tags given to its proper nouns.
kiwi.tokenize('투모로우바이투게더가 선별진료소에서 젤렌스키를 만났다')

[Token(form='투모로우바이투게더', tag='NNP', start=0, len=9),
 Token(form='가', tag='JKS', start=9, len=1),
 Token(form='선별진료소', tag='NNP', start=11, len=5),
 Token(form='에서', tag='JKB', start=16, len=2),
 Token(form='젤렌스키', tag='NNP', start=19, len=4),
 Token(form='를', tag='JKO', start=23, len=1),
 Token(form='만나', tag='VV', start=25, len=2),
 Token(form='었', tag='EP', start=26, len=1),
 Token(form='다', tag='EF', start=27, len=1)]

In [12]:
# Texts to analyse; the 'TB' column is what the later cells process.
df1 = pd.read_csv(
    'cleansing_TB.csv',
)

In [13]:
def morph_analysis(x):
    """Split ``x`` into sentences with the shared ``kiwi`` analyzer.

    Tokens are kept for each sentence (``return_tokens=True``) and
    ``normalize_coda=True`` is passed through.  Anything that is not exactly
    a ``str`` yields ``None``.
    """
    if type(x) is not str:
        return None
    return kiwi.split_into_sents(text=x, return_tokens=True, normalize_coda=True)

In [14]:
# Row count before duplicate rows are removed.
len(df1)

917057

In [15]:
# Drop duplicate rows and renumber from 0.  drop=True discards the old index
# directly instead of materialising it as an 'index' column and deleting it.
df1 = df1.drop_duplicates().reset_index(drop=True)

In [16]:
# Row count after duplicate rows were removed.
len(df1)

592537

In [17]:
# First chunk: sentence-split and tokenize the first 250000 texts (with a progress bar).
first_chunk = df1['TB'].head(250000)
clean_morpherme1 = first_chunk.progress_apply(morph_analysis)

100%|██████████| 250000/250000 [49:31<00:00, 84.14it/s]  


In [18]:
# Second chunk: every text from position 250000 on.  The open-ended slice
# avoids a hard-coded row count that would silently drop rows if df1 grows.
clean_morpherme2 = df1['TB'].iloc[250000:].progress_apply(morph_analysis)

100%|██████████| 342537/342537 [1:09:19<00:00, 82.35it/s]  


In [19]:
from collections import Counter
# Major part-of-speech tags that get counted, and the predicate (용언) subset.
# 용언품사 is not used by pos_count itself.
주요품사 = ['NNG', 'NNP', 'VV', 'VA', 'XR', 'SL']
용언품사 = ['VV', 'VA']


def pos_count(df, col, n=1000000):
    """Count the major-POS tokens of one column and keep the ``n`` most frequent.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column ``col`` holds, per row, a list of token objects
        with ``form`` and ``tag`` attributes.
    col : str
        Name of the token-list column.
    n : int or None, optional
        Maximum number of distinct ``(form, tag)`` pairs to keep, most
        frequent first.  ``None`` keeps all of them.

    Returns
    -------
    collections.Counter
        Mapping ``(form, tag) -> occurrences`` restricted to tags listed in
        ``주요품사``.  Rows whose cell is not a list/tuple (``None``, NaN, ...)
        are skipped.
    """
    totals = Counter()
    if len(df) == 0:
        # An empty table has nothing to count, whether or not ``col`` exists.
        return totals

    for tokens in df[col]:
        if not isinstance(tokens, (list, tuple)):
            continue  # missing analysis result
        totals.update(
            (token.form, token.tag) for token in tokens if token.tag in 주요품사
        )

    return Counter(dict(totals.most_common(n)))

In [20]:
# Wrap the first chunk's results in a DataFrame so both chunks can be concatenated.
clean_morpherme1 = pd.DataFrame(clean_morpherme1)

In [21]:
# Wrap the second chunk's results in a DataFrame so both chunks can be concatenated.
clean_morpherme2 = pd.DataFrame(clean_morpherme2)

In [22]:
# Stack both chunks into one table, first chunk on top.
df_all = pd.concat(
    objs=[clean_morpherme1, clean_morpherme2],
)

In [1]:
# Save the analysis results without the index column.
# NOTE(review): the recorded run of this cell (execution count 1) raised NameError
# because df_all did not exist in that kernel session - run the cells above first.
# NOTE(review): CSV stores each cell as text, so the token objects presumably cannot
# be restored from this file as objects - confirm before relying on it as a cache.
df_all.to_csv('Token.csv',encoding='utf-8',index=False)

NameError: name 'df_all' is not defined

In [73]:
# Count major-POS tokens over the first three documents.  The counts are
# accumulated; plain assignment inside the loop kept only the last document.
a = Counter()
for i in range(3):
    a.update(pos_count(pd.DataFrame(df_all['TB'][i]), 'tokens'))

In [None]:
# Export the major-vocabulary frequencies of the first document.
# pos_count expects a DataFrame plus the name of its token column and returns
# a Counter; writing the CSV is done here.  Predicate forms get a trailing
# "다" appended, and the header row is 형태소,품사,개수.
first_doc_counts = pos_count(pd.DataFrame(df_all['TB'][0]), 'tokens')
pd.DataFrame(
    [
        (form + '다' if tag in 용언품사 else form, tag, count)
        for (form, tag), count in first_doc_counts.most_common()
    ],
    columns=['형태소', '품사', '개수'],
).to_csv('./주요어휘빈도.csv', index=False, encoding='utf-8-sig')