In [None]:
import re
import glob
import json
import string
from datetime import datetime
from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd
import dask.dataframe as dd
import dask.array as da
# Switch to dask once the DataFrame becomes too heavy for pandas
import regex # import re.. slow
#!python -m spacy download en_core_web_sm
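# The spaCy imports are commented out because of a wasabi version error in the current spaCy release.
# Reference: https://github.com/explosion/spaCy/issues/11236
# A workaround appears to exist, but replacing spaCy altogether is probably the better option.
# TODO: the sentencizer step needs to be switched to a different implementation.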
# import spacy # too slow...
# from spacy.lang.en import English # updated
from tqdm.notebook import tqdm

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# data = DataFrame with columns such as filing date, ticker, text, coname, ...
# %%time
# data = None
# data['vacant'] = data['content'].apply(lambda x: 1 if "NoSuchKey" in x or "getElementsByTagName" in x else 0)
# print(Counter(data['vacant']))
# data = data[(data['vacant']!=1)].drop(['vacant'], axis = 1).reset_index(drop = True)

# Preprocess Rules

In [None]:
# Patterns whose matches preprocess() deletes outright. They are applied in list order,
# case-insensitively, BEFORE the whitespace normalisation in repl_rules.
rmv_rules = [
  r'(_){2,}', r'(=){2,}', r'(-){2,}', r'(—){2,}', # runs of separator characters (rules, underlines)
  r'(Table of Contents ){2,}', # "Table of Contents" repeated on every page
  # Attachment file extensions. The dot is escaped: a bare "." matches any character, so
  # "(.htm)" used to cut "ghtm" out of words such as "nightmare". "l?" removes ".html" whole
  # instead of leaving a stray "l" behind.
  r'(\.html?)', r'(\.txt)',
  r'([0-9]{10,10} )', # 10-digit numbers followed by a space
  # Three or more whitespace-separated YYYY-MM-DD dates in a row. The separator is part of the
  # repeated group (as in the legacy rule further down the notebook); without it the pattern
  # only matched dates glued together with nothing in between.
  r'(\s+[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}){3,}',
  r'((INDEX)( [0-9][0-9]){2,})', r'((INDEX)( [0-9][0-9][0-9]){2,})', # "INDEX" followed by runs of 2- or 3-digit numbers
  # f'({ticker_list[i]}:[A-Za-z0-9]+)', # ticker removal -- needs rework; disabled for now
 ]

# (pattern, replacement) pairs that preprocess() applies in list order, case-insensitively.
repl_rules = [
  # Re-join "ITEM" headings that were split as "I TEM". The word boundaries stop the
  # case-insensitive match from firing inside ordinary text ("I temporarily", "semi temporary");
  # the look-ahead still lets "I TEMS" become "ITEMS".
  (r"\bI TEM(?=S?\b)", "ITEM"),
  # In the data provided by Prof. Bill McDonald the "UNITED STATES" of "UNITED STATES SECURITIES..."
  # is often missing. Presumably it is dropped when the original document has a line break
  # between the sentences; this still has to be handled in repl_rules.
  # The replacements are padded with spaces so the header is not glued to the neighbouring words
  # (the pattern swallows the surrounding whitespace); the final rules collapse the padding.
  (r"[\s]*(UNITED STATES SECURITIES AND EXCHANGE COMMISSION)[\s]*", " UNITED STATES SECURITIES AND EXCHANGE COMMISSION "),
  # (r"[\s]*(SECURITIES AND EXCHANGE COMMISSION)[\s]*", "UNITED STATES SECURITIES AND EXCHANGE COMMISSION") # strips too much and produces wrong text; revise after checking the raw data
  # TODO: this part needs revision.
  (r"[\s]*(Table of Contents UNITED STATES FORM)[\s]*", " UNITED STATES SECURITIES AND EXCHANGE COMMISSION "),
  (r"[\s]*(UNITED STATES Table of Contents)[\s]*", " UNITED STATES SECURITIES AND EXCHANGE COMMISSION "),
  # TODO: this part needs revision; txt data and HTML-parsed data differ here.
  # General text cleaning from here on.
  (r'[\n\t]', ' '), # newlines and tabs -> space
  (r'[\s]+', ' '), # collapse runs of whitespace
  ]

# Preprocessing, Tokenization, Lemmatization

In [None]:
def preprocess(text):
  """Clean one raw filing text with the module-level regex rules.

  Every pattern in ``rmv_rules`` is deleted first; afterwards each
  ``(pattern, replacement)`` pair in ``repl_rules`` is substituted, in list order.
  All matching is case-insensitive.

  Returns the cleaned text with leading and trailing whitespace stripped.
  """
  cleaned = text
  for removal in rmv_rules:
    cleaned = regex.sub(removal, '', cleaned, flags=regex.IGNORECASE)
  for target, substitute in repl_rules:
    cleaned = regex.sub(target, substitute, cleaned, flags=regex.IGNORECASE)
  return cleaned.strip()

def remove_stopwords(tokens, stopword_list=None):
    """All-in-one noise removal: keep only meaningful, lower-cased alphabetic tokens.

    For every token, all characters other than ASCII letters are stripped and the
    result is lower-cased. A token is dropped when

    * the token itself (compared case-insensitively) is a stop word,
    * fewer than three letters remain after stripping (this also removes pure
      punctuation and numbers), or
    * the stripped form is a stop word.

    Stop-word matching is case-insensitive, so capitalised stop words such as
    "The" are removed as well instead of slipping through as "the".

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stopword_list : iterable of str, optional
        Stop words to use. Defaults to the module-level ``stop_words``.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    source = stop_words if stopword_list is None else stopword_list
    # A set gives O(1) membership tests; the module-level value is a list.
    stops = {word.lower() for word in source}

    cleaned_tokens = []
    for token in tokens:
        if token.lower() in stops:
            continue
        # Remove everything that is not an ASCII letter.
        letters = re.sub(r"[^a-zA-Z]", '', token).lower()
        if len(letters) > 2 and letters not in stops:
            cleaned_tokens.append(letters)
    return cleaned_tokens

# English stop-word list used by remove_stopwords(). This runs before the notebook's
# nltk.download(...) cell, so on an environment where the corpus has not been downloaded yet
# the lookup raises LookupError; fetch the corpus on demand and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

# The reduce_len parameter allows a maximum of 3 consecutive repeating characters and trims the rest.
# For example, it transforms 'Helloooooooooo' into 'Hellooo'.
tk = TweetTokenizer(reduce_len=True)

def lemmatize_sentence(tokens):
    """Lemmatize a list of tokens with WordNet, guided by their part-of-speech tags.

    The tokens are tagged with ``pos_tag`` and every tag is mapped onto the
    part-of-speech codes ``WordNetLemmatizer`` understands: tags starting with
    ``NN`` become ``'n'`` (noun), tags starting with ``VB`` become ``'v'`` (verb)
    and every other tag is lemmatized as ``'a'`` (adjective).

    Returns the list of lemmas, one per input token, in the original order.
    """
    wordnet = WordNetLemmatizer()

    def _wordnet_pos(treebank_tag):
        # Translate a pos_tag tag into the WordNet part-of-speech code.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(term, _wordnet_pos(treebank_tag))
        for term, treebank_tag in pos_tag(tokens)
    ]

In [None]:
import nltk

# NLTK data used by this notebook: the stop-word corpus, WordNet (+ omw-1.4) for the
# lemmatizer and the perceptron model behind pos_tag.
# NOTE: newer NLTK releases look the English tagger up under
# 'averaged_perceptron_tagger_eng'; the older name is kept for earlier releases.
NLTK_RESOURCES = (
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Smoke test: run the whole pipeline on a single sample filing.

sample_path = r'C:\Users\wonhyeong\workings\data\10X\sample.txt'
# Read through a context manager so the file handle is closed again.
with open(sample_path, 'r', encoding='utf-8') as f:
  text = f.read()

# Drop everything before the first "Table of Contents". str.find returns -1 when the marker
# is absent, and slicing with -1 would keep only the last character, so in that case the
# full text is kept.
toc_start = text.find('Table of Contents')
if toc_start != -1:
  text = text[toc_start:]

prep = preprocess(text)
token = tk.tokenize(prep)
token = remove_stopwords(token)
lemma = lemmatize_sentence(token)

with open(r'C:\Users\wonhyeong\workings\data\10X\sample_prep.txt', 'w', encoding='utf-8') as f:
  f.write(prep)

# Sentence

In [None]:
# Split every document in data['content'] into sentences with spaCy's sentencizer pipe.
# NOTE(review): `English` is not in scope as the notebook stands -- its import
# (`from spacy.lang.en import English`) is commented out in the import cell, so this cell
# raises NameError until that import is restored or the sentence splitter is replaced.
# NOTE(review): `data` is only assigned in commented-out code above; it has to be loaded
# before this cell can run.
nlp = English()
nlp.add_pipe('sentencizer') # updated
# This uses the rule-based method, rather than the statistical model to split sentences.
# For my use case, using en_core_web_sm worked better but.. too slow...

# Store the sentencizer output for each document as a list of stripped sentence strings
data['sentence'] = data['content'].apply(lambda x: [sent.text.strip() for sent in nlp(x).sents])
  

# Lists are serialised with ';'.join. Semicolons occur frequently in the text itself (sometimes within a single sentence), so a different delimiter is probably needed.
# data['token'] = data['token'].apply(lambda x: ";".join(x))
# data['lemma'] = data['lemma'].apply(lambda x: ";".join(x))
# data['sentence'] = data['sentence'].apply(lambda x: ";".join(x))
# data['filingDate'] = pd.to_datetime(data['filingDate'], format='%Y-%m-%d')
# data.to_csv("data.csv", encoding = "utf-8", index = False)

## 기존 코드

In [None]:
%load_ext line_profiler
%lprun
# %%time 이후 여러 셀 실행 --> lprun 사용, 셀 통합
if __name__ == "__main__": 
  data['content'] = data['content'].apply(preprocess)
  data['token'] = data['content'].apply(lambda x: tk.tokenize(x))
  data['token'] = data['token'].apply(lambda x: remove_stopwords(x))
  data['lemma'] = data['token'].apply(lambda x: lemmatize_sentence(x))
  data.to_csv("data.csv", encoding = "utf-8", index = False)

In [None]:
# temp1 = 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION'.lower()
# temp2 = 'SECURITIES AND EXCHANGE COMMISSION'.lower()

# ticker_list = []
# temp_list = list(data['ticker'].drop_duplicates())
# for k in temp_list:
#     ticker_list.extend(k.split(";"))
# ticker_list = [k.lower() for k in ticker_list]
# ticker_list.extend(['iso4217','us-gaap', 'xbrli', 'utr', 'srt', 'country', 'jaws'])

In [None]:
%%time
# Legacy regex-based preprocessing: every step rewrites data['content'] in place, so the
# statements depend on their order.
# regex-based preprocessing
# needs to organized as class / function
# NOTE(review): several steps below are written as x.replace(x, <new>); replacing the whole
# string x by <new> is the same as using <new> directly.

# Re-join split "ITEM" headings (case-sensitive), then turn newlines/tabs into spaces.
data['content'] = data['content'].apply(lambda x: x.replace("I TEM", "ITEM"))
data['content'] = data['content'].apply(lambda x: x.replace(x, regex.sub(r'[\n\t]', ' ', x).strip()))

# Remove runs of two or more '_', '=', '-' or em-dash characters.
data['content'] = data['content'].apply(lambda x: regex.sub(r'(_){2,}', '', x))
data['content'] = data['content'].apply(lambda x: regex.sub(r'(=){2,}', '', x))
data['content'] = data['content'].apply(lambda x: regex.sub(r'(-){2,}', '', x))
data['content'] = data['content'].apply(lambda x: regex.sub(r'(—){2,}', '', x))

###
# Rewrite two garbled header variants to the full SEC header line (case-sensitive).
data['content'] = data['content'].apply(lambda x : x.replace("Table of Contents UNITED STATES FORM", "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"))
data['content'] = data['content'].apply(lambda x : x.replace("UNITED STATES Table of Contents", "UNITED STATES SECURITIES AND EXCHANGE COMMISSION"))
# united states securities and exchange commission <<-- instead of lower-casing, use regex.sub with flags=re.IGNORECASE
# data['content'] = data['content'].apply(lambda x: x[x.lower().find(temp1):].strip() if temp1 in x.lower() else (x[x.lower().find(temp2):].strip() if temp2 in x.lower() else x))
###

# Remove "Table of Contents " when it is repeated back to back.
data['content'] = data['content'].apply(lambda x: regex.sub(r'(Table of Contents ){2,}', '', x))

# This seems to take a long time?
# Why loop over every ticker? Needs further explanation.
# e.g. could this be run against data['symbol'] instead?
# NOTE(review): ticker_list is only built in the commented-out cell above, so this loop raises
# NameError unless that cell is restored. The ticker is also interpolated into the pattern
# unescaped.
for i in tqdm(range(0, len(ticker_list))):
    data['content'] = data['content'].apply(lambda x: x.replace(x, regex.sub(f'({ticker_list[i]}:[A-Za-z0-9]+)', '', x).strip()))
# NOTE(review): the dot is unescaped in the next two patterns, so they match ANY character
# followed by "htm" / "txt", not just a literal ".htm" / ".txt".
data['content'] = data['content'].apply(lambda x: x.replace(x, regex.sub(r'(.htm)', '', x).strip()))
data['content'] = data['content'].apply(lambda x: x.replace(x, regex.sub(r'(.txt)', '', x).strip()))

# Collapse runs of whitespace into a single space.
data['content'] = data['content'].apply(lambda x: x.replace(x, regex.sub(r"\s+", " ", x)))

# Remove 10-digit numbers followed by a space, and runs of three or more space-separated
# YYYY-MM-DD dates.
data['content'] = data['content'].apply(lambda x: regex.sub(r'([0-9]{10,10} )', '', x).strip())
data['content'] = data['content'].apply(lambda x: regex.sub(r'( [0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}){3,}', '', x).strip())

# Intent unclear...
# Cuts the text off at the garbled phrase below when it occurs within the last 500 characters.
# NOTE(review): find() returns the FIRST occurrence anywhere in the text, which need not be the
# one inside the last 500 characters that the condition tested.
temp = "Pursu a nt t o t he".lower()
data['content'] = data['content'].apply(lambda x: x[:x.lower().find(temp)].strip() if temp in x.lower()[-500:] else x)
# Remove "INDEX" followed by runs of 2-digit or 3-digit numbers.
data['content'] = data['content'].apply(lambda x: regex.sub(r'((INDEX)( [0-9][0-9]){2,})', '', x).strip())
data['content'] = data['content'].apply(lambda x: regex.sub(r'((INDEX)( [0-9][0-9][0-9]){2,})', '', x).strip())