In [1]:
from nltk.tokenize import sent_tokenize
from selenium import webdriver
from ast import literal_eval
import pandas as pd
import numpy as np
import requests
import nltk
import json
import time
import re

In [6]:
# Read the source workbook from its (machine-specific, absolute) location
kospi_filing_xlsx = 'D:\\data\\finance\\financial_statements\\kr\\kospi200_2015_2019.xlsx'
df_kospi_filing_2015_2019 = pd.read_excel(kospi_filing_xlsx)

In [7]:
# Sentence-split every row of the 'Text' column with nltk's tokenizer
# (works great on Korean as well)
raw_text = df_kospi_filing_2015_2019['Text']
token_text = raw_text.apply(sent_tokenize)

# Use extra hand-made parser
def additional_sent_tokenize(text):
    '''
    Split a sentence further on bullet markers that the nltk sentence
    tokenizer does not treat as sentence boundaries.

    The markers are tried in order ('※' first, then '* ') and only the
    first one found in `text` is used for splitting. When neither marker
    is present, the text is returned unchanged as a one-element list.
    '''
    for bullet in ('※', '* '):
        if bullet in text:
            return text.split(bullet)
    return [text]

# Run every nltk sentence through the bullet splitter and flatten the
# resulting pieces into a single list per row of token_text
additional_token_text = [
    [piece
     for sentence in row_sentences
     for piece in additional_sent_tokenize(sentence)]
    for row_sentences in token_text
]

# Select relevant, proper sentences and pre-process
def clean_text(text):
    '''
    Clean the messy raw text
    1) Replace newlines, and the "전자공시시스템 dart.fss.or.kr" footer /
       "Page #" marker (one or two digits) when followed by whitespace,
       with a space
    2) Collapse runs of whitespace into one space and remove white space
       before and after the input text
    '''
    # Raw string literals: the previous plain literals contained the invalid
    # escape sequences \. \d and \s, which newer Python versions warn about.
    # The compiled pattern is equivalent.
    # The second alternative may also match the empty string in front of any
    # whitespace character, which inserts an extra space there; the
    # whitespace-collapsing pass below removes those again.
    text = re.sub(r"\n|(전자공시시스템 dart\.fss\.or\.kr)?(\n)?(Page \d{1,2})?(?=\s)",
                  " ",
                  text)
    text = re.sub(r"\s\s+", " ", text).strip()
    return text

def _split_long_sentence(sentence):
    '''
    Cut an overly long sentence into consecutive, roughly equal chunks.

    More than 5000 characters -> 30 chunks, more than 3000 -> 20 chunks,
    more than 1000 -> 10 chunks; anything shorter is returned as a
    one-element list. Exactly one of these rules applies per sentence.
    '''
    length = len(sentence)
    if length > 5000:
        n_chunks = 30
    elif length > 3000:
        n_chunks = 20
    elif length > 1000:
        n_chunks = 10
    else:
        return [sentence]
    return [sentence[i*length//n_chunks:(i+1)*length//n_chunks] for i in range(n_chunks)]

sent_text = []

for t in additional_token_text:
    relevant_sentences = []

    for s in t:
        # Keep only pieces ending in '니다.'. Note the suffix test runs on the
        # raw piece, i.e. before clean_text() strips surrounding whitespace.
        if s.endswith('니다.'):
            s = clean_text(s)
            s = s.strip()
            # Previously the three length checks were independent `if`s, so a
            # sentence longer than 5000 characters was appended three times
            # (as 30, 20 and 10 chunks) and one longer than 3000 twice.
            relevant_sentences += _split_long_sentence(s)

    sent_text.append(relevant_sentences)

# Save checkpoint
df_kospi_filing_2015_2019['Relevant_Sentences'] = sent_text

In [8]:
# Keep everything except the raw 'Text' column and write the result to CSV
df_kospi_filing_2015_2019_parsed = df_kospi_filing_2015_2019.drop(columns=['Text'])
df_kospi_filing_2015_2019_parsed.to_csv(
    'kospi200_2015_2019_parsed.csv',
    index=False,
    encoding='utf-8',
)

In [6]:
# Reload the parsed CSV. The sentence lists come back as their string
# representation, so each cell is parsed into a Python object again.
df_kospi_filing_2015_2019_parsed = pd.read_csv('kospi200_2015_2019_parsed.csv', encoding='utf-8')
df_kospi_filing_2015_2019_parsed['Relevant_Sentences'] = (
    df_kospi_filing_2015_2019_parsed['Relevant_Sentences'].map(literal_eval)
)

In [7]:
def make_sublist(text, batch_size=5):
    '''
    Group consecutive sentences into batches and join each batch with a
    single space.

    text: list of sentence strings
    batch_size: number of sentences per batch (default 5, the value that
                used to be hard-coded); the last batch may be shorter

    Returns a list of strings, one per batch; an empty input gives [].
    Raises ValueError when batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    # Stepping the start offset by batch_size replaces the former
    # ceil(len/batch_size) batch count and yields the same slices.
    return [' '.join(text[start:start + batch_size])
            for start in range(0, len(text), batch_size)]

# Turn each row's sentence list into space-joined batches via make_sublist
sentence_lists = df_kospi_filing_2015_2019_parsed['Relevant_Sentences']
text_series_sublists = sentence_lists.apply(make_sublist)

In [2]:
# Resume from the translation checkpoint file; the stringified sentence
# lists are parsed back into Python objects cell by cell.
df_kospi_filing_2015_2019_parsed = pd.read_csv('kospi200_2015_2019_translated.csv', encoding='utf-8')
df_kospi_filing_2015_2019_parsed['Relevant_Sentences'] = (
    df_kospi_filing_2015_2019_parsed['Relevant_Sentences'].map(literal_eval)
)
# NOTE(review): this repeats the make_sublist definition from an earlier
# cell, presumably so this cell can run on a fresh kernel. Consider moving
# the function to a shared module so there is a single definition.
def make_sublist(text, batch_size=5):
    '''
    Group consecutive sentences into batches and join each batch with a
    single space.

    text: list of sentence strings
    batch_size: number of sentences per batch (default 5, the value that
                used to be hard-coded); the last batch may be shorter

    Returns a list of strings, one per batch; an empty input gives [].
    Raises ValueError when batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    # Stepping the start offset by batch_size replaces the former
    # ceil(len/batch_size) batch count and yields the same slices.
    return [' '.join(text[start:start + batch_size])
            for start in range(0, len(text), batch_size)]

# Only rows whose 'Translated' cell is still empty are batched for translation
pending_rows = df_kospi_filing_2015_2019_parsed['Translated'].isnull()
text_series_sublists = (
    df_kospi_filing_2015_2019_parsed
    .loc[pending_rows, 'Relevant_Sentences']
    .apply(make_sublist)
)

In [4]:
# Drive Google Translate in a Chrome window: type each sentence batch into the
# source box, read the translation back, and store the joined result per row.
path = "C:\\Users\\sylim2357\\Documents\\chromedriver_win32\\chromedriver.exe"
driver = webdriver.Chrome(path)
# try/finally so that an error or a manual interrupt still saves the rows
# translated so far and shuts the browser session down.
try:
    driver.get('https://translate.google.com/#view=home&op=translate&sl=ko&tl=en')
    elem = driver.find_element_by_id("source")
    # df_kospi_filing_2015_2019_parsed['Translated'] = ''
    # Count processed rows separately from the index label: the labels of the
    # pending rows are sparse (e.g. 512, 514, 2355, ...), so `idx % 20` would
    # checkpoint at irregular intervals.
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for n_done, (idx, text_sublist) in enumerate(text_series_sublists.items(), start=1):
        print(str(idx)+': translating')
        translated_text = []
        for text in text_sublist:
            elem.clear()
            # Randomised pauses between actions
            time.sleep(np.random.randint(50,300)/101)
            elem.send_keys(text)
            # Poll until the result element can be read.
            # NOTE(review): the element might still hold the previous batch's
            # translation if the page has not refreshed yet - TODO confirm.
            while True:
                try:
                    time.sleep(np.random.randint(100,300)/101)
                    translated_text.append(driver.find_element_by_class_name("tlid-translation").text)
                    break
                except Exception:
                    # Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt and made the loop impossible to stop.
                    continue
        print(str(idx)+': translated')
        df_kospi_filing_2015_2019_parsed.loc[idx, 'Translated'] = ' '.join(translated_text)
        if n_done % 20 == 0:
            df_kospi_filing_2015_2019_parsed.to_csv('kospi200_2015_2019_translated.csv', encoding='utf-8', index=False)
finally:
    try:
        df_kospi_filing_2015_2019_parsed.to_csv('kospi200_2015_2019_translated.csv', encoding='utf-8', index=False)
    finally:
        # quit() ends the whole WebDriver session, not just the current window
        driver.quit()

512: translating
512: translated
514: translating
514: translated
2355: translating
2355: translated
2356: translating
2356: translated
2357: translating
2357: translated
2358: translating
2358: translated
2359: translating
2359: translated
2361: translating
2361: translated
2362: translating
2362: translated
2363: translating
2363: translated
2364: translating
2364: translated
2365: translating
2365: translated
2366: translating
2366: translated
2481: translating
2481: translated
2482: translating
2482: translated
2483: translating
2483: translated
2484: translating
2484: translated
2485: translating
2485: translated
2486: translating
2486: translated
2487: translating
2487: translated
2488: translating
2488: translated
2489: translating
2489: translated
2490: translating
2490: translated
2491: translating
2491: translated
2492: translating
2492: translated
2493: translating
2493: translated
2494: translating
2494: translated
2495: translating
2495: translated
2496: translating
2496: 

2702: translated
2703: translating
2703: translated
2704: translating
2704: translated
2705: translating
2705: translated
2706: translating
2706: translated
2707: translating
2707: translated
2708: translating
2708: translated
2709: translating
2709: translated
2710: translating
2710: translated
2711: translating
2711: translated
2712: translating
2712: translated
2713: translating
2713: translated
2714: translating
2714: translated
2715: translating
2715: translated
2716: translating
2716: translated
2717: translating
2717: translated
2718: translating
2718: translated
2719: translating
2719: translated
2720: translating
2720: translated
2721: translating
2721: translated
2722: translating
2722: translated
2723: translating
2723: translated
2724: translating
2724: translated
2725: translating
2725: translated
2726: translating
2726: translated
2727: translating
2727: translated
2728: translating
2728: translated
2729: translating
2729: translated
2730: translating
2730: translated
273

2937: translated
2938: translating
2938: translated
2939: translating
2939: translated
2940: translating
2940: translated
2941: translating
2941: translated
2942: translating
2942: translated
2943: translating
2943: translated
2944: translating
2944: translated
2945: translating
2945: translated
2946: translating
2946: translated
2947: translating
2947: translated
2948: translating
2948: translated
2949: translating
2949: translated
2950: translating
2950: translated
2951: translating
2951: translated
2952: translating
2952: translated
2953: translating
2953: translated
2954: translating
2954: translated
2955: translating
2955: translated
2956: translating
2956: translated
2957: translating
2957: translated
2958: translating
2958: translated
2959: translating
2959: translated
2960: translating
2960: translated
2961: translating
2961: translated
2962: translating
2962: translated
2963: translating
2963: translated
2964: translating
2964: translated
2965: translating
2965: translated
296

3172: translated
3173: translating
3173: translated
3174: translating
3174: translated
3175: translating
3175: translated
3176: translating
3176: translated
3177: translating
3177: translated
3178: translating
3178: translated
3179: translating
3179: translated
3180: translating
3180: translated
3181: translating
3181: translated
3182: translating
3182: translated
3183: translating
3183: translated
3184: translating
3184: translated
3185: translating
3185: translated
3186: translating
3186: translated
3187: translating
3187: translated
3188: translating
3188: translated
3189: translating
3189: translated
3190: translating
3190: translated
3191: translating
3191: translated
3192: translating
3192: translated
3193: translating
3193: translated
3194: translating
3194: translated
3195: translating
3195: translated
3196: translating
3196: translated
3197: translating
3197: translated
3198: translating
3198: translated
3199: translating
3199: translated
3200: translating
3200: translated
320

In [5]:
# Read the translation checkpoint back; as the cell's last expression the
# resulting frame is rendered as the cell output.
pd.read_csv('kospi200_2015_2019_translated.csv', encoding='utf-8')

Unnamed: 0,File,Start,End,Relevant_Sentences,Translated
0,316140_분기보고서 (2019.03),24,105,['업계의 현황 【국내외 경제 및 금융시장 환경】 (1) 국내외 경제 환경 금융은 ...,Industry Status 【Domestic and foreign economic...
1,000070_반기보고서 (2018.06),17,49,"['업계의 현황 지주회사(持株會社, Holding Company)란 다른 회사의 주...",Status of the industry Holding company (持 株 會 ...
2,000070_분기보고서 (2018.03),16,49,"['업계의 현황 지주회사(持株會社, Holding Company)란 다른 회사의 주...",Status of the industry Holding company (持 株 會 ...
3,000070_분기보고서 (2018.09),17,51,"['업계의 현황 지주회사(持株會社, Holding Company)란 다른 회사의 주...",Status of the industry Holding company (持 株 會 ...
4,000070_사업보고서 (2017.12),18,50,"['업계의 현황 지주회사(持株會社, Holding Company)란 다른 회사의 주...",Status of the industry Holding company (持 株 會 ...
...,...,...,...,...,...
3286,271560_반기보고서 (2017.06),14,32,['산업의 특성 제과산업은 국내 소비자들의 독특한 기호와 고급과자를 중심으로 한 선...,Characteristics of Industry In the confectione...
3287,271560_분기보고서 (2017.09),14,33,['산업의 특성 제과산업은 국내 소비자들의 독특한 기호와 고급과자를 중심으로 한 선...,Characteristics of Industry In the confectione...
3288,000050_반기보고서 (2015.06),10,27,['사업의 개요 (1) 면방직산업 국내 면방직산업은 해방 이후 우리나라의 산업 발전...,Business Overview (1) Cotton Textile Industry ...
3289,000050_반기보고서 (2016.06),9,25,"['주요 제품, 서비스 등 (1) 면방직산업 국내 면방직산업은 해방 이후 우리나라의...","Main products, services, etc. (1) Cotton texti..."


In [2]:
# Load the translation checkpoint into `df`.
# NOTE(review): no other visible cell reads `df` - confirm it is still needed.
df = pd.read_csv('kospi200_2015_2019_translated.csv', encoding='utf-8')

In [5]:
# Write the current in-memory frame to the translation checkpoint CSV
# (without the index column).
df_kospi_filing_2015_2019_parsed.to_csv('kospi200_2015_2019_translated.csv', encoding='utf-8', index=False)

In [None]:
# Write the current in-memory frame to the translation checkpoint CSV
# (without the index column).
# NOTE(review): same statement as the preceding cell, and this cell has no
# execution count - presumably a leftover that can be removed.
df_kospi_filing_2015_2019_parsed.to_csv('kospi200_2015_2019_translated.csv', encoding='utf-8', index=False)