In [1]:
import time, math
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import requests as rs
import operator, re
import numpy as np
from konlpy.tag import Hannanum
from google.cloud import translate
import os
from nltk.corpus import wordnet

In [50]:
def init_variables():
    """Initialise the module-level helpers used by the rest of the notebook.

    Sets three globals:

    * ``file_check``       -- substrings that mark a URL as a downloadable
      file or video rather than an HTML page (used by ``classify_url``).
    * ``translate_client`` -- Google Cloud Translate client.
    * ``hannanum``         -- KoNLPy Hannanum part-of-speech tagger.

    The Google client locates its service-account key through the
    ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.  A value that is
    already set is respected; the bundled key file name is only used as a
    fallback when nothing has been configured.
    """
    print("1. init variables")

    global file_check, translate_client, hannanum

    # downloadable file string pattern
    file_check = ["download", "down", "file", "pdf", "excel", "xlsx", "docx", "hwp", "youtube"]

    # google translate API
    # setdefault (instead of plain assignment) so credentials configured
    # outside the notebook are not silently overwritten.
    os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "My Project-a8e42c74ea7e.json")
    translate_client = translate.Client()

    # Hannanum pos tagger
    hannanum = Hannanum()


def init_count_dict():
    """Reset the global ``company_count_dict`` to an empty dictionary."""
    global company_count_dict
    company_count_dict = dict()

In [3]:
def init_company_list():
    """Load the known company names from the two bundled CSV files.

    Each file is read line by line, de-duplicated (and thereby sorted) with
    ``np.unique``, and the two results are concatenated: first the entries of
    ``company_list3.csv``, then those of ``abroad_company_list.csv``.

    Returns:
        list: company names; duplicates are removed within each file but not
        across the two files.
    """
    source_files = (
        "app/static/data/company_list3.csv",
        "app/static/data/abroad_company_list.csv",
    )

    names = []
    for path in source_files:
        with open(path, "r", encoding="utf-8") as handle:
            lines = handle.read().splitlines()
        names += list(np.unique(lines))

    return names

In [4]:
# build an API-specific service
def getService(developer_key=None):
    """Build a Google Custom Search (``customsearch`` v1) API client.

    Args:
        developer_key: API key to use.  When omitted, the ``GOOGLE_API_KEY``
            environment variable is consulted, and only if that is unset too
            does the function fall back to the key that was historically
            embedded in this notebook.

    Returns:
        The service object returned by ``googleapiclient.discovery.build``.
    """
    # FIXME(security): the literal fallback below is a credential committed to
    # source.  It is kept only so existing runs keep working; rotate it and
    # supply the key via GOOGLE_API_KEY (or the argument) instead.
    key = (
        developer_key
        or os.environ.get("GOOGLE_API_KEY")
        or "AIzaSyCNLAU_Lunh5aJIo17DlslQvKoQGU7yDjA"
    )

    return build("customsearch", "v1", developerKey=key)

In [5]:
def get_derived_query(keyword):
    """Expand a Korean search keyword with WordNet synonyms of its nouns.

    Every noun found by the Hannanum tagger is translated to English, its
    noun synsets are looked up in WordNet, and each synonym is translated
    back to Korean and substituted for the original noun in ``keyword``.

    Relies on the globals ``hannanum`` and ``translate_client`` created by
    ``init_variables()``.

    Args:
        keyword: the original search phrase.

    Returns:
        list[str]: ``keyword`` first, followed by the distinct derived
        queries in discovery order.  Note that every extra query costs
        additional search requests downstream.
    """
    nouns = [word for word, pos in hannanum.pos(keyword) if pos == "N"]
    syn_dict = {}
    query_list = [keyword]

    for noun in nouns:
        translated_noun = translate_client.translate(noun, target_language="en")["translatedText"]
        if len(translated_noun.split(" ")) > 1:  # compound nouns are not handled
            continue

        # Collected once per noun.  (Previously the list was re-created for
        # every synset, so only the last synset survived, and it was left
        # undefined/stale when WordNet returned no synsets at all.)
        synonyms = []
        for syn in wordnet.synsets(translated_noun):
            if syn.pos() != "n":
                continue
            # Synset names look like "blood_glucose.n.01": keep the lemma and
            # turn WordNet's underscores back into spaces before translating.
            syn_word = syn.name().split(".")[0].replace("_", " ")
            if syn_word not in synonyms:
                synonyms.append(syn_word)

        syn_dict[noun] = synonyms

    for noun, synonyms in syn_dict.items():
        for syn in synonyms:
            syn_ko = translate_client.translate(syn, target_language="ko")["translatedText"]
            derived = keyword.replace(noun, syn_ko)
            # Skip round-trips that reproduce an existing query.
            if derived not in query_list:
                query_list.append(derived)

    return query_list

In [6]:
def google_search(keyword):
    """Collect result URLs for ``keyword`` and its derived queries.

    Each query from ``get_derived_query(keyword)`` is sent to the Google
    Custom Search engine and paged through ten results at a time.

    Args:
        keyword: the original search phrase.

    Returns:
        list: the distinct result URLs over all queries, sorted
        (de-duplicated with ``np.unique``).
    """
    service = getService()  # connect to the Google API

    response_url = []  # URL list

    query_list = get_derived_query(keyword)
    print("2. retrieve related urls with keyword ", query_list)

    page_size = 10
    # The Custom Search JSON API serves at most 100 results per query, so the
    # last valid page starts at 91.  Stopping there avoids one request per
    # query that is guaranteed to fail (the old loop ended on that error).
    last_start = 91

    for query in query_list:
        for start_index in range(1, last_start + 1, page_size):
            try:
                result = service.cse().list(
                    q=query,  # search keyword
                    cx='001132580745589424302:jbscnf14_dw',  # CSE key
                    lr='lang_ko',  # restrict results to Korean
                    start=start_index,
                    filter="0"
                ).execute()

                # No "items" key means there is nothing (more) to fetch.
                if "items" not in result:
                    break

                items = result["items"]
                for item in items:
                    response_url.append(item["link"])

                # A short page is the last page.
                if len(items) < page_size:
                    break
            except Exception as e:
                # Best effort: report the problem and move on to the next query.
                print(e)
                break

    response_url = list(np.unique(response_url))

    print("The number of all results : " + str(len(response_url)))

    return response_url

In [7]:
def classify_url(url_list):
    """Split URLs into downloadable resources and ordinary HTML pages.

    A URL counts as downloadable when its lower-cased form contains any of
    the substrings in the global ``file_check`` (file extensions, "download",
    "youtube", ...).

    Args:
        url_list: iterable of URL strings.

    Returns:
        tuple: ``(download_list, html_list)``, each preserving input order.
    """
    print("3. classify page urls")

    download_list, html_list = [], []

    for url in url_list:
        lowered = url.lower()
        # pick the bucket first, then append once
        if any(pattern in lowered for pattern in file_check):
            bucket = download_list
        else:
            bucket = html_list
        bucket.append(url)

    total_count = str(len(url_list))
    html_count = str(len(html_list))
    download_count = str(len(download_list))
    print("All : " + total_count + " ( HTML URL : " + html_count + " / Downloadable URL : " + download_count + " )")

    return download_list, html_list

In [8]:
def locations_of_substring(string, substring):
    """Return the start offsets of all non-overlapping occurrences.

    The scan is iterative, so texts with thousands of matches no longer hit
    Python's recursion limit.

    Args:
        string: text to search in.
        substring: non-empty text to search for.

    Returns:
        list[int]: offsets in ascending order; empty when there is no match.

    Raises:
        ValueError: if ``substring`` is empty (it would match at every
            position and the scan could never advance).
    """
    if not substring:
        raise ValueError("substring must not be empty")

    step = len(substring)
    locations = []

    position = string.find(substring)
    while position != -1:
        locations.append(position)
        # Resume after the current match so occurrences never overlap.
        position = string.find(substring, position + step)

    return locations

In [9]:
def remove_postposition(word):
    """Strip one trailing Korean particle from ``word``.

    Removes the three-character suffix "에서는" if present; otherwise removes
    a single trailing "은", "는", "이", "가" or "의".  Anything else, including
    the empty string, is returned unchanged.

    Args:
        word: the token to clean.

    Returns:
        str: ``word`` without the trailing particle.
    """
    # str.endswith is safe on an empty string, unlike indexing word[-1].
    if word.endswith("에서는"):
        return word[:-3]
    if word.endswith(("은", "는", "이", "가", "의")):
        return word[:-1]
    return word

In [63]:
def parse_html(url, timeout=10):
    """Download a page and return the visible text of its ``<body>``.

    Scripts, styles, links, images and footer blocks are removed before the
    text is extracted.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole crawl.

    Returns:
        str | None: the stripped body text, or ``None`` when the request
        fails or the document has no ``<body>``.
    """
    try:
        response = rs.get(url, timeout=timeout)
    except Exception:
        # Best-effort crawl: an unreachable or malformed URL is skipped.
        return None

    # Hand BeautifulSoup the raw bytes and let it detect the charset.  The
    # previous decode/re-encode round trip produced the same bytes for valid
    # pages but raised (and dropped the page) on any undecodable byte.
    soup = BeautifulSoup(response.content, 'html.parser').body

    if soup is None:
        return None

    # remove all script, style, link and image elements
    for tag_name in ("script", "style", "a", "img"):
        for element in soup.find_all(tag_name):
            element.decompose()

    # remove div blocks whose class or id is "footer"
    for attribute in ("class", "id"):
        for div in soup.find_all("div", {attribute: "footer"}):
            div.decompose()

    # remove the (first) "footer" element
    footer = soup.find("footer")
    if footer is not None:
        footer.decompose()

    # extract only text
    return soup.get_text().strip()

In [14]:
# Search phrase (Korean for "non-invasive blood glucose sensor").
keyword = "비침습 혈당 센서"
# Create the global translate client, POS tagger and download patterns.
init_variables()
# Known company names loaded from the bundled CSV files.
company_list = init_company_list()
# All distinct result URLs for the keyword and its synonym-derived queries.
url_list = google_search(keyword)
# Separate file/video links from HTML pages; only html_list is scanned later.
download_list, html_list = classify_url(url_list)

1. init variables
2. retrieve related urls with keyword  ['비침습 혈당 센서', '비침습 혈당 탐지기']
The number of all results : 123
3. classify page urls
All : 123 ( HTML URL : 56 / Downloadable URL : 67 )


In [81]:
init_count_dict()

# Characters that may directly follow a company name for the hit to count as
# a whole-word match: Korean particles, whitespace and closing punctuation.
name_terminators = ["은", "는", "이", "가", "의", "\n", "\t", "\r", " ", ",", ".", ")"]

for i, url in enumerate(html_list):
    # parse html
    html_text = parse_html(url)
    if html_text is None:
        continue

    print("URL", str(i) + ". " + url)

    for company in company_list:
        company2 = company.replace("(주)", "")
        # Skip empty names (blank CSV lines) and one-character names, which
        # would match almost anywhere.
        if len(company2) <= 1:
            continue

        # Only the first occurrence of the name is examined.
        temp_idx = html_text.find(company2)
        if temp_idx == -1:
            continue

        name_end = temp_idx + len(company2)
        # Slicing (not indexing) so a name at the very end of the text gives
        # "" instead of raising IndexError.
        following = html_text[name_end:name_end + 2]

        if following == "" or following[:1] in name_terminators or following == "에서":
            print(company2)

            # Debug behaviour kept from the original: stop at the first match.
            break
    # Debug behaviour kept from the original: only the first parsable page is
    # inspected.  Remove this break to scan every URL.
    break

URL 0. http://biomed.khu.ac.kr/professor/professor_detail.html?seq=3
대한


In [73]:
init_count_dict()

# NOTE: this replaces the html_list produced by classify_url() with a single
# hand-picked page, so only this URL is scanned below.
html_list = ["https://fic.ulsan.ac.kr/open_content/information/research/detail/major/"]

# Characters treated as the boundary of a company name.
name_delimiters = [" ", "\n", "\r", "\t", ".", ","]
whitespace_delimiters = [" ", "\n", "\r", "\t"]

for i, url in enumerate(html_list):
    # parse html
    html_text = parse_html(url)
    if html_text is None:
        continue

    print("URL", str(i) + ". " + url)

    # Case 1 (looking up DB names that contain "(주)"/"㈜") is currently
    # disabled.  It used to record the offsets it matched in this list so that
    # Case 3 could skip them; with Case 1 off the list stays empty.  It must
    # still exist, otherwise the first marker found raises NameError.
    found_index_list = []

    # Case 3) No company DB: harvest names written with a "(주)" / "㈜" marker.
    try:
        marker_hits = []
        for marker in ("(주)", "㈜"):
            marker_hits += [(idx, marker) for idx in locations_of_substring(html_text, marker)]
    except RecursionError:
        # Too many markers for the recursive scanner: skip this page.
        continue

    for idx, marker in marker_hits:
        if idx in found_index_list:  # already handled by Case 1
            continue

        # Use the real marker length: "(주)" is three characters, "㈜" is one.
        marker_end = idx + len(marker)
        prev_char = html_text[idx - 1] if idx > 0 else None  # None = start of text
        next_char = html_text[marker_end:marker_end + 1]      # "" = end of text
        tail = html_text[idx:]

        if prev_char is None or prev_char in name_delimiters:
            # Marker in front of the name: read up to the next delimiter
            # (or to the end of the text when none follows).
            cuts = [tail.find(c) for c in name_delimiters if tail.find(c) > 0]
            cor_word = tail[:min(cuts)] if cuts else tail
        elif next_char == "" or next_char in name_delimiters:
            # Marker behind the name: read back to the previous delimiter
            # (or to the start of the text when there is none).
            head = html_text[:marker_end]
            name_start = max(head.rfind(c) for c in name_delimiters) + 1
            cor_word = head[name_start:]
        else:
            # Marker glued to the preceding text: treat it as a prefix and
            # read up to the next whitespace only.
            cuts = [tail.find(c) for c in whitespace_delimiters if tail.find(c) > 0]
            cor_word = tail[:min(cuts)] if cuts else tail

        # Remember which marker the word carries and on which side, strip it,
        # clean the bare name, then put the marker back.
        ju = "(주)" if "(주)" in cor_word else "㈜"
        loc_ju = "f" if cor_word.find(ju) == 0 else "b"
        cor_word = cor_word.replace(ju, "")
        # drop any parenthesised remark
        cor_word = re.sub(r'\([^)]*\)', "", cor_word)
        # Remove special characters.  The hyphen is escaped: the old class
        # contained the accidental range ",-=", which also deleted every
        # digit.  The punctuation that range covered ("/", ":", ";", "<") is
        # still removed; digits are now kept.
        cor_word = re.sub(r'[?|$.,\-=!()•/:;<]', '', cor_word).strip()
        cor_word = ju + cor_word if loc_ju == "f" else cor_word + ju

        # skip names of one character or less
        if len(cor_word.replace(ju, "")) <= 1:
            print(cor_word, " name too short")
            continue

        cor_word = cor_word.replace("㈜", "(주)")
        cor_word = remove_postposition(cor_word)

        # skip names that are already in the company DB
        if cor_word in company_list:
            print("Case 3)", cor_word, "already exists in the DB")
            continue

        print("Case 3)", cor_word)
        # NOTE: mutates company_list in place, so re-running this cell reports
        # the same names as "already exists".
        company_list.append(cor_word)
        company_count_dict[cor_word] = {"count": 1, "url_list": [url]}
        

URL 0. https://fic.ulsan.ac.kr/open_content/information/research/detail/major/


In [72]:
def locations_of_substring(string, substring):
    """Return the start offsets of all non-overlapping occurrences.

    The scan is iterative, so texts with thousands of matches no longer hit
    Python's recursion limit.

    Args:
        string: text to search in.
        substring: non-empty text to search for.

    Returns:
        list[int]: offsets in ascending order; empty when there is no match.

    Raises:
        ValueError: if ``substring`` is empty (it would match at every
            position and the scan could never advance).
    """
    if not substring:
        raise ValueError("substring must not be empty")

    step = len(substring)
    locations = []

    position = string.find(substring)
    while position != -1:
        locations.append(position)
        # Resume after the current match so occurrences never overlap.
        position = string.find(substring, position + step)

    return locations

In [None]:
# Display the collected results: company name -> {"count", "url_list"}.
company_count_dict