## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import time
from time import strftime

## 2. Request Server & Check Response

In [None]:
%%time
flag = "2"
# searchValue = "1370383911"
# branch = "2130"
searchValue = "2098149915"
branch = "2070"
url = "https://www.kcomwel.or.kr/kcomwel/paym/insu/srch.jsp?confirmyn=Y&flag" + flag + "&type=saeopja_drno&searchValue=" + searchValue + "&branch=" + branch
resp = requests.get(url)
print(resp)

## 3. Check Response Values

In [None]:
# Fetch the search page with a fixed sample query and display the response.
resp = requests.get(
    "https://www.kcomwel.or.kr/kcomwel/paym/insu/srch.jsp"
    "?flag=2&type=saeopja_drno&searchValue=1370383911&branch=2130"
)
resp

In [None]:
# Parse the response body as HTML using the 'html.parser' backend.
dom = BeautifulSoup(resp.text, 'html.parser')

### 3-1. Crawl Data 1
- Business Number와 0번째 값
- 0 : 관리번호

In [None]:
# Whitespace-stripped text of the first <th> element in the parsed page.
dom.select('th')[0].text.strip()

### 3-2. Crawl Data 2
- 0 : 사업장명
- 1 : 업종
- 2 : 고용 상시 인원수
- 3 : 산재 상시 인원수
- 4 : 우편번호
- 5 : 사업장 주소

### Q1. list comprehension에서 추가 조건(대입문)을 주고 싶다면? 

In [4]:
# `count` is assigned but not used by this comprehension; the filter below
# tests the loop variable `i` itself.
count = 0

[i for i in range(1, 10) if i <= 5]

[1, 2, 3, 4, 5]

In [None]:
count = 0

# `count` never changes inside the comprehension, so the condition stays true
# and every value 1..9 is kept. A statement such as `count += 1` cannot appear
# inside a comprehension; an assignment expression, e.g.
# `if (count := count + 1) <= 5` (Python 3.8+), is one way to update it per item.
[i for i in range(1, 10) if count <= 5] # what if we also want to run count += 1 here?

In [None]:
# Print every <td> cell of the parsed page, prefixed by its position.
# The index must be converted with str() before concatenation, and strip()
# must actually be called to obtain the cleaned text.
for count, cell in enumerate(dom.select('td')):
    print('-'*30)
    print('number: ' + str(count) + cell.text.strip())

## 4. Setting Values
### 4-1. Setting 'branch_table'
- branch_table : 지사 테이블
- keys : 지사번호
- values : 지사명
- jisa_dict : {지사번호, 지사명}

In [None]:
# Locate the <select id="insu2004"> element and read its <option> entries:
# the option text goes to `values`, the option "value" attribute to `keys`.
branch_table = dom.find('select', attrs={'id': 'insu2004'})
options = branch_table.find_all('option')
values = list(map(lambda option: option.text, options))
keys = list(map(lambda option: option.get('value'), options))

# Drop the first entry of each list: it carries no meaningful key/value.
del keys[0], values[0]

In [None]:
# Sanity check: both lists should have the same length; then show both.
print(len(keys), len(values))
print(keys, values, sep='\n')

In [None]:
# Map each branch code (key) to its branch name (value).
jisa_dict = {code: name for code, name in zip(keys, values)}
print(len(jisa_dict))
jisa_dict

### 4-2. Load & Reshape Business Number 1

In [None]:
import csv

# Read every row of the CSV file into `wrk_list` (one list of fields per row).
# The `with` block guarantees the file is closed even if reading fails, and
# newline='' is what the csv module expects for file objects it parses.
wrk_list = []
with open('../data/.csv', 'r', encoding='utf-8', newline='') as f:
    rdr = csv.reader(f)
    wrk_list.extend(rdr)

In [None]:
# Preview the first five rows read from the CSV.
wrk_list[:5]

In [None]:
# Collapse each row (a list of field strings) into one concatenated string,
# turning the list of lists into a flat list of strings.
str_wrk_list = list(map(''.join, wrk_list))
print(len(str_wrk_list))
str_wrk_list[:5]

In [None]:
# Skip empty entries and remove the '-' separators from each remaining string.
wrk_no_list = [entry.replace('-', '') for entry in str_wrk_list if entry]

In [None]:
# Show how many numbers remain, followed by the first five of them.
print(len(wrk_no_list), wrk_no_list[:5], sep='\n')

In [None]:
def f(wrk_no_list_split, n=3500):
    """Split a sequence into consecutive chunks of at most ``n`` items.

    The chunks are sliced from the argument itself rather than from a
    notebook global, so the helper works for any sliceable sequence.

    Args:
        wrk_no_list_split: Sequence to split (anything supporting ``len`` and
            slicing).
        n: Maximum chunk size; defaults to 3500.

    Returns:
        List of slices in original order; the last one may be shorter than
        ``n``. An empty input yields an empty list.
    """
    return [
        wrk_no_list_split[i:i + n]
        for i in range(0, len(wrk_no_list_split), n)
    ]
# f(wrk_no_list)

In [None]:
# Number of chunks returned by f for wrk_no_list
print(len(f(wrk_no_list)))
# Size of the first chunk
print(len(f(wrk_no_list)[0]))
# Un-rounded ratio of the list length to 3500, for comparison with the chunk count
print(len(wrk_no_list) / 3500)

### 4-3. Test
- 피보나치 수열로 증가하게끔 list 생성 후
- dataframe 전치하기

In [None]:
# Integers from 1 up to (but not including) 100 in steps of 5.
a = list(range(1, 100, 5))
a

In [None]:
# Build a DataFrame from the list `a` and display its transpose.
pd.DataFrame(a).T

### 4-4. Load & Reshape Business Number 2

In [None]:
# Read all lines of the text file into `lst` (line endings are kept).
# The `with` block closes the file even if reading raises.
with open('../data/.txt', 'r') as f:
    lst = f.readlines()

In [None]:
# Show how many lines were read, then preview the first ten.
print(len(lst))
lst[:10]

#### Method 1

In [None]:
# Explicit-loop version: strip surrounding whitespace (including the trailing
# newline) from every line and collect the results in `my_lst`.
my_lst = []

for i in lst:
    # Append to `my_lst`, the list created above.
    my_lst.append(i.strip())

print(len(my_lst))
my_lst[:10]

#### Method 2

In [None]:
# Strip surrounding whitespace from every line in a single expression.
my_lst = list(map(str.strip, lst))
print(len(my_lst))
my_lst[:10]

In [None]:
def f(my_lst, n=500):
    """Return ``my_lst`` cut into consecutive slices of at most ``n`` items."""
    starts = range(0, len(my_lst), n)
    return [my_lst[start:start + n] for start in starts]

## 5. Define Crawler

In [None]:
def get_kcomwel_data(number):
    """Crawl the search page for one chunk of business numbers.

    Selects chunk ``number`` of ``f(my_lst)`` and, for every business number
    in it, requests the search page once per branch code in ``keys``. A row is
    collected for each response that contains at least one
    ``<th class="r-none">`` element.

    Relies on notebook globals defined in earlier cells: ``f``, ``my_lst``,
    ``keys`` and ``jisa_dict``.

    Args:
        number: Index of the chunk to crawl; converted with ``int()``.

    Returns:
        pandas.DataFrame built from the collected rows. Each row holds the
        whitespace-split text of every ``<td>`` (cells 1 and 5 joined back
        into a single comma-free string), followed by the branch name, the
        business number and the third token of each matching ``<th>``.

    Raises:
        IndexError: If ``number`` is out of range, or a matching page has
            fewer than six ``<td>`` cells or a ``<th>`` with fewer than three
            tokens.
        KeyError: If a branch code is missing from ``jisa_dict``.
    """
    result = []

    # Wall-clock timing is done by hand; the %%time cell magic is only valid
    # as the first line of a notebook cell, never inside a function body.
    start_time = time.time()
    now_1 = strftime("%y/%m/%d %H:%M:%S")
    print(now_1)
    print("start_time: ", start_time)

    number = int(number)
    my_lst_2 = f(my_lst)[number]
    # One request per (business number, branch) pair.
    total = len(my_lst_2) * len(keys)
    count = 1

    for wrk_no in my_lst_2:
        flag = "2"
        searchValue = wrk_no

        for branch_no in keys:
            # Query the branch currently being iterated, and write the flag
            # parameter as "flag=<value>".
            url = (
                "https://www.kcomwel.or.kr/kcomwel/paym/insu/srch.jsp"
                "?confirmyn=Y&flag=" + flag
                + "&type=saeopja_drno&searchValue=" + searchValue
                + "&branch=" + branch_no
            )
            resp = requests.get(url)
            print('- count: ', count, '/', total, '...%.2f' % ((count / total) * 100), '%')
            count += 1

            dom = BeautifulSoup(resp.text, 'html.parser')
            # Management-number header cells; scrape only when at least one exists.
            th_elem = dom.find_all('th', attrs={'class': 'r-none'})
            if not th_elem:
                continue

            row = [el.text.split() for el in dom.find_all('td')]
            # Industry: re-join the split tokens and drop commas.
            row[1] = [''.join(row[1]).replace(',', '')]
            # Workplace address: same treatment.
            row[5] = [''.join(row[5]).replace(',', '')]
            # Branch name for the queried branch code.
            row.append(jisa_dict[branch_no])
            # Business number that was searched.
            row.append(searchValue)
            # Management number(s): third whitespace-separated token of each header.
            row.extend(th.text.split()[2] for th in th_elem)
            result.append(row)

    print()
    print("-"*30, "The End", "-"*30)
    print("-"*30, "%.2f seconds" % (time.time() - start_time), "-"*30)
    now_2 = strftime("%y/%m/%d %H:%M:%S")
    print("-"*30, now_2, "-"*30)
    print()

    # Columns are positional. Per the original author's notes (assumes eight
    # <td> cells per page -- TODO confirm): 0 workplace name, 1 industry,
    # 2 employment-insurance regular headcount, 3 industrial-accident-insurance
    # regular headcount, 4 postal code, 5 workplace address, 6-7 unlabeled,
    # 8 branch, 9 business registration number, 10+ management number(s).
    df = pd.DataFrame(result)

    print(len(result))
    print(df.shape)
    return df

In [None]:
from urllib.error import HTTPError

def download(i, num_retries=10):
    """Crawl chunk ``i`` with ``get_kcomwel_data``, retrying on 5xx errors.

    Args:
        i: Chunk index forwarded to ``get_kcomwel_data``.
        num_retries: How many more attempts are allowed after a 5xx
            ``HTTPError``; defaults to 10.

    Returns:
        pandas.DataFrame with the crawled rows re-indexed from 0, or an empty
        DataFrame when the crawl failed with an ``HTTPError`` that is not
        retried (non-5xx status, or retries exhausted).
    """
    df = pd.DataFrame()
    try:
        print("Downloading: ", i)
        get_data = get_kcomwel_data(i)
        # DataFrame.append was removed in pandas 2.0; re-indexing the crawl
        # result gives the same frame that appending to an empty one did.
        df = get_data.reset_index(drop=True)

    except HTTPError as e:
        # NOTE(review): get_kcomwel_data fetches pages with `requests`, whose
        # errors derive from requests.exceptions.RequestException rather than
        # urllib's HTTPError -- confirm which exception should trigger a retry.
        print("Download Error: ", e.reason)
        code = getattr(e, 'code', None)
        # Retry only server-side (5xx) failures, and only while attempts
        # remain, so the recursion always terminates.
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download(i, num_retries - 1)
    return df

In [None]:
# Crawl chunk 0 and display the resulting DataFrame.
result = download(0)
result

In [None]:
# Write the crawl result to an Excel file named with today's date (yymmdd) and
# the chunk index. The `encoding` keyword is not passed: it was unused by
# to_excel and is no longer accepted since pandas 2.0.
result.to_excel('./kcomwel_business_info({0}_{1}).xlsx'.format(strftime('%y%m%d'), '0'))