## 1. Database

### 1) Scrapy 프로젝트 생성

In [1]:
import scrapy, requests
from scrapy.http import TextResponse

In [2]:
!scrapy startproject covid_19_collect

New Scrapy project 'covid_19_collect', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/covid_19/covid_19_collect

You can start your first spider with:
    cd covid_19_collect
    scrapy genspider example example.com


### 2) 데이터 크롤링 확인

#### 코로나 바이러스 감염증

In [3]:
# Fetch the ncov.mohw.go.kr landing page and wrap it in a Scrapy TextResponse
# so the XPath expressions can be tried out here before they go into the spider.
headers = {'user-agent':'Mozilla/5.0'}
# requests has no default timeout; without one a stalled connection blocks the cell forever.
req = requests.get('http://ncov.mohw.go.kr/',headers=headers, timeout=10)
# Stop here on a 4xx/5xx answer instead of running XPath queries against an error page.
req.raise_for_status()
response = TextResponse(req.url, body=req.text, encoding='utf-8')
response

<200 http://ncov.mohw.go.kr/>

In [4]:
# Headline text: keep its first three space-separated tokens, then turn the
# opening parenthesis into the "21." year prefix and drop any comma.
headline = response.xpath('/html/body/div/div[5]/div[2]/div/div[1]/div[1]/h2/a/span[1]/text()').extract()[0]
date = " ".join(headline.split(" ")[:3]).replace("(","21.").replace(",","")

# Every matched text node is converted to int; both results stay lists.
country_in = [int(text) for text in response.xpath('/html/body/div/div[5]/div[2]/div/div[1]/div[1]/div[1]/div/ul/li[1]/span[2]/text()').extract()]
country_out = [int(text) for text in response.xpath('/html/body/div/div[5]/div[2]/div/div[1]/div[1]/div[1]/div/ul/li[2]/span[2]/text()').extract()]

# Pairwise sums of the two lists; only the first sum is kept.
pair_sums = [inside + outside for inside, outside in zip(country_in, country_out)]
total_country = pair_sums[0]

# First text node of two buttons inside the #main_maplayout element.
capital_distance = response.xpath('//*[@id="main_maplayout"]/button[9]/span[2]/text()').extract()[0]
noncapital_distance = response.xpath('//*[@id="main_maplayout"]/button[17]/span[2]/text()').extract()[0]

(date, country_in, country_out, total_country, capital_distance, noncapital_distance)


('21.3.15. 00시 기준', [370], [12], 382, '2', '1.5')

#### 코로나19 예방접종

In [3]:
# Fetch the ncv.kdca.go.kr landing page and wrap it in a Scrapy TextResponse
# so the XPath expressions can be tried out here before they go into the spider.
headers = {'user-agent':'Mozilla/5.0'}
# requests has no default timeout; without one a stalled connection blocks the cell forever.
req = requests.get('http://ncv.kdca.go.kr/',headers=headers, timeout=10)
# Stop here on a 4xx/5xx answer instead of running XPath queries against an error page.
req.raise_for_status()
response = TextResponse(req.url, body=req.text, encoding='utf-8')
response

<200 http://ncv.kdca.go.kr/>

In [7]:
# Reference-time label: first matched text node, commas removed, prefixed with
# the two-digit year "21.".
date_texts = response.xpath('//*[@id="content"]/div[2]/div/h2/span[2]/em/text()').extract()
date = '21.' + date_texts[0].replace(",","")

# Only take_yesterday_1 is reduced to its first match (a str);
# every other value below is kept as the full list of matched text nodes.
take_yesterday_1 = response.xpath('//*[@id="content"]/div[2]/div/div/div[1]/div[1]/ul/li[2]/div/p/span/text()').extract()[0]
take_yesterday_2 = response.xpath('//*[@id="content"]/div[2]/div/div/div[1]/div[2]/ul/li[2]/div/p/span/text()').extract()

total_1 = response.xpath('//*[@id="content"]/div[2]/div/div/div[1]/div[1]/ul/li[1]/div/p/span[1]/text()').extract()
total_2 = response.xpath('//*[@id="content"]/div[2]/div/div/div[1]/div[2]/ul/li[1]/div/p/span[1]/text()').extract()

# Regional figures come from the p[1], p[9] and p[4] entries of the same list.
seoul_vaccine = response.xpath('//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[1]/span[2]/text()').extract()
gyeonggi_vaccine = response.xpath('//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[9]/span[2]/text()').extract()
incheon_vaccine = response.xpath('//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[4]/span[2]/text()').extract()

(date, take_yesterday_1, take_yesterday_2, total_1, total_2,
 seoul_vaccine, gyeonggi_vaccine, incheon_vaccine)

('21.3.16. 0시 기준',
 '11,922',
 ['0'],
 ['602,150'],
 ['0'],
 ['3,216'],
 ['2,875'],
 ['501'])

### 3) items.py 파일 생성

In [6]:
%%writefile covid_19_collect/covid_19_collect/items.py
import scrapy


class Covid19CollectItem(scrapy.Item):
    """One scraped record combining case figures and vaccination figures.

    The spider's ``parse_content`` callback fills the first six fields from
    ncov.mohw.go.kr; its ``parse`` callback fills the remaining seven from
    ncv.kdca.go.kr.
    """
    # Reference-time label built from the case page headline.
    date = scrapy.Field()
    # Counts read from the first and second entries of the case page's list
    # (presumably domestic vs. imported cases -- TODO confirm against the page).
    country_in = scrapy.Field()
    country_out = scrapy.Field()
    # Sum of country_in and country_out.
    total_country = scrapy.Field()
    # Text of two buttons inside the page's #main_maplayout element
    # (presumably distancing levels for the capital area and elsewhere -- TODO confirm).
    capital_distance = scrapy.Field()
    noncapital_distance = scrapy.Field()
    # Vaccination page figures, stored as the scraped text. The _1/_2 suffixes
    # follow the two side-by-side panels on the page
    # (presumably first and second dose -- TODO confirm).
    take_yesterday_1 = scrapy.Field()
    take_yesterday_2 = scrapy.Field()
    total_1 = scrapy.Field()
    total_2 = scrapy.Field()
    # Per-region vaccination figures, stored as the scraped text.
    seoul_vaccine = scrapy.Field()
    gyeonggi_vaccine = scrapy.Field()
    incheon_vaccine = scrapy.Field()

Overwriting covid_19_collect/covid_19_collect/items.py


### 4) spider.py 파일 생성

In [12]:
%%writefile covid_19_collect/covid_19_collect/spiders/spider.py
import scrapy,re
from scrapy.crawler import CrawlerProcess
from covid_19_collect.items import Covid19CollectItem

class Spider(scrapy.Spider):
    """Scrape one combined record of COVID-19 case and vaccination figures.

    The crawl is a two-step chain: ``parse_content`` reads the case figures
    from ncov.mohw.go.kr and forwards the partially filled item through
    ``Request.meta`` to ``parse``, which adds the vaccination figures from
    ncv.kdca.go.kr and yields the finished item.
    """
    name = "covid_19_collect"
    # start_requests() is overridden below, so this list is not what drives the crawl.
    start_urls =  ["http://ncov.mohw.go.kr/",
                   'http://ncv.kdca.go.kr/']

    # Item field -> XPath of the text node to read on the vaccination page.
    VACCINE_XPATHS = {
        'take_yesterday_1': '//*[@id="content"]/div[2]/div/div/div[1]/div[1]/ul/li[2]/div/p/span/text()',
        'take_yesterday_2': '//*[@id="content"]/div[2]/div/div/div[1]/div[2]/ul/li[2]/div/p/span/text()',
        'total_1': '//*[@id="content"]/div[2]/div/div/div[1]/div[1]/ul/li[1]/div/p/span[1]/text()',
        'total_2': '//*[@id="content"]/div[2]/div/div/div[1]/div[2]/ul/li[1]/div/p/span[1]/text()',
        'seoul_vaccine': '//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[1]/span[2]/text()',
        'gyeonggi_vaccine': '//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[9]/span[2]/text()',
        'incheon_vaccine': '//*[@id="content"]/div[2]/div/div/div[2]/div/div[1]/div[2]/div[2]/div/p[4]/span[2]/text()',
    }

    @staticmethod
    def _first_text(response, xpath):
        """Return the first text node matched by ``xpath``; IndexError if nothing matches."""
        return response.xpath(xpath).extract()[0]

    @classmethod
    def _first_int(cls, response, xpath):
        """Return the first match of ``xpath`` as an int, ignoring thousands separators."""
        return int(cls._first_text(response, xpath).replace(",", ""))

    def start_requests(self):
        """Start the chain with the case page; the vaccination page is requested from its callback."""
        yield scrapy.Request("http://ncov.mohw.go.kr/", callback=self.parse_content)

    def parse_content(self, response):
        """Fill the case-related fields, then request the vaccination page.

        Yields a ``scrapy.Request`` for ncv.kdca.go.kr carrying the partially
        filled item in ``meta['item']``.

        Raises:
            IndexError: if one of the XPaths matches nothing (page layout changed).
            ValueError: if a case count is not a number.
        """
        item = Covid19CollectItem()
        headline = self._first_text(response, '/html/body/div/div[5]/div[2]/div/div[1]/div[1]/h2/a/span[1]/text()')
        # Keep the first three space-separated tokens of the headline.
        date = " ".join(headline.split(" ")[0:3])
        # NOTE(review): the year prefix "21." is hard-coded and will be wrong for other years.
        item['date'] = date.replace("(","21.").replace(",","")

        # Store plain ints. The previous version reached the same result only
        # through a list comprehension that used item[...] as its loop targets.
        country_in = self._first_int(response, '/html/body/div/div[5]/div[2]/div/div[1]/div[1]/div[1]/div/ul/li[1]/span[2]/text()')
        country_out = self._first_int(response, '/html/body/div/div[5]/div[2]/div/div[1]/div[1]/div[1]/div/ul/li[2]/span[2]/text()')
        item['country_in'] = country_in
        item['country_out'] = country_out
        item['total_country'] = country_in + country_out

        item['capital_distance'] = self._first_text(response, '//*[@id="main_maplayout"]/button[9]/span[2]/text()')
        item['noncapital_distance'] = self._first_text(response, '//*[@id="main_maplayout"]/button[17]/span[2]/text()')
        meta = {'item':item}
        yield scrapy.Request('http://ncv.kdca.go.kr/',meta=meta, callback=self.parse)

    def parse(self, response):
        """Add the vaccination fields to the item passed in ``meta`` and yield it.

        Every value is stored as the scraped text of the first matching node.

        Raises:
            IndexError: if one of the XPaths matches nothing (page layout changed).
        """
        item = Covid19CollectItem(response.meta['item'])
        for field, xpath in self.VACCINE_XPATHS.items():
            item[field] = self._first_text(response, xpath)
        yield item

Overwriting covid_19_collect/covid_19_collect/spiders/spider.py


### 5) shell script 파일 생성

In [8]:
%pwd

'/home/ubuntu/python3/notebook/covid_19'

In [9]:
%%writefile collect.sh
# Run the covid_19_collect spider from inside its Scrapy project directory and
# export the scraped items to covid_19_total.csv.
# Abort when the directory is missing so scrapy is never started from the wrong place.
cd /home/ubuntu/python3/notebook/covid_19/covid_19_collect/ || exit 1
scrapy crawl covid_19_collect -o covid_19_total.csv

Writing collect.sh


In [13]:
!/bin/bash collect.sh

2021-03-16 20:18:37 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: covid_19_collect)
2021-03-16 20:18:37 [scrapy.utils.log] INFO: Versions: lxml 4.6.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Dec 25 2020, 07:37:56) - [GCC 7.5.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1j  16 Feb 2021), cryptography 3.4.6, Platform Linux-5.4.0-1039-aws-x86_64-with-debian-buster-sid
2021-03-16 20:18:37 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-03-16 20:18:37 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'covid_19_collect',
 'NEWSPIDER_MODULE': 'covid_19_collect.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['covid_19_collect.spiders']}
2021-03-16 20:18:37 [scrapy.extensions.telnet] INFO: Telnet Password: 3d472213bb453c36
2021-03-16 20:18:37 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.exte

### 6) 크롤링된 데이터 저장 및 프레임화

In [14]:
import pandas as pd
# Load the CSV feed that collect.sh exported inside the Scrapy project directory.
df = pd.read_csv('./covid_19_collect/covid_19_total.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,capital_distance,country_in,country_out,date,gyeonggi_vaccine,incheon_vaccine,noncapital_distance,seoul_vaccine,take_yesterday_1,take_yesterday_2,total_1,total_2,total_country
0,2,345,18,21.3.16. 00시 기준,2875,501,1.5,3216,11922,0,602150,0,363


### 7) MongoDB 연결 파일(mongodb.py) 및 pipelines.py 파일 생성

In [15]:
import pymongo

In [16]:
%%writefile covid_19_collect/covid_19_collect/mongodb.py
import os

import pymongo

# Read the connection string from the MONGODB_URI environment variable so real
# credentials never have to be written into this file. The fallback literal is
# only a placeholder and must be replaced or overridden before use.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://ID:Password@address")

client = pymongo.MongoClient(MONGODB_URI)
# Target: database "covid19_total", collection "total".
collection = client.covid19_total.total

Writing covid_19_collect/covid_19_collect/mongodb.py


In [18]:
%%writefile covid_19_collect/covid_19_collect/pipelines.py
from itemadapter import ItemAdapter
from .mongodb import collection

class Covid19CollectPipeline:
    """Item pipeline that stores every scraped item as one MongoDB document."""

    def process_item(self, item, spider):
        """Insert ``item`` into the module-level ``collection`` and pass it on.

        Args:
            item: the scraped item; all thirteen fields read below must be set.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages and feed exports still receive it.

        Raises:
            KeyError: if one of the fields read below is missing from ``item``.
        """
        # The document renames the vaccination fields (total_1 -> total_vaccine_1,
        # take_yesterday_1 -> vaccine_yesterday_1, ...); the other keys match the item.
        data = {"date": item['date'], "country_in": item['country_in'],
                "country_out": item['country_out'], "total_country": item['total_country'],
                "capital_distance": item['capital_distance'], "noncapital_distance": item['noncapital_distance'],
                'total_vaccine_1' : item['total_1'], 'total_vaccine_2' : item['total_2'],
                'vaccine_yesterday_1': item['take_yesterday_1'], 'vaccine_yesterday_2': item['take_yesterday_2'],
                'seoul_vaccine' : item['seoul_vaccine'], 'gyeonggi_vaccine' : item['gyeonggi_vaccine'],
                'incheon_vaccine' : item['incheon_vaccine']}
        # Collection.insert() is deprecated and was removed in PyMongo 4; insert_one() replaces it.
        collection.insert_one(data)
        # Was `return items`, an undefined name that raised NameError for every item.
        return item
    

Overwriting covid_19_collect/covid_19_collect/pipelines.py


In [20]:
!/bin/bash collect.sh

2021-03-16 20:29:05 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: covid_19_collect)
2021-03-16 20:29:05 [scrapy.utils.log] INFO: Versions: lxml 4.6.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Dec 25 2020, 07:37:56) - [GCC 7.5.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1j  16 Feb 2021), cryptography 3.4.6, Platform Linux-5.4.0-1039-aws-x86_64-with-debian-buster-sid
2021-03-16 20:29:05 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-03-16 20:29:05 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'covid_19_collect',
 'NEWSPIDER_MODULE': 'covid_19_collect.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['covid_19_collect.spiders']}
2021-03-16 20:29:05 [scrapy.extensions.telnet] INFO: Telnet Password: 1f0e1c04ebd72868
2021-03-16 20:29:05 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.exte