### This is the ETL process for the NBC News timeline of key events related to COVID-19, found here:
    
    https://dataviz.nbcnews.com/projects/20200302-covid-timeline/index.html?initialWidth=1160&childId=embed-20200302-covid-timeline&parentTitle=Coronavirus%20timeline%3A%20Tracking%20the%20critical%20moments%20of%20Covid-19&parentUrl=https%3A%2F%2Fwww.nbcnews.com%2Fhealth%2Fhealth-news%2Fcoronavirus-timeline-tracking-critical-moments-covid-19-n1154341

In [1]:
#Import Dependencies

import os

import pandas as pd
import psycopg2
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine


In [2]:
# URL of the NBC News timeline page to be scraped

url = 'https://dataviz.nbcnews.com/projects/20200302-covid-timeline/index.html?initialWidth=1160&childId=embed-20200302-covid-timeline&parentTitle=Coronavirus%20timeline%3A%20Tracking%20the%20critical%20moments%20of%20Covid-19&parentUrl=https%3A%2F%2Fwww.nbcnews.com%2Fhealth%2Fhealth-news%2Fcoronavirus-timeline-tracking-critical-moments-covid-19-n1154341'

# Retrieve page with the requests module.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error (4xx/5xx)
# instead of letting the cells below parse an error page.

response = requests.get(url, timeout=30)
response.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'

soup = bs(response.text, 'html.parser')


In [3]:
#Scrape the timeline page (an nbcnews.com URL) for the key dates and collect them in a list

date_results = soup.find_all('div', class_='holder-container')

# For every 'holder-container' div, take the text of its 'vt-date' paragraph
# and append ', 2020' to it.
date_list = [
    container.find('p', class_='vt-date').text + ', 2020'
    for container in date_results
]



In [4]:
#Scrape the timeline page (an nbcnews.com URL) for the news stories and collect them in a list

news_results = soup.find_all('div', class_='holder-text')

# For every 'holder-text' div, take the text of its 'vt-contex' paragraph
# with surrounding whitespace stripped.
news_list = [
    holder.find('p', class_='vt-contex').text.strip()
    for holder in news_results
]


In [5]:
#Create dataframe with date and news story from the scraped timeline using pandas
#Clean up Dates

Timeline_df = pd.DataFrame({"Date": date_list, "News Event": news_list})

# Hard-code the first two dates. Row 0 falls in 2019, so the ', 2020' suffix
# appended to every date during scraping is wrong for it.
Timeline_df.loc[[0], ['Date']] = "Dec. 31, 2019"
Timeline_df.loc[[1], ['Date']] = "Jan. 7, 2020"

# Strip the periods from month abbreviations such as "Dec.".
# regex=False makes "." an explicit literal: the default value of `regex`
# differs between pandas versions, and as a regular expression "." would
# match every character.
Timeline_df["Date"] = Timeline_df["Date"].str.replace(".", "", regex=False)
Timeline_df



Unnamed: 0,Date,News Event
0,"Dec 31, 2019",Chinese officials in Wuhan in China's central ...
1,"Jan 7, 2020",The outbreak was identified as a new coronavirus.
2,"Jan 11, 2020",China reported its first known death from an i...
3,"Jan 20, 2020",A World Health Organization situation report d...
4,"Jan 21, 2020",The United States announced its first confirme...
...,...,...
983,"Dec 9, 2020",The United Arab Emirates became the first nati...
984,"Dec 9, 2020","The U.S. reported a record 3,103 Covid deaths,..."
985,"Dec 11, 2020",The FDA authorized Pfizer's Covid vaccine for ...
986,"Dec 14, 2020",The vaccine rollout began across the United St...


In [6]:
#Change Date into datetime format

# Parse the cleaned date strings (e.g. "Dec 31, 2019") into datetimes and keep
# only the parsed date and the news text, in that column order.
# The deprecated `infer_datetime_format` argument is intentionally not passed:
# recent pandas versions ignore it and emit a warning.
Timeline_df = (
    Timeline_df
    .assign(Date_dt=pd.to_datetime(Timeline_df["Date"]))
    [['Date_dt', 'News Event']]
)
Timeline_df

Unnamed: 0,Date_dt,News Event
0,2019-12-31,Chinese officials in Wuhan in China's central ...
1,2020-01-07,The outbreak was identified as a new coronavirus.
2,2020-01-11,China reported its first known death from an i...
3,2020-01-20,A World Health Organization situation report d...
4,2020-01-21,The United States announced its first confirme...
...,...,...
983,2020-12-09,The United Arab Emirates became the first nati...
984,2020-12-09,"The U.S. reported a record 3,103 Covid deaths,..."
985,2020-12-11,The FDA authorized Pfizer's Covid vaccine for ...
986,2020-12-14,The vaccine rollout began across the United St...


In [7]:
#Create engine

# Read the Postgres connection URL from the environment instead of embedding
# the username and password in the notebook.
# Expected form: postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
# NOTE(review): a password was previously committed in this cell; it should be
# rotated, since it remains visible in version-control history.
database_url = os.environ.get("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to the Postgres connection "
        "URL before running this cell."
    )

engine = create_engine(database_url)


In [9]:
#Load the timeline dataframe into the Postgres database

# Writes to table 'cnn_news' in schema 'public' through `engine`.
# if_exists='replace' replaces an existing table of that name, index=True
# stores the dataframe index as a column, and method='multi' passes multiple
# rows per INSERT statement.
Timeline_df.to_sql(
    name='cnn_news',
    con=engine,
    schema='public',
    if_exists='replace',
    index=True,
    method='multi',
)