Import packages...

In [34]:
# check python path  
# import sys
# print(sys.executable)

# import packages 
import datetime
import os
import re

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests

**Making plots of coronavirus data**

In [74]:
# Download the three Johns Hopkins CSSE time-series tables (confirmed,
# recovered, deaths) straight from GitHub, in that order.
# NOTE(review): these are live URLs on the repository's master branch, so the
# notebook needs network access and the files must still exist upstream.
data, recovered_data, deaths_data = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv',
    )
)

# Zero-pad single-digit middle fields of the date labels so the numbering aligns
def rename_dates(df):
    """Zero-pad every ``/d/`` segment (d = 1..9) in the column labels of ``df``.

    A label such as ``2/1/20`` becomes ``2/01/20``; labels that contain no
    ``/1/`` ... ``/9/`` segment are left as they are. The frame is changed in
    place by reassigning ``df.columns``; nothing is returned.
    """
    for digit in "123456789":
        unpadded = "/" + digit + "/"
        padded = "/0" + digit + "/"
        df.columns = df.columns.str.replace(unpadded, padded)

# Normalise the date column labels of the three raw frames loaded in this cell.
# The recovered/deaths tables are bound to `recovered_data` and `deaths_data`;
# the names `recovered` and `deaths` are only created further down (as
# transposed per-country frames), so using them here fails on a fresh kernel
# or, with leftover kernel state, rewrites country labels instead of dates.
rename_dates(data)
rename_dates(recovered_data)
rename_dates(deaths_data)

In [56]:
def toTimeSeries(timeArr):
    """Turn ``month/day/yy`` strings into ``datetime.datetime`` objects.

    Each entry of ``timeArr`` is split on ``/``. The first piece is read as the
    month, the second as the day, and the third is prefixed with ``20`` to form
    a four-digit year. The result is a list in the same order as the input.
    """
    def _parse(label):
        parts = label.split('/')
        return datetime.datetime(int('20' + parts[2]), int(parts[0]), int(parts[1]))

    return [_parse(label) for label in timeArr]

In [67]:
# all_china holds every row of the confirmed-cases table whose country is "Mainland China"
# (all columns kept). all_china_data keeps only the per-date columns of those rows, which
# are taken to be everything to the right of the "Long" (longitude) column.
all_china = data[data["Country/Region"]=="Mainland China"]
first_date_col = all_china.columns.get_loc("Long") + 1
all_china_data = all_china.iloc[:, first_date_col:]

times = toTimeSeries(all_china_data.columns)

# The provinces offered in the plot: here, every province listed for Mainland China
states = all_china["Province/State"]

fig = go.Figure()
for state in states:
    # Row label of the first row whose province matches; used to pull that row's counts
    ind = states[states == state].index[0]
    # x is the date and y the confirmed count for this province. Only Hubei is drawn
    # initially; every other province is added hidden, reachable from the legend.
    trace_options = {} if state == "Hubei" else {"visible": "legendonly"}
    fig.add_trace(go.Scatter(x=times,y=all_china_data.loc[ind], mode='lines+markers',name=state, **trace_options))

fig.update_layout(xaxis_title='Date',yaxis_title='Number of Confirmed Cases',xaxis=dict(tickangle=45))
# fig.show() 

In [82]:
# Totals by country instead of by province.
# "Province/State" is dropped together with the coordinates so that only the
# numeric per-date columns are summed; newer pandas no longer discards text
# columns silently in groupby(...).sum().
countries = data.drop(columns=["Province/State","Lat","Long"]).groupby('Country/Region').sum()

# Keep only the countries whose most recent count (last date column) exceeds 100.
# The previous expression, countries[countries[:-1:] > 100], sliced off the last
# ROW and masked individual cells to NaN rather than selecting countries.
# NOTE(review): the original comment said "over 70" while the code used 100;
# the code's threshold is kept here -- confirm which was intended.
latest_counts = countries.iloc[:, -1]
outbreak_countries = countries.loc[latest_counts > 100]

# Dates become rows and countries become columns. The "Others" pseudo-country
# (things like the Diamond Princess) is removed when present.
outbreak_countries = outbreak_countries.transpose().drop(columns=["Others"], errors="ignore")

In [83]:
# Date-based view: one trace per country in outbreak_countries, restricted to the
# dates on which that country's confirmed count is above 20.
fig = go.Figure()
for country in outbreak_countries.columns:
    above_20 = outbreak_countries[country] > 20
    y = outbreak_countries.loc[above_20, country]
    t = toTimeSeries(y.index)
    fig.add_trace(go.Scatter(x=t,y=y, mode='lines+markers',name=country))
fig.update_layout(title="Time series showing when each country grows beyond 20 confirmed cases", xaxis_title='Date',yaxis_title='Number of Confirmed Cases',xaxis=dict(tickangle=45))
fig.show() 

In [84]:
# Aligned view: each country's curve starts at x = 0 on its first day above 20
# confirmed cases, so growth rates can be compared directly.

# Only US, Italy, and Iran are drawn initially (those are the ones being compared);
# every other country is added hidden and can be switched on from the legend.
show = ["US", "Italy", "Iran"]

fig = go.Figure()
for country in outbreak_countries.columns:
    above_20 = outbreak_countries[country] > 20
    y = outbreak_countries.loc[above_20, country]
    # x is a day counter (0, 1, 2, ...) over the retained rows, not a calendar date
    t = np.arange(len(y))
    trace_options = {} if country in show else {"visible": "legendonly"}
    fig.add_trace(go.Scatter(x=t,y=y, mode='lines+markers',name=country, **trace_options))
# The x axis was previously labelled 'Date' although it shows day offsets.
fig.update_layout(title="Growth after 20 confirmed cases, by country", xaxis_title='Days since passing 20 confirmed cases',yaxis_title='Number of Confirmed Cases',xaxis=dict(tickangle=45))
fig.show() 


**Look at recovered cases**

In [76]:
# Recovered cases

# Totals by country instead of by province. "Province/State" is dropped with the
# coordinates so that only the numeric per-date columns are summed; newer pandas
# no longer discards text columns silently in groupby(...).sum().
recovered = recovered_data.drop(columns=["Province/State","Lat","Long"]).groupby('Country/Region').sum()
# Dates become rows and countries become columns
recovered = recovered.transpose()

fig = go.Figure()
for country in recovered.columns:
    # Keep only the dates on which this country has more than 20 recovered cases
    above_20 = recovered[country] > 20
    y = recovered.loc[above_20, country]
    t = toTimeSeries(y.index)
    fig.add_trace(go.Scatter(x=t,y=y, mode='lines+markers',name=country))
fig.update_layout(title="Time series showing recovered cases", xaxis_title='Date',yaxis_title='Number of Recovered Cases',xaxis=dict(tickangle=45))
fig.show() 

In [92]:
# Deaths

# Totals by country instead of by province. "Province/State" is dropped with the
# coordinates so that only the numeric per-date columns are summed; newer pandas
# no longer discards text columns silently in groupby(...).sum().
deaths = deaths_data.drop(columns=["Province/State","Lat","Long"]).groupby('Country/Region').sum()
# Dates become rows and countries become columns; "Others" is removed when present
deaths = deaths.transpose().drop(columns=["Others"], errors="ignore")

fig = go.Figure()
for country in deaths.columns:
    # Keep only the dates on which this country has more than one death
    above_1 = deaths[country] > 1
    y = deaths.loc[above_1, country]
    t = toTimeSeries(y.index)
    fig.add_trace(go.Scatter(x=t,y=y, mode='lines+markers',name=country))
fig.update_layout(title="Time series showing deaths by country", xaxis_title='Date',yaxis_title='Number of Deaths',xaxis=dict(tickangle=45))
fig.show() 

In [93]:
# Compare Mainland China's confirmed cases, recovered cases, and deaths on one figure

# Trace label -> frame (dates as rows, countries as columns) to read the series from
pairs = {
    "Confirmed" : outbreak_countries,
    "Recovered": recovered,
    "Deaths": deaths
}

fig = go.Figure()
for key, frame in pairs.items():
    y = frame["Mainland China"]
    t = toTimeSeries(y.index)
    fig.add_trace(go.Scatter(x=t,y=y, mode='lines+markers',name=key))
# The title and y-axis label used to be copies of the by-country growth plot's
# ("Growth after 20 confirmed cases, by country" / "Number of Confirmed Cases"),
# which did not describe this three-series figure.
fig.update_layout(title="Mainland China: confirmed cases, recovered cases, and deaths", xaxis_title='Date',yaxis_title='Number of Cases',xaxis=dict(tickangle=45))
fig.show() 

**Save the data into a csv file**

In [32]:
def _side_by_side(series_list):
    """Outer-join a list of Series as the columns of one frame (empty list -> empty frame)."""
    return pd.concat(series_list, axis=1) if series_list else pd.DataFrame()


# Each frame below is assembled with a single concat rather than by growing a
# DataFrame inside a loop, which re-copies every column on each iteration.

# Every row of every country column in outbreak_countries, on a positional
# (0, 1, 2, ...) index.
# NOTE(review): the original comment described this frame as indexed by date, but
# the code has always discarded the dates via reset_index; that behaviour is kept.
all_data_outbreak_countries = _side_by_side(
    [outbreak_countries[country].reset_index(drop=True) for country in outbreak_countries.columns]
)

# Per country, only the rows where the count is above 20, still labelled by date
above_20_by_country = [
    outbreak_countries.loc[outbreak_countries[country] > 20, country]
    for country in outbreak_countries.columns
]

# Indexed by date; a cell is blank (NaN) on dates where that country is not above 20
outbreak_countries_date_only_above_20 = _side_by_side(above_20_by_country)

# Indexed from day 0, where day 0 is each country's first retained row (count above 20)
outbreak_countries_index_above_20 = _side_by_side(
    [series.reset_index(drop=True) for series in above_20_by_country]
)



In [33]:
# Write each of the three frames to its own CSV file in the working directory

csv_outputs = [
    ('countries_over_20_with_dates.csv', outbreak_countries_date_only_above_20),
    ('countries_over_20.csv', outbreak_countries_index_above_20),
    ('countries_over_70_all_data.csv', all_data_outbreak_countries),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name)

**Using the NY Times API to get top articles**

In [None]:
# Get the top news articles matching the "coronavirus" query from the
# NYT Article Search API.
#
# The API key is read from the NYT_API_KEY environment variable instead of being
# embedded in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# in the notebook and should be revoked/rotated.
api_key = os.environ.get("NYT_API_KEY")
if not api_key:
    raise RuntimeError("Set the NYT_API_KEY environment variable to your NYT API key")

url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
params = {
    "begin_date": "20191201",
    # a list value is sent as repeated fl=... parameters, as in the original URL
    "fl": ["abstract", "web_url", "headline"],
    "q": "coronavirus",
    "sort": "relevance",
    "api-key": api_key,
}

# One request is enough (the original issued the identical request twice).
# Fail loudly on HTTP errors rather than on a confusing KeyError further down.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
r1 = response.json()['response']['docs']

# Example of how to access the headline, abstract, and url of one result
if len(r1) > 2:
    print(r1[2]['headline']['main'])
    print(r1[2]['abstract'])
    print(r1[2]['web_url'])

# Flatten every returned document into a small dict
articles = [
    {
        'headline': doc['headline']['main'],
        'abstract': doc['abstract'],
        'url': doc['web_url']
    }
    for doc in r1
]
print(articles)