# Coronavirus Data Analysis

In [1]:
import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height) applied to the plots created after this call.
sbn.set(rc={"figure.figsize": (11.7, 8.27)})


In [2]:
# County-level CSV published in the New York Times covid-19-data repository.
# The rest of the notebook reads its date, state, county, cases and deaths columns.
DATA_URL = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
rawData = pd.read_csv(DATA_URL)

In [3]:
# Cache of aggregated frames. "states" and "counties" map a name to its
# per-date frame; a "total" entry is added on first use by getTotal().
dataFrames = {"states": {}, "counties": {}}
# Columns kept in every aggregated frame.
columns = ["cases", "deaths"]

def getTotal():
    """Return the per-date sums of `columns` over all rows of rawData.

    The frame is indexed by date. It is computed on the first call and cached
    under dataFrames["total"], so later calls return the same object.
    """
    if "total" not in dataFrames:
        # Select the wanted columns *before* aggregating. Summing the whole
        # frame also aggregates the string columns (county, state), which is
        # wasted work and, depending on the pandas version, may warn.
        dataFrames["total"] = rawData.groupby('date')[columns].sum()
    return dataFrames["total"]

def getState(state):
    """Return the per-date sums of `columns` for one state, cached by name.

    The comparison against the `state` column is exact, so spelling and
    capitalization matter. When no row matches, a hint is printed and None is
    returned (nothing is cached in that case).
    """
    if state not in dataFrames["states"]:
        # Select the wanted columns before summing so the string columns
        # (county, state) are never aggregated.
        statedf = rawData[rawData.state == state].groupby('date')[columns].sum()
        if statedf.empty:
            print (state + " is not in the data, did you spell it right? capitalization?")
            return None
        dataFrames["states"][state] = statedf
    return dataFrames["states"][state]

def getCounty(county, state=None):
    """Return the per-date sums of `columns` for one county, cached.

    county: exact county name (spelling and capitalization matter).
    state:  optional exact state name. Without it, every row whose county
            name matches is summed, which merges same-named counties from
            different states; pass `state` to select a single one.

    When no row matches, a hint is printed and None is returned (nothing is
    cached in that case).
    """
    # Keep the plain county name as the cache key for the original call form,
    # and use a (county, state) tuple for the disambiguated form.
    key = county if state is None else (county, state)
    if key not in dataFrames["counties"]:
        rows = rawData[rawData.county == county]
        if state is not None:
            rows = rows[rows.state == state]
        # Select the wanted columns before summing so the string columns
        # (county, state) are never aggregated.
        countydf = rows.groupby('date')[columns].sum()
        if countydf.empty:
            label = county if state is None else f"{county}, {state}"
            print(label + " is not in the data, did you spell it right? capitalization?")
            return None
        dataFrames["counties"][key] = countydf
    return dataFrames["counties"][key]
        

In [4]:
# The totals frame is indexed by date; its last two index entries are the two
# most recent dates in the download and show how fresh the data is.
yesterday, today = getTotal().index.values[-2:]
print("Latest date: ", today)

Latest date:  2020-04-18


In [None]:

# Per-(state, county) sums of `columns` on each of the two most recent dates.
yesterdayLatest = rawData[rawData.date == yesterday].groupby(['state', 'county'])[columns].sum()
latest = rawData[rawData.date == today].groupby(['state', 'county'])[columns].sum()
# A county that is first reported today has no row for yesterday. Treat its
# previous counts as 0 so its new cases/deaths equal its totals instead of
# becoming NaN (which would also turn both new* columns into floats).
deltas = latest - yesterdayLatest.reindex(latest.index, fill_value=0)
latest["newcases"] = deltas.cases
latest["newdeaths"] = deltas.deaths
# Turn state/county back into ordinary columns for filtering and display.
latest = latest.reset_index()

In [None]:
def worstPlaces(state=None, count=None):
    """Display the counties with the most new cases and the most new deaths.

    state: exact state name to restrict the ranking to; None ranks every
           county in `latest`.
    count: number of rows per table. When omitted it is 30 for the
           nationwide ranking and 15 for a single state (the values the
           original code used). An explicit value is now honored in both
           cases; previously it was ignored for the nationwide ranking.

    Two tables are shown: one sorted by new cases (ties broken by new
    deaths) and one sorted by new deaths (ties broken by new cases).
    """
    if count is None:
        count = 30 if state is None else 15
    if state is None:
        region, rows = "US", latest
    else:
        region, rows = state, latest[latest.state == state]
    rankings = (
        ("cases", ['newcases', 'newdeaths']),
        ("deaths", ['newdeaths', 'newcases']),
    )
    for label, order in rankings:
        print(f"Counties with the most new {label} in {region} for {today}:")
        display(rows.sort_values(order, ascending=False)[:count].reset_index(drop=True))


In [None]:
# Nationwide ranking: no state filter.
worstPlaces(state=None)

Counties with the most new cases in US for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,New York,New York City,131273,8632,3921.0,0.0
1,New York,Suffolk,26143,706,1108.0,0.0
2,Illinois,Cook,20395,860,1004.0,100.0
3,New York,Westchester,23179,738,703.0,0.0
4,New York,Nassau,29180,1356,641.0,0.0
5,California,Los Angeles,12021,576,630.0,81.0
6,Ohio,Marion,983,1,555.0,0.0
7,Massachusetts,Middlesex,8297,334,553.0,40.0
8,New Jersey,Union,8959,356,530.0,26.0
9,Pennsylvania,Philadelphia,9014,342,451.0,44.0


Counties with the most new deaths in US for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,New York,Unknown,0,1170,0.0,540.0
1,Illinois,Cook,20395,860,1004.0,100.0
2,California,Los Angeles,12021,576,630.0,81.0
3,New Jersey,Essex,9901,732,229.0,48.0
4,Pennsylvania,Philadelphia,9014,342,451.0,44.0
5,Massachusetts,Middlesex,8297,334,553.0,40.0
6,New Jersey,Bergen,12163,741,300.0,27.0
7,New Jersey,Union,8959,356,530.0,26.0
8,Michigan,Wayne,13471,1070,238.0,26.0
9,Maryland,Unknown,0,96,0.0,25.0


In [None]:
# Ranking restricted to the counties of one state.
worstPlaces(state="California")

Counties with the most new cases in California for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,California,Los Angeles,12021,576,630.0,81.0
1,California,Riverside,2602,74,145.0,5.0
2,California,San Bernardino,1219,57,123.0,2.0
3,California,San Francisco,1140,20,79.0,0.0
4,California,Kern,619,3,66.0,0.0
5,California,Orange,1556,32,55.0,4.0
6,California,San Diego,2213,71,55.0,1.0
7,California,Alameda,1135,41,51.0,1.0
8,California,San Mateo,838,28,41.0,0.0
9,California,Contra Costa,685,19,37.0,0.0


Counties with the most new deaths in California for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,California,Los Angeles,12021,576,630.0,81.0
1,California,Riverside,2602,74,145.0,5.0
2,California,Orange,1556,32,55.0,4.0
3,California,San Bernardino,1219,57,123.0,2.0
4,California,San Diego,2213,71,55.0,1.0
5,California,Alameda,1135,41,51.0,1.0
6,California,Sacramento,914,33,18.0,1.0
7,California,Santa Barbara,385,4,12.0,1.0
8,California,San Francisco,1140,20,79.0,0.0
9,California,Kern,619,3,66.0,0.0


In [None]:
# Ranking restricted to the counties of one state.
worstPlaces(state="Colorado")

Counties with the most new cases in Colorado for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,Colorado,Denver,1723,74,87.0,8.0
1,Colorado,Arapahoe,1498,62,79.0,4.0
2,Colorado,Weld,1029,61,43.0,0.0
3,Colorado,Jefferson,930,36,42.0,1.0
4,Colorado,Adams,894,33,34.0,2.0
5,Colorado,Boulder,342,17,15.0,2.0
6,Colorado,El Paso,721,49,13.0,0.0
7,Colorado,Morgan,147,2,12.0,0.0
8,Colorado,Larimer,231,12,7.0,2.0
9,Colorado,Douglas,360,15,7.0,0.0


Counties with the most new deaths in Colorado for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,Colorado,Denver,1723,74,87.0,8.0
1,Colorado,Arapahoe,1498,62,79.0,4.0
2,Colorado,Adams,894,33,34.0,2.0
3,Colorado,Boulder,342,17,15.0,2.0
4,Colorado,Larimer,231,12,7.0,2.0
5,Colorado,Jefferson,930,36,42.0,1.0
6,Colorado,Routt,49,2,3.0,1.0
7,Colorado,Weld,1029,61,43.0,0.0
8,Colorado,El Paso,721,49,13.0,0.0
9,Colorado,Morgan,147,2,12.0,0.0


In [None]:
# Ranking restricted to the counties of one state.
worstPlaces(state="Arizona")

Counties with the most new cases in Arizona for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,Arizona,Maricopa,2491,70,87.0,1.0
1,Arizona,Pima,856,56,37.0,5.0
2,Arizona,Apache,169,4,28.0,0.0
3,Arizona,Pinal,235,6,23.0,1.0
4,Arizona,Navajo,452,11,17.0,0.0
5,Arizona,Coconino,314,29,10.0,0.0
6,Arizona,Yavapai,72,1,3.0,0.0
7,Arizona,Cochise,22,0,2.0,0.0
8,Arizona,Gila,7,0,2.0,0.0
9,Arizona,Mohave,52,2,1.0,0.0


Counties with the most new deaths in Arizona for 2020-04-18:


Unnamed: 0,state,county,cases,deaths,newcases,newdeaths
0,Arizona,Pima,856,56,37.0,5.0
1,Arizona,Maricopa,2491,70,87.0,1.0
2,Arizona,Pinal,235,6,23.0,1.0
3,Arizona,Apache,169,4,28.0,0.0
4,Arizona,Navajo,452,11,17.0,0.0
5,Arizona,Coconino,314,29,10.0,0.0
6,Arizona,Yavapai,72,1,3.0,0.0
7,Arizona,Cochise,22,0,2.0,0.0
8,Arizona,Gila,7,0,2.0,0.0
9,Arizona,Mohave,52,2,1.0,0.0


In [None]:
def fitExponential(series):
    """Fit ``value = base * exp(rate * day)`` to the positive entries of `series`.

    Null and non-positive entries are discarded first; the remaining values
    are numbered 0..n-1 in their existing order and a straight line is
    fitted to their natural logarithms.

    Returns the tuple (rate, base, n): the slope of the log-linear fit, the
    fitted value at day 0, and the number of points used.
    """
    usable = series.dropna()
    usable = usable[usable > 0]
    n = len(usable)
    days = np.arange(n)
    slope, logBase = np.polyfit(days, np.log(usable), 1)
    return (slope, np.exp(logBase), n)


In [None]:
def plotFitted(series, name, log=True):
    """Scatter the observed values and overlay the fitted exponential curve.

    series: values to fit; null and non-positive entries are ignored.
    name:   prefix used in both legend labels.
    log:    when true, the y axis is logarithmic.

    The x axis is the 0-based position of each point among the values that
    were used for the fit. The figure is shown before returning.
    """
    (rate, base, n) = fitExponential(series)
    x = np.arange(n)
    # Plot exactly the points that went into the fit. Taking the last n
    # entries instead would pick the wrong points whenever a dropped value
    # is not at the start of the series.
    series = series.dropna()
    series = series[series > 0]
    # Set the figure size before the first pyplot call creates the figure.
    sbn.set(rc={'figure.figsize':(11.7,8.27)})
    if log:
        plt.semilogy()
    # seaborn >= 0.12 only accepts `data` positionally, so x and y are
    # passed by keyword.
    sbn.scatterplot(x=x, y=series, label=f"{name} actual")
    sbn.lineplot(x=x, y=base * np.exp(rate*x), label=f"{name} expected with constant exponential growth")
    plt.show()

In [None]:
# Fit and plot the last 30 rows of the nationwide case totals.
plotFitted(getTotal()["cases"].tail(30), name="USA Total")

In [None]:
def growthRateAnalysis(series, name, n=30):
    """Print and plot fitted and observed growth rates for a cumulative series.

    series: named Series of cumulative counts (its `.name` is used in labels).
    name:   region label used in the printed text and plot legends.
    n:      number of trailing points considered for the exponential fit.

    Steps: fit an exponential to the last `n` points and print the formula,
    daily growth and doubling time; compute the observed day-to-day growth
    with 3 and 5 day moving averages, print the latest 5 day average and
    plot all three; finally plot the fit against the data via plotFitted.
    """
    # fitExponential reports how many points it actually used, which can be
    # fewer than n; that count drives the rest of the function.
    rate, base, fitted = fitExponential(series[-n:])
    name = f"{name} {series.name}"
    formula = f"{series.name} = {np.round(base, 2)} * exp({np.round(rate, 4)} * day)"
    dailyGrowth = np.round(100 * (np.exp(rate) - 1), 2)
    daysToDoubling = np.round(np.log(2)/rate, 2)
    print(f"{name} exponential formula fitted with {fitted} points:")
    print(formula)
    print(f"fitted daily growth rate: {dailyGrowth} %, fitted days to doubling: {daysToDoubling}")
    # Observed growth relative to the PREVIOUS day's value, the same
    # definition as the fitted rate above (exp(rate) - 1). Dividing by the
    # current value, as before, understates the growth.
    rates = 100 * series.diff(1) / series.shift(1)
    # A previous value of 0 yields +/-inf; treat it as missing so it is
    # dropped together with the other incomplete rows.
    rates = rates.replace([np.inf, -np.inf], np.nan)
    data = {
        f"{name} day to day growth rate (%)":rates,
        f"{name} 3 day moving average":rates.rolling(window=3).mean(),
        f"{name} 5 day moving average":rates.rolling(window=5).mean(),
    }
    df = pd.DataFrame(data).dropna()[-fitted:]
    if df.empty:
        # Too few usable points for a 5 day average; previously an IndexError.
        print(f"{name}: not enough data for a 5 day moving average")
    else:
        gr = df.tail(1)[f"{name} 5 day moving average"].values[0]
        dailyGrowth = np.round(gr, 2)
        # Exact doubling time for a constant daily growth of gr percent.
        daysToDoubling = np.round(np.log(2) / np.log1p(gr / 100), 2)
        print(f"actual daily growth rate: {dailyGrowth} %, actual days to doubling: {daysToDoubling}")
        sbn.lineplot(data=df)
        plt.show()
    plotFitted(series[-fitted:], name)


In [None]:
# Growth-rate report for the nationwide case totals.
growthRateAnalysis(series=getTotal()["cases"], name="Total USA")

In [None]:
def trajectoryPlot(series, name, days=5, points=30):
    """Add a log-log line of recent increase versus running total.

    series: named Series of cumulative counts (its `.name` is used in labels).
    name:   region label used in the legend.
    days:   span of the difference; y is the increase over this many rows.
    points: number of trailing rows plotted.

    The line is drawn on the current axes and the figure is NOT shown, so
    several regions can be overlaid before the caller calls plt.show().
    """
    deltas = series.diff(days)
    fullname = f"{series.name} in {name}"
    x = f"total {series.name}"
    y = f"new {series.name} since {days} days ago"
    df = pd.DataFrame({x: series, y: deltas}).dropna()[-points:]
    plt.loglog()
    # seaborn >= 0.12 only accepts `data` positionally, so x and y are
    # passed by keyword.
    sbn.lineplot(x=x, y=y, data=df, label=fullname)
    
# One figure per metric, overlaying the country, one state and one county.
for metric in ("cases", "deaths"):
    trajectoryPlot(getTotal()[metric], "USA")
    trajectoryPlot(getState("California")[metric], "CA")
    trajectoryPlot(getCounty("San Francisco")[metric], "SF")
    plt.show()

In [None]:
def laggedPlot(df, name):
    """Explore how deaths relate to case counts from earlier days.

    df:   frame with `cases` and `deaths` columns (one row per date).
    name: region label used in the printed headings and legends.

    Three figures are shown:
      1. the correlation of log(deaths) with log(cases shifted by 0..29 rows),
         raw and as 3/5/7-lag moving averages;
      2. deaths next to the case counts at a few "interesting" lags
         (7, 14 and the lag with the highest value of each correlation curve);
      3. the deaths / lagged-cases ratio for the same lags.
    Only the last 20 rows with deaths > 0 are used.
    """
    # Filter on the cleaned frame itself (the mask and frame share an index)
    # and take an explicit copy, so adding the lag columns below does not
    # write into a slice of `df`.
    clean = df.dropna()
    vals = clean[clean.cases > 0].copy()
    # show correlation with wide range of time lags
    for i in range(30):
        # Column i holds the case count from i rows earlier, aligned on the index.
        vals[i] = df.cases.shift(i)
    vals = vals.drop(columns='cases')
    vals = vals[vals.deaths > 0][-20:]
    # Correlation of log(deaths) with each log(lagged cases) column; the
    # result is indexed by lag (0..29) with a single 'deaths' column.
    lc = np.log(vals).corr()[['deaths']].drop('deaths')
    lc[0] = lc.deaths
    for i in [3, 5, 7]:
        lc[i] = lc.deaths.rolling(i).mean()
    # Rows with any missing average are dropped, which removes the first
    # lags (the widest window needs 7 values).
    lc = lc.dropna()
    corrdata = {
        f"{name} correlation of log of deaths to log of cases by # of days ago": lc[0],
        f"{name} correlation 3 day moving average": lc[3],
        f"{name} correlation 5 day moving average": lc[5],
        f"{name} correlation 7 day moving average": lc[7],
    }
    corrsdf = pd.DataFrame(corrdata)
    print(f"Correlations for {name}:")
    sbn.lineplot(data=corrsdf)
    plt.show()
    # dig deeper into an interesting set of time lags
    values = vals[["deaths"]].copy()
    ratios = vals[["deaths"]].copy()
    interesting = [7, 14]
    for i in [0, 3, 5, 7]:
        curve = lc[i].dropna()
        best = curve[curve == curve.max()] # max at this correlation moving average
        interesting += list(best.index)
    for i in sorted(set(interesting)):
        values[f"{name} cases {i} days ago"] = vals[i]
        ratios[f"{name} deaths/cases {i} days ago"] = vals.deaths/vals[i]
    ratios = ratios.drop(columns=['deaths'])
    print(f"Deaths vs Lagged Values For {name}:")
    plt.semilogy()
    sbn.lineplot(data=values, dashes=False)
    plt.show()
    print(f"Deaths / Lagged Values For {name}:")
    plt.semilogy()
    sbn.lineplot(data=ratios, dashes=False)
    plt.show()

    

In [None]:
# laggedPlot(getTotal(), "USA")
# laggedPlot(getState("California"), "CA")
# laggedPlot(getCounty("San Francisco"), "SF")

In [None]:
def analyse(counties=None, states=None, n=30):
    """Run the full report for the given counties and states plus the USA total.

    counties: iterable of county names passed to getCounty (default: none).
    states:   iterable of state names passed to getState (default: none).
    n:        number of trailing points used by growthRateAnalysis. The
              parameter used to be accepted but never forwarded, so the
              report always ran with growthRateAnalysis' own default of 30;
              the default here is 30 so existing calls produce the same
              report now that the value is honored.

    Names that the getters cannot find are skipped (the getters print a hint
    and return None) instead of aborting the whole report.
    """
    # None defaults avoid sharing one mutable list between calls.
    counties = list(counties or [])
    states = list(states or [])
    sbn.set(rc={'figure.figsize':(11.7,8.27)})
    names = counties + states + ["USA total:"]
    dfs = [getCounty(c) for c in counties] + [getState(s) for s in states] + [getTotal()]
    regions = [(df, name) for df, name in zip(dfs, names) if df is not None]
    print("Trajectory Plots to see if the curve is flattening:")
    for col in ["cases", "deaths"]:
        # One figure per metric with every region overlaid.
        for df, name in regions:
            trajectoryPlot(df[col], name)
        plt.show()
    for df, name in regions:
        print(f"\n### Analysis for {name:} ###\n")
        growthRateAnalysis(df["cases"], name, n=n)
        growthRateAnalysis(df["deaths"], name, n=n)
        print(f"Trajectory plots for {name}, to see when the curve starts flattening")
        trajectoryPlot(df.cases, name)
        trajectoryPlot(df.deaths, name)
        plt.show()
#         laggedPlot(df, name)


        
    
    

In [None]:
# Regions covered by the full report; analyse() adds the nationwide totals itself.
counties = [
    "San Francisco",
    "Boulder",
    "Maricopa",
    "Denver",
    "Los Angeles",
]
states = [
    "California",
    "Colorado",
    "Arizona",
]
analyse(counties=counties, states=states)