In [1]:
%pylab inline
import altair as alt
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

alt.data_transformers.disable_max_rows()

def days_between(d1, d2):
    if (isinstance(d1, str) and isinstance(d2, str)):
        d1 = datetime.strptime(d1, "%m-%d-%Y")
        d2 = datetime.strptime(d2, "%m-%d-%Y")
        return (d2 - d1).days
    return None;

Populating the interactive namespace from numpy and matplotlib


In [2]:
total_cases_df       = pd.read_csv("../91-DIVOC/pages/covid-visualization/jhu-data.csv",index_col=0)
empty_rows           = total_cases_df[(total_cases_df["Confirmed"]==0)&(total_cases_df["Recovered"]==0)&(total_cases_df["Active"]==0)&(total_cases_df["Deaths"]==0)].index
total_cases_df       = total_cases_df.drop(empty_rows, axis=0)

# # Tabulate the top K countries with the highest total number of Confirmed cases
# K              = 10  
# topKcountries  = list(total_cases_df.groupby("Country_Region").sum()["Confirmed"].nlargest(20).index)
# cases_df       = total_cases_df[total_cases_df["Country_Region"].isin(topKcountries)]
cases_df = total_cases_df
# cases_df = total_cases_df
# Compute days since N = 50 confirmed cases; drop those with no N confirmed cases
N              = 50
days_since_N                   = cases_df[cases_df["Confirmed"]>N].groupby("Country_Region")["Date"].min().to_dict()
cases_df['Date N Confirmed']   = cases_df.apply  (lambda x: days_since_N.get(x['Country_Region']),axis=1)
cases_df                       = cases_df.dropna (subset=['Date N Confirmed'])
cases_df['Days from N']        = cases_df.apply  (lambda x: days_between(x["Date N Confirmed"], x["Date"]), axis = 1)
cases_df

Unnamed: 0,Country_Region,Province_State,Confirmed,Recovered,Active,Deaths,Date,Date N Confirmed,Days from N
34,United States,Washington,1.0,0.0,1.0,0.0,01-22-2020,03-03-2020,-41
3,China,,547.0,28.0,502.0,17.0,01-22-2020,01-22-2020,0
4,South Korea,,1.0,0.0,1.0,0.0,01-22-2020,02-20-2020,-29
5,Taiwan,,1.0,0.0,1.0,0.0,01-22-2020,03-14-2020,-52
6,Thailand,,2.0,0.0,2.0,0.0,01-22-2020,03-16-2020,-54
...,...,...,...,...,...,...,...,...,...
150,Thailand,,721.0,52.0,668.0,1.0,03-23-2020,03-16-2020,7
155,Turkey,,1529.0,0.0,1492.0,37.0,03-23-2020,03-20-2020,3
156,United States,,43618.0,0.0,0.0,552.0,03-23-2020,03-03-2020,20
159,United Arab Emirates,,198.0,41.0,155.0,2.0,03-23-2020,03-10-2020,13


In [3]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(point=True, size=3).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(100,100000))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
)
chart.interactive()

The US data looks jagged-ey, there's probably something off.

In [4]:
# The overall US trends are the ones with Province_State is NaN
overall_us_df = cases_df[(cases_df["Country_Region"]=="United States")&(cases_df["Province_State"].isna())]
cases_df = cases_df[cases_df["Country_Region"]!="United States"] #drop everything from the US
cases_df = pd.concat([overall_us_df,cases_df]) #combine the US overall df with the df without the US 
cases_df = cases_df.drop(columns="Province_State")

In [5]:
cases_df

Unnamed: 0,Country_Region,Confirmed,Recovered,Active,Deaths,Date,Date N Confirmed,Days from N
14,United States,1.0,0.0,1.0,0.0,01-23-2020,03-03-2020,-40
12,United States,2.0,0.0,2.0,0.0,01-25-2020,03-03-2020,-38
13,United States,5.0,0.0,5.0,0.0,01-26-2020,03-03-2020,-37
16,United States,5.0,0.0,5.0,0.0,01-27-2020,03-03-2020,-36
16,United States,5.0,0.0,5.0,0.0,01-28-2020,03-03-2020,-35
...,...,...,...,...,...,...,...,...
146,Switzerland,8795.0,131.0,8544.0,120.0,03-23-2020,03-03-2020,20
150,Thailand,721.0,52.0,668.0,1.0,03-23-2020,03-16-2020,7
155,Turkey,1529.0,0.0,1492.0,37.0,03-23-2020,03-20-2020,3
159,United Arab Emirates,198.0,41.0,155.0,2.0,03-23-2020,03-10-2020,13


In [6]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(point=True, size=3).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(100,100000))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
)
chart.interactive()

### Normalizing by total number of cases to look at just the shape

In [7]:
maxCasesByCountry = cases_df.groupby("Country_Region").max()["Confirmed"]
maxCasesByCountry.name = "Max Confirmed"

cases_df = cases_df.merge(maxCasesByCountry,on="Country_Region")

cases_df["Normalized Confirmed"] = cases_df["Confirmed"]/cases_df["Max Confirmed"]

In [8]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(point=True, size=3).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Normalized Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(0.001,2))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
)
chart.interactive()

In [9]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(size=2).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Normalized Confirmed",aggregate="sum",title="Normalized Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(0.001,1.1))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
).configure_mark(
    opacity=0.2,
    color='blue'
)
chart.interactive()

Hard to draw any clusters or patterns

### Normalize by x by doing linear interpolation

In [10]:
maxDaysByCountry = cases_df.groupby("Country_Region").max()["Days from N"]
maxDaysByCountry.name = "Max Days"

cases_df = cases_df.merge(maxDaysByCountry,on="Country_Region")

cases_df["Normalized Days from N"] = cases_df["Days from N"]/cases_df["Max Days"]

In [11]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(point=True, size=3).encode(
        x   = alt.X("Normalized Days from N",scale=alt.Scale(domain=(0,1)), title = "Normalized Days Since First 50 Confirmed"),
        y   = alt.Y("Normalized Confirmed",aggregate="sum",title="Normalized Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(0.001,1.1))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
)
chart.interactive()

In [12]:
chart = alt.Chart(cases_df, width=500, height=500).mark_line(size=2).encode(
        x   = alt.X("Normalized Days from N",scale=alt.Scale(domain=(0,1)), title = "Normalized Days Since First 50 Confirmed"),
        y   = alt.Y("Normalized Confirmed",aggregate="sum",title="Normalized Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(0.001,1.1))),
    color   = alt.Color("Country_Region"),
    tooltip =["Country_Region"]
).configure_mark(
    opacity=0.2,
    color='blue'
)
chart.interactive()

Normalizing by x is effectively doing linear interpolation. There isn't really any clear clusters showing up visually.

# Linear Interpolation 

In [13]:
intrp_df = cases_df.groupby("Country_Region").apply(lambda x: np.interp(np.linspace(0,1,100),x["Days from N"],x["Normalized Confirmed"]))

In [14]:
all_tseries = intrp_df.values

In [15]:
all_tseries = np.array([list(ts) for ts in all_tseries])

In [16]:
intrp_df

Country_Region
Andorra           [0.39849624060150374, 0.40016708437761067, 0.4...
Argentina         [0.21052631578947367, 0.21098200045568466, 0.2...
Armenia           [0.45217391304347826, 0.45445761967501097, 0.4...
Australia         [0.40625, 0.40648674242424243, 0.4067234848484...
Austria           [0.5288461538461539, 0.5311771561771562, 0.533...
                                        ...                        
United Kingdom    [0.008883469778784184, 0.008943291124095863, 0...
United States     [0.0017653262414599477, 0.0017767893988720252,...
Uruguay           [0.5851851851851851, 0.5863075196408529, 0.587...
Venezuela         [0.9090909090909091, 0.9090909090909091, 0.909...
Vietnam           [0.4690265486725664, 0.46929471708232773, 0.46...
Length: 98, dtype: object

In [17]:
# plt.figure()
# for country in intrp_df.iteritems():
#     plt.plot(country[1],color="blue",alpha=0.2)
#     plt.yscale("log")

In [18]:
countryIdx = intrp_df.index

In [19]:
shape(all_tseries)

(98, 100)

# Clustering 

In [20]:
from sklearn.cluster import KMeans

In [21]:
y_pred = KMeans(n_clusters=4, random_state=999).fit_predict(all_tseries)

In [22]:
countryCluster = pd.DataFrame(list(zip(*[list(countryIdx), list(y_pred)])),columns=["Country_Region","Cluster"])

In [23]:
cases_df = cases_df.merge(countryCluster,on= "Country_Region")

In [24]:
chart = alt.Chart(cases_df, width=300, height=100).mark_line(size=2).encode(
        x   = alt.X("Normalized Days from N",scale=alt.Scale(domain=(0,1)), title = "Normalized Days Since First 50 Confirmed"),
        y   = alt.Y("Normalized Confirmed",aggregate="sum",title="Normalized Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(0.001,1.1))),
    color   = alt.Color("Country_Region"),
    row = "Cluster",
    tooltip =["Country_Region"]
).configure_mark(
    opacity=0.2,
    color='blue'
)
chart.interactive()

In [25]:
chart = alt.Chart(cases_df, width=300, height=500).mark_line(size=2).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(100,100000))),
    color   = alt.Color("Country_Region"),
    column = "Cluster",
    tooltip =["Country_Region"]
).configure_mark(
    opacity=0.5,
    color='blue'
)
chart.interactive()

In [29]:
chart = alt.Chart(cases_df, width=300, height=500).mark_line(size=2).encode(
        x   = alt.X("Days from N",scale=alt.Scale(domain=(0,40)), title = "Days Since First 50 Confirmed"),
        y   = alt.Y("Confirmed",aggregate="sum",title="Total Confirmed Cases (Log)", scale=alt.Scale(type='log',domain=(100,100000))),
    color   = alt.Color("Cluster:N"),
    detail = "Country_Region",
    tooltip =["Country_Region"]
)
chart.interactive()

In [27]:
cases_df.Cluster

0       1
1       1
2       1
3       1
4       1
       ..
2078    0
2079    0
2080    0
2081    0
2082    0
Name: Cluster, Length: 2083, dtype: int64