In [17]:
import urllib.request
import shutil
import csv
import os
import pandas as pd
from datetime import datetime
import tea

In [18]:
# Build today's date stamp and the local file names derived from it.
daily_url = 'https://covidtracking.com/api/states/daily.csv'
today = datetime.today().strftime('%Y%m%d')   # local date as YYYYMMDD, e.g. '20200326'
daily_local = f'states_daily_{today}.csv'               # raw download
daily_local_clean = f'states_daily_{today}_clean.csv'   # filtered copy written by the cleaning cell
print(today)
print(daily_local)
print(daily_local_clean)

# Download and save data locally, streaming the response body straight to disk.
# https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
# A timeout is passed so that a stalled connection raises an error instead of
# blocking the kernel indefinitely (urlopen otherwise uses the global default,
# which normally means "no timeout").
with urllib.request.urlopen(daily_url, timeout=60) as response, open(daily_local, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

20200326
states_daily_20200326.csv
states_daily_20200326_clean.csv


In [19]:
# Data cleaning/wrangling.

# Analyze only the latest data
df = pd.read_csv(daily_local)

# `today` is a string ('YYYYMMDD'), whereas the all-digit `date` column is parsed
# numerically by read_csv. Compare numerically and explicitly rather than relying
# on `isin` to coerce between str and int, which can silently match no rows.
is_today = pd.to_numeric(df['date'], errors='coerce') == int(today)
df = df.loc[is_today]

# An empty selection most likely means today's rows are not in the download yet;
# say so explicitly instead of handing Tea an empty file without explanation.
if df.empty:
    print(f"WARNING: no rows dated {today} found in {daily_local}; the cleaned file will be empty.")

print(f"Dataset that we are analyzing contains data from {df.shape[0]} states and territories:\n{df}")

# Persist the filtered rows; this is the file the Tea cells load.
df.to_csv(daily_local_clean)

Dataset that we are analyzing contains data from 56 states and territories:
        date state  positive  negative  pending  hospitalized  death   total  \
0   20200326    AK      59.0    1801.0      NaN           3.0    1.0    1860   
1   20200326    AL     506.0    3593.0      NaN           NaN    1.0    4099   
2   20200326    AR     335.0    1504.0      0.0          41.0    3.0    1839   
3   20200326    AS       0.0       NaN      NaN           NaN    0.0       0   
4   20200326    AZ     577.0     347.0     33.0          66.0    8.0     957   
5   20200326    CA    3006.0   17380.0  57400.0           NaN   65.0   77786   
6   20200326    CO    1086.0    6978.0      NaN         148.0   19.0    8064   
7   20200326    CT    1012.0    5625.0      NaN         125.0   21.0    6637   
8   20200326    DC     231.0    1626.0      1.0           NaN    3.0    1858   
9   20200326    DE     130.0      36.0      NaN          13.0    1.0     166   
10  20200326    FL    2355.0   23741.0   144

In [20]:
# Tea: Load data from the cleaned local copy (the CSV named by `daily_local_clean`). Could also load data from a URL.
tea.data(daily_local_clean)

In [21]:
# Tea: Declare the dataset columns of interest and their measurement scale.
# 'data type' options: 'nominal', 'ordinal', 'interval', 'ratio'
variables = [
    {'name': column, 'data type': 'ratio'}
    for column in ('positive', 'positiveIncrease')
]

tea.define_variables(variables)

In [22]:
# Tea: [OPTIONAL] No assumptions are made about the data itself;
# the only setting supplied is the Type I (false positive) error rate.
assumptions = {}
assumptions['Type I (False Positive) Error Rate'] = 0.05

tea.assume(assumptions)

In [25]:
# Tea: Describe the study design.
# 'study type' could instead be 'experiment'; experiments list
# 'independent variables' where an observational study lists 'contributor variables'.
experimental_design = dict([
    ('study type', 'observational study'),
    ('contributor variables', ['positive', 'positiveIncrease']),
    ('outcome variables', []),
])
tea.define_study_design(experimental_design)

In [26]:
# Tea: State the hypothesis relating the two variables and have Tea test it.
variables_to_relate = ['positive', 'positiveIncrease']
predictions = ['positive~positiveIncrease']
tea.hypothesize(variables_to_relate, predictions)


Currently considering pearson_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_normal.

Currently considering kendalltau_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering spearman_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering pointbiserial_corr_a
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_categorical.
Testing assumption: has_two_categories.
Testing assumption: has_equal_variance.

Cur


Results:
--------------
Test: kendalltau_corr
***Test assumptions:
None

***Test results:
name = Kendall Tau Correlation
test_statistic = 0.8419716882004634
p_value = 6.597280710785405e-20

Test: spearman_corr
***Test assumptions:
None

***Test results:
name = Spearman R Correlation
test_statistic = 0.9540055204762345
p_value = 6.3309596402051e-30

In [None]:
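# Result: There seems to be a strong positive correlation between each state's number of positive tests as of today (`positive`) and its increase in positive tests since yesterday (`positiveIncrease`): Kendall's tau ≈ 0.84 and Spearman's rho ≈ 0.95, both with p < 0.001.
# This seems like something we might expect: states with more positive tests also tend to report larger day-over-day increases.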

# BUG!: Tea doesn't seem to be outputting the interpretation!