In [1]:
import urllib.request
import shutil
import csv
import os
import pandas as pd
import tea

In [2]:
# Download and save data locally
states_url = 'https://covidtracking.com/api/states.csv'
states_local = 'states.csv'
states_local_clean = 'states_clean.csv'

# https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
with urllib.request.urlopen(states_url) as response, open(states_local, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

## How do we define "tested"?
Two options: 
1. tested_1 = positive + negative
2. tested_2 = positive + negative + pending

## First Definition of TESTED
AKA: 1. tested_1 = positive + negative

In [10]:
# Data cleaning/wrangling.
# Note: This is a workaround. A slightly newer version of Tea supports data "import" from Pandas.

# CLEAN DATA and REMOVE any MISSING data from the dataframe.
# This is necessary because Tea assumes that the dataset is cleaned (no missing values).
df = pd.read_csv(states_local)

df = df.dropna(subset=['positive', 'negative', 'death']) # These are the columns we care about/want to make sure there are no missing values

# CREATE a NEW COLUMN/VARIABLE for the total number of tested cases. 
print(f"Sample size: {len(df)}")
df['tested_1'] = df['positive'] + df['negative']
print(df)
df.to_csv(states_local_clean)

   state  positive  positiveScore  negativeScore  negativeRegularScore  \
1     AL     138.0            1.0            1.0                   0.0   
2     AR     165.0            1.0            1.0                   1.0   
3     AZ     152.0            1.0            1.0                   1.0   
4     CA    1536.0            1.0            1.0                   1.0   
5     CO     475.0            1.0            1.0                   1.0   
6     CT     223.0            1.0            1.0                   1.0   
7     DC      98.0            1.0            1.0                   1.0   
8     DE      56.0            1.0            1.0                   0.0   
9     FL     830.0            1.0            1.0                   1.0   
10    GA     600.0            1.0            1.0                   1.0   
13    ID      42.0            1.0            1.0                   1.0   
14    IL    1058.0            1.0            1.0                   1.0   
15    IN     201.0            1.0     

In [4]:
# Tea: Load data from URL. Could also load data from local copy.
tea.data(states_local_clean)

In [5]:
# Tea: Specify variables of interest in dataset
variables = [
    {
        'name' : 'positive',
        'data type' : 'ratio'   # Options: 'nominal', 'ordinal', 'interval', 'ratio'
    },
    {
        'name' : 'negative',
        'data type' : 'ratio'   # Options: 'nominal', 'ordinal', 'interval', 'ratio'
    },
    {
        'name' : 'pending',
        'data type' : 'ratio'
    },
    {
        'name' : 'tested',
        'data type' : 'ratio'
    },
    {
        'name' : 'death',
        'data type' : 'ratio'
    }
]

tea.define_variables(variables)

In [6]:
# Tea: [OPTIONAL] We don't have any assumptions, so we can skip this step. 
assumptions = {
    'Type I (False Positive) Error Rate': 0.05
}

tea.assume(assumptions)

In [7]:
# Tea: Specify experimental design
experimental_design = {
                        'study type': 'observational study',   # 'study type' could be 'experiment'
                        'contributor variables': ['positive', 'negative', 'tested'],   # 'experiment's have 'independent variables'
                        'outcome variables': 'death',   # 'experiment's have 'dependent variables'
                    }
tea.define_study_design(experimental_design)

In [8]:
# Tea: State and test hypothesis
tea.hypothesize(['tested', 'death'], ['tested ~ death'])


Currently considering pearson_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_normal.

Currently considering kendalltau_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering spearman_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering pointbiserial_corr_a
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_categorical.
Testing assumption: has_two_categories.
Testing assumption: has_equal_variance.

Cur


Results:
--------------
Test: kendalltau_corr
***Test assumptions:
None

***Test results:
name = Kendall Tau Correlation
test_statistic = 0.45056355688958294
p_value = 0.16658764010038207

Test: spearman_corr
***Test assumptions:
None

***Test results:
name = Spearman R Correlation
test_statistic = 0.6182840223353118
p_value = 0.13889518793753775

In [9]:
# Result: There seems to be moderate positive statistically significant relationship between how many people are tested and how many deaths there are. 
# In other words, in states that test more people, more people die. This doesn't suggest causality!

# BUG!: Tea doesn't seem to be outputting the interpretation!

## Second Definition of TESTED
AKA: 1. tested_2 = positive + negative + pending

In [3]:
# Data cleaning/wrangling.
# Note: This is a workaround. A slightly newer version of Tea supports data "import" from Pandas.

# CLEAN DATA and REMOVE any MISSING data from the dataframe.
# This is necessary because Tea assumes that the dataset is cleaned (no missing values).
df = pd.read_csv(states_local)

df = df.dropna(subset=['positive', 'negative', 'death']) # These are the columns we care about/want to make sure there are no missing values

# CREATE a NEW COLUMN/VARIABLE for the total number of tested cases. 
print(f"Sample size: {len(df)}")
df['tested_2'] = df['positive'] + df['negative'] + df['pending']
print(df)
df.to_csv(states_local_clean)

   state  positive  positiveScore  negativeScore  negativeRegularScore  \
2     AR     165.0            1.0            1.0                   1.0   
3     AZ     152.0            1.0            1.0                   1.0   
9     FL     830.0            1.0            1.0                   1.0   
31    NJ    1914.0            1.0            1.0                   1.0   
36    OK      67.0            1.0            1.0                   1.0   
41    SD      21.0            1.0            1.0                   1.0   
49    WV      12.0            1.0            1.0                   1.0   

    commercialScore grade  score  negative  pending  hospitalized  death  \
2               1.0     A    4.0     711.0    119.0          13.0    0.0   
3               0.0     B    3.0     282.0     87.0           NaN    2.0   
9               1.0     A    4.0    7990.0    963.0         185.0   13.0   
31              0.0     B    3.0     327.0     49.0           NaN   20.0   
36              1.0     A  

In [4]:
# Tea: Load data from URL. Could also load data from local copy.
tea.data(states_local_clean)

In [5]:
# Tea: Specify variables of interest in dataset
variables = [
    {
        'name' : 'positive',
        'data type' : 'ratio'   # Options: 'nominal', 'ordinal', 'interval', 'ratio'
    },
    {
        'name' : 'negative',
        'data type' : 'ratio'   # Options: 'nominal', 'ordinal', 'interval', 'ratio'
    },
    {
        'name' : 'pending',
        'data type' : 'ratio'
    },
    {
        'name' : 'tested',
        'data type' : 'ratio'
    },
    {
        'name' : 'death',
        'data type' : 'ratio'
    }
]

tea.define_variables(variables)

In [6]:
# Tea: [OPTIONAL] We don't have any assumptions, so we can skip this step. 
assumptions = {
    'Type I (False Positive) Error Rate': 0.05
}

tea.assume(assumptions)

In [7]:
# Tea: Specify experimental design
experimental_design = {
                        'study type': 'observational study',   # 'study type' could be 'experiment'
                        'contributor variables': ['positive', 'negative', 'tested'],   # 'experiment's have 'independent variables'
                        'outcome variables': 'death',   # 'experiment's have 'dependent variables'
                    }
tea.define_study_design(experimental_design)

In [8]:
# Tea: State and test hypothesis
tea.hypothesize(['tested', 'death'], ['tested ~ death'])


Currently considering pearson_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_normal.

Currently considering kendalltau_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering spearman_corr
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.
Testing assumption: is_continuous_or_ordinal.
Property holds.

Currently considering pointbiserial_corr_a
Testing assumption: is_bivariate.
Property holds.
Testing assumption: is_continuous.
Property holds.
Testing assumption: is_normal.
Property FAILS
Testing assumption: is_categorical.
Testing assumption: has_two_categories.
Testing assumption: has_equal_variance.

Cur


Results:
--------------
Test: kendalltau_corr
***Test assumptions:
None

***Test results:
name = Kendall Tau Correlation
test_statistic = 0.45056355688958294
p_value = 0.16658764010038207

Test: spearman_corr
***Test assumptions:
None

***Test results:
name = Spearman R Correlation
test_statistic = 0.6182840223353118
p_value = 0.13889518793753775