# Import packages

In [1]:
import pandas as pd
import os
from scipy.stats import f_oneway, kruskal

# Initial observations

In [2]:
# Load the historical severity data set, then display the pairwise
# correlation matrix of its columns as the cell output.
historical_csv_path = "severity-data/severity_historical.csv"
severity_historical = pd.read_csv(historical_csv_path)
severity_historical.corr()

Unnamed: 0,FIPS,year,percent_65plus,percent_smokers,pct_diabetes,pct_obese,copd_death_rate,hypertension_death_rate,total_heart_disease_death_rate
FIPS,1.0,-1.042344e-18,0.052471,-0.040713,-0.058241,-0.023353,-0.089099,0.105624,-0.070901
year,-1.042344e-18,1.0,0.26665,-0.295032,0.136955,0.195074,0.042018,0.139851,-0.065718
percent_65plus,0.05247134,0.2666503,1.0,-0.139812,-0.101278,0.00243,-0.053377,-0.091122,-0.068018
percent_smokers,-0.04071272,-0.295032,-0.139812,1.0,0.364971,0.346446,0.499792,0.169998,0.498589
pct_diabetes,-0.05824057,0.1369554,-0.101278,0.364971,1.0,0.634311,0.382408,0.339235,0.576924
pct_obese,-0.02335252,0.1950739,0.00243,0.346446,0.634311,1.0,0.353073,0.323396,0.51535
copd_death_rate,-0.08909892,0.0420175,-0.053377,0.499792,0.382408,0.353073,1.0,0.268754,0.551128
hypertension_death_rate,0.1056245,0.1398512,-0.091122,0.169998,0.339235,0.323396,0.268754,1.0,0.40489
total_heart_disease_death_rate,-0.07090062,-0.06571789,-0.068018,0.498589,0.576924,0.51535,0.551128,0.40489,1.0


# One-Way ANOVA

The one-way ANOVA tests the null hypothesis that two or more groups have the same population mean. The test is applied to samples from two or more groups, possibly with differing sizes. The code below calculates the ANOVA for each variable across years. A p-value below 0.05 tells us that, if all groups truly shared the same population mean, there would be less than a 5% probability of observing differences in means at least this large due to random chance; in other words, the difference is statistically significant at the 5% level.

In [3]:
# Run a one-way ANOVA for every measure (all columns except the FIPS and year
# identifiers), testing whether its mean differs across years.
for colname in severity_historical.drop(labels=["FIPS", "year"], axis=1).columns.tolist():
    # Keep only the rows where this measure is present.
    df = severity_historical.loc[severity_historical[colname].notna(), ["year", colname]]

    # One sample (list of values) per distinct year that has data for this measure.
    for_anova = [df.loc[df["year"] == year, colname].tolist() for year in df.year.unique()]

    print("Oneway ANOVA for {col}:".format(col=colname))
    if len(for_anova) < 2:
        # f_oneway needs at least two groups; report instead of crashing.
        print("Skipped: fewer than two years with data.")
    else:
        # Unpack every group so the test uses however many years are present,
        # rather than assuming exactly seven.
        print(f_oneway(*for_anova))
    print("\n")
    
#del df, for_anova

Oneway ANOVA for percent_65plus:
F_onewayResult(statistic=137.71805490891103, pvalue=5.758164785293364e-172)


Oneway ANOVA for percent_smokers:
F_onewayResult(statistic=146.35718442700266, pvalue=4.437363844713898e-182)


Oneway ANOVA for pct_diabetes:
F_onewayResult(statistic=58.60639802606615, pvalue=2.6757457998796247e-72)


Oneway ANOVA for pct_obese:
F_onewayResult(statistic=99.68925168995672, pvalue=3.038311883517604e-124)


Oneway ANOVA for copd_death_rate:
F_onewayResult(statistic=8.812197406087728, pvalue=1.2786528199149814e-09)


Oneway ANOVA for hypertension_death_rate:
F_onewayResult(statistic=54.46609146470737, pvalue=4.92366583841111e-67)


Oneway ANOVA for total_heart_disease_death_rate:
F_onewayResult(statistic=15.19675969306489, pvalue=1.8824646284400096e-17)


