# Collecting PVI data

In [39]:
import pandas as pd; import requests; from bs4 import BeautifulSoup
from tqdm import tqdm

# read in fips codes
# Build the Name -> FIPS lookup used when exporting the per-measure tables.
# The column header sits on row index 12 of this file (header=12), and only
# the "name" and "casrn" columns are needed.
# NOTE: rename() must NOT be called with inplace=True here -- the in-place form
# returns None, which would bind `fips` to None and break the later merge.
fips = pd.read_csv(
    "https://github.com/COVID19PVI/data/raw/master/Model11.2.1/data/Model_11.2.1_20200228_data.csv",
    header=12, usecols=["name","casrn"]
).rename(columns={'name':'Name','casrn':'FIPS'})

# get the PVI data links
# Scrape the GitHub directory listing and turn every file anchor into a raw
# download path. A timeout and an explicit status check make a failed or
# rate-limited request stop here instead of silently yielding no links.
r = requests.get(
    "https://github.com/COVID19PVI/data/tree/master/Model11.2.1",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
# The first and last matching anchors are dropped; presumably they are not
# per-date result files -- TODO confirm against the repository listing.
links = [
    x.attrs["href"].replace("blob","raw")
    for x in soup.find_all('a', {'class':'js-navigation-open Link--primary'})[1:-1]
]
if not links:
    # An empty list would otherwise only surface later as an opaque
    # "No objects to concatenate" error from pd.concat.
    raise RuntimeError(
        "No PVI data links found; the GitHub page layout may have changed."
    )

# combine data
# Download every linked CSV, tag its rows with the date taken from the file
# name (third "_"-separated field, reformatted from YYYYMMDD to YYYY-MM-DD),
# and stack all of them into a single frame.
frames = []
for href in tqdm(links):
    stamp = href.split("_")[2]
    frame = pd.read_csv("https://github.com" + href)
    frame["date"] = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
    frames.append(frame)
all_data = pd.concat(frames)
all_data

100%|█████████████████████████████████████████| 634/634 [06:42<00:00,  1.57it/s]


Unnamed: 0,ToxPi Score,HClust Group,KMeans Group,Name,Source,Infection Rate: Transmissible Cases!25!0xcc3333ff,Infection Rate: Disease Spread!5!0xe64d4dff,Pop Concentration: Pop Mobility!10!0x57b757ff,Pop Concentration: Residential Density!10!0x5ced5cff,Intervention: Social Distancing!10!0x4258c9ff,Intervention: Testing!10!0x6079f7ff,Health & Environment: Pop Demographics!10!0x6b0b9eff,Health & Environment: Air Pollution!10!0x8e26c4ff,Health & Environment: Age Distribution!10!0x9a42c8ff,Health & Environment: Co-morbidities!10!0xb460e0ff,Health & Environment: Health Disparities!10!0xc885ecff,Health & Environment: Hospital Beds!5!0xdeb9f1ff,date
0,0.659739,1,7,"California, Solano","-121.9357,38.2683",1.000000,1.000000,0.717074,0.6905,1.000000,0.504366,0.622993,0.479042,0.504195,0.236237,0.235189,0.514283,20200228
1,0.624339,2,7,"California, Madera","-119.7666,37.2151",0.638652,1.000000,0.601609,0.8239,0.825000,0.504366,0.605671,0.658683,0.469686,0.321223,0.590624,0.613680,20200228
2,0.598916,1,7,"California, Santa Cruz","-122.0067,37.0558",0.817887,1.000000,0.704589,0.9035,1.000000,0.504366,0.409785,0.269461,0.511724,0.100021,0.272557,0.531471,20200228
3,0.590967,2,7,"California, Imperial","-115.3669,33.0393",0.591782,0.666667,0.585525,0.9869,0.750000,0.504366,0.525758,0.628743,0.443774,0.270404,0.620412,0.516830,20200228
4,0.578017,1,7,"California, Santa Clara","-121.697,37.231",0.830152,0.880000,0.793513,0.7310,1.000000,0.504366,0.452019,0.353293,0.452613,0.060895,0.131427,0.461418,20200228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,0.250812,5,10,"Texas, Glasscock","-101.5208,31.8694",0.000000,0.000000,0.415793,0.0774,0.333333,0.652564,0.314507,0.251497,0.431653,0.266842,0.391556,0.000000,20211122
3138,0.248758,5,10,"Nebraska, Thomas","-100.5557,41.9136",0.000000,0.000000,0.347559,0.0029,0.566667,0.680991,0.392738,0.149701,0.659510,0.233666,0.075746,0.000000,20211122
3139,0.248515,5,10,"Alaska, Skagway","-135.3338,59.5615",0.000000,0.000000,0.422700,0.8115,0.333333,0.091970,0.474167,0.000000,0.406160,0.289325,0.277288,0.000000,20211122
3140,0.240688,5,10,"Utah, Beaver","-113.2342,38.3566",0.000000,0.000000,0.414580,0.1258,0.433333,0.505531,0.271965,0.101796,0.491298,0.191358,0.313182,0.319525,20211122


In [80]:
# Write one CSV per measure: one row per Name, one column per date, with the
# FIPS code looked up from `fips` and placed in the first column.
columns = [x for x in all_data.columns if x not in ['Name', 'Source','date']]
for measure in columns:
    # Drop everything after the first "!" in the header, then strip a leading
    # "Group: " prefix when one is present, to get a short file label.
    heading = measure.split('!')[0]
    label = heading.split(": ")[1] if ":" in heading else heading
    print(f"Saving {label}", end = "... ")
    wide = (
        all_data
        .pivot(index="Name", columns="date", values=measure)
        .reset_index()
        .merge(fips, on="Name", how="left")
    )
    # The merge appends FIPS at the end; move it to the front.
    fips_codes = wide.pop("FIPS")
    wide.insert(0, "FIPS", fips_codes)
    wide.to_csv(f"~/covid-19/PVI data/{label}.csv", index=False)
    print('\u2705')

Saving ToxPi Score... ✅
Saving HClust Group... ✅
Saving KMeans Group... ✅
Saving Transmissible Cases... ✅
Saving Disease Spread... ✅
Saving Pop Mobility... ✅
Saving Residential Density... ✅
Saving Social Distancing... ✅
Saving Testing... ✅
Saving Pop Demographics... ✅
Saving Air Pollution... ✅
Saving Age Distribution... ✅
Saving Co-morbidities... ✅
Saving Health Disparities... ✅
Saving Hospital Beds... ✅
