### Import Packages

In [7]:
import pandas as pd

### Mount to Files

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Demographic and Pollutant

In [9]:
# EPA Pollutant data
pol_data = pd.read_csv("/content/drive/My Drive/COVID19_Cities/Data/pollutant_data.csv")
pol_data['fips'] = pol_data['FIPS']
pol_data = pol_data[['fips', 'ozone', 'pm25', 'no2', 'so2']]
pol_data['fips'] = pol_data['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])

# Hopkins Population Center (HPC) 2018 https://github.com/QFL2020/COVID_DataHub
eth = pd.read_csv("/content/drive/My Drive/COVID19_Cities/Data/Prepandemic_v2.csv", encoding = "ISO-8859-1")
ethj = eth[['fips', 'popdensity_2018', 'tot_2018', 'male_2018', 'hispanic_2018', 'nhwhite_2018', 'nhblack_2018', 'nhindian_2018', 'nhasian_2018', 'povprop']]
ethj['ageab55'] = eth['ageg12_2018'] + eth['ageg13_2018'] + eth['ageg14_2018'] + eth['ageg15_2018'] + eth['ageg16_2018'] + eth['ageg17_2018'] + eth['ageg18_2018']
ethj['ageab65'] = eth['ageg14_2018'] + eth['ageg15_2018'] + eth['ageg16_2018'] + eth['ageg17_2018'] + eth['ageg18_2018']
ethj['fips'] = ethj['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])
ethj = ethj.rename(columns = {'popdensity_2018' : 'popdense', 'tot_2018' : 'population'})

edu = pd.read_excel('/content/drive/MyDrive/07252021_Entrie_USA_Covid_Study_Chaya/Raw_Data/Education.xlsx')
edu['fips'] = edu['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])
edu = edu[['fips', 'Percent of adults with less than a high school diploma, 2015-19', 'Percent of adults with a high school diploma only, 2015-19', "Percent of adults completing some college or associate's degree, 2015-19", "Percent of adults with a bachelor's degree or higher, 2015-19"]]
edu.rename(columns={"Percent of adults with less than a high school diploma, 2015-19" : 'AR',
                    "Percent of adults with a high school diploma only, 2015-19" : 'AS',
                    "Percent of adults completing some college or associate's degree, 2015-19" : 'AT',
                    "Percent of adults with a bachelor's degree or higher, 2015-19" : 'AU'}, inplace = True)
edu['pct_highschool_or_less'] = edu['AR'] + edu['AS']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the 

### Wave 1 Data

In [10]:
# New york Times covid data
covid = pd.read_csv('/content/drive/My Drive/COVID19_Cities/Data/us-counties.csv')
covid = covid[covid['date'] == '2020-06-30']
covid = covid.drop(['date', 'county', 'state'], axis = 1)
covid = covid.dropna()
covid['fips'] = covid['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])
covid['fips'].nunique()

# Merge all data
data = covid.merge(ethj, on='fips')
data = data.merge(edu, on='fips').drop_duplicates().reset_index(drop=True)
data['cases rate 10k'] = data['cases'] / data['population'] * 10000
data['deaths rate 10k'] = data['deaths'] / data['population'] * 10000
data['deaths cases 10k'] = data['deaths'] / data['cases'] * 10000
data['ageab55'] = data['ageab55'] * 100
data['pctpov'] = data['povprop'] * 100
data['pctmale'] = data['male_2018'] * 100
data['pctwhite'] = data['nhwhite_2018'] * 100
data['pctblack'] = data['nhblack_2018'] * 100
data['pctindian'] = data['nhindian_2018'] * 100
data['pctasian'] = data['nhasian_2018'] * 100
data['pcthispanic'] = data['hispanic_2018'] * 100
data = data.drop(['povprop','male_2018','nhwhite_2018','nhblack_2018','nhindian_2018','nhasian_2018','hispanic_2018'], axis=1)
data = data.sort_values('cases rate 10k', ascending=False)[data['population'] > 100000]
epifips = data.iloc[0:200]['fips'].tolist()
data['type'] = data['fips'].apply(lambda x: 0 if x in epifips else 1)
cdata = data

cdata = cdata.merge(pol_data, on='fips').drop_duplicates().reset_index(drop=True)

wave1_data = cdata
wave1_data.to_csv('wave1_data.csv')



### Wave 2 Data

In [11]:
covid1 = pd.read_csv('/content/drive/My Drive/COVID19_Cities/Data/us-counties.csv')
covid1 = covid1[covid1['date'] == '2020-12-31']
covid1 = covid1.drop(['date', 'county', 'state'], axis = 1)
covid1 = covid1.dropna()
covid1['fips'] = covid1['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])

covid2 = pd.read_csv('/content/drive/My Drive/COVID19_Cities/Data/us-counties.csv')
covid2 = covid2[covid2['date'] == '2020-06-30']
covid2 = covid2.drop(['date', 'county', 'state'], axis = 1)
covid2 = covid2.dropna()
covid2['fips'] = covid2['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])

covid = covid1.merge(covid2, on="fips")
covid['cases'] = covid['cases_x'] - covid['cases_y']
covid['deaths'] = covid['deaths_x'] - covid['deaths_y']
covid = covid.drop(['cases_x', 'cases_y', 'deaths_x', 'deaths_y'], axis = 1)
covid = covid.dropna()
covid['fips'].nunique()

data = covid.merge(ethj, on='fips')
data = data.merge(edu, on='fips').drop_duplicates().reset_index(drop=True)
data['cases rate 10k'] = data['cases'] / data['population'] * 10000
data['deaths rate 10k'] = data['deaths'] / data['population'] * 10000
data['deaths cases 10k'] = data['deaths'] / data['cases'] * 10000
data['ageab55'] = data['ageab55'] * 100
data['pctpov'] = data['povprop'] * 100
data['pctmale'] = data['male_2018'] * 100
data['pctwhite'] = data['nhwhite_2018'] * 100
data['pctblack'] = data['nhblack_2018'] * 100
data['pctindian'] = data['nhindian_2018'] * 100
data['pctasian'] = data['nhasian_2018'] * 100
data['pcthispanic'] = data['hispanic_2018'] * 100
data = data.drop(['povprop','male_2018','nhwhite_2018','nhblack_2018','nhindian_2018','nhasian_2018','hispanic_2018'], axis=1)
data = data.sort_values('cases rate 10k', ascending=False)[data['population'] > 100000]
epifips = data.iloc[0:200]['fips'].tolist()
data['type'] = data['fips'].apply(lambda x: 0 if x in epifips else 1)
cdata = data

cdata = cdata.merge(pol_data, on='fips').drop_duplicates().reset_index(drop=True)

wave2_data = cdata
wave2_data.to_csv('wave2_data.csv')



### All Year Data

In [12]:
covid = pd.read_csv('/content/drive/My Drive/COVID19_Cities/Data/us-counties.csv')
covid = covid[covid['date'] == '2020-12-31']
covid = covid.drop(['date', 'county', 'state'], axis = 1)
covid = covid.dropna()
covid['fips'] = covid['fips'].astype(int).astype(str).apply(lambda x: x.zfill(5)[0:5])

data = covid.merge(ethj, on='fips')
data = data.merge(edu, on='fips').drop_duplicates().reset_index(drop=True)
data['cases rate 10k'] = data['cases'] / data['population'] * 10000
data['deaths rate 10k'] = data['deaths'] / data['population'] * 10000
data['deaths cases 10k'] = data['deaths'] / data['cases'] * 10000
data['ageab55'] = data['ageab55'] * 100
data['pctpov'] = data['povprop'] * 100
data['pctmale'] = data['male_2018'] * 100
data['pctwhite'] = data['nhwhite_2018'] * 100
data['pctblack'] = data['nhblack_2018'] * 100
data['pctindian'] = data['nhindian_2018'] * 100
data['pctasian'] = data['nhasian_2018'] * 100
data['pcthispanic'] = data['hispanic_2018'] * 100
data = data.drop(['povprop','male_2018','nhwhite_2018','nhblack_2018','nhindian_2018','nhasian_2018','hispanic_2018'], axis=1)
data = data.sort_values('cases rate 10k', ascending=False)[data['population'] > 100000]
epifips = data.iloc[0:200]['fips'].tolist()
data['type'] = data['fips'].apply(lambda x: 0 if x in epifips else 1)
cdata = data

cdata = cdata.merge(pol_data, on='fips').drop_duplicates().reset_index(drop=True)

whole_data = cdata
whole_data.to_csv('whole_data.csv')

