In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class `container` to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Render every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [None]:
# Load the survey export (a .gz, tab-separated text file) and report how long the read took.
start = time.time()
data_df = pd.read_table(
    'CovidBeliefsBehaviorsNormsSurvey/covid_survey_responses.txt.gz',
    sep='\t',
    low_memory=False,  # parse the file in one pass instead of in low-memory chunks
)
elapsed = time.time() - start
print('reading finished', elapsed)

In [None]:
def report_retention(label, frame):
    """Print `label`, the shape of `frame`, and its row count as a fraction of `data_df`."""
    print(label, frame.shape, frame.shape[0]/data_df.shape[0])


# Step 1: keep finished responses with complete demographics from the 'waves' survey type.
selected_survey = data_df[
    data_df['finished'].eq(True)
    & data_df['demo_complete'].eq(1)
    & data_df['survey_type'].eq('waves')
]
report_retention('wave type', selected_survey)

# Step 2: drop respondents that lack a country or a full-survey weight.
selected_survey = selected_survey.dropna(subset=['geoip_country', 'weight_full_survey'])
report_retention('missing weights', selected_survey)

# Step 3: drop rows whose mask-norm answer is coded -1 or -2.
# NOTE(review): -1 / -2 are presumably "missing"-type sentinel codes -- confirm against
# the survey codebook. NaN answers are NOT removed by this comparison.
# The mask is built from `selected_survey` itself (not from `data_df`) so the filter
# does not depend on pandas re-aligning a mask taken from a different frame.
mask_norm = selected_survey['distancing_norms_wear_a_face_mask_or_covering']
selected_survey = selected_survey.loc[mask_norm.ne(-1) & mask_norm.ne(-2)]
report_retention('missing values', selected_survey)


In [None]:
# Distinct respondent countries in the raw survey, computed once.
# Missing values are dropped first so NaN is neither counted nor listed as a country.
survey_countries = data_df['geoip_country'].dropna().unique()

print('num countries:', len(survey_countries))

# Label fixed: this line prints the country names, not their count.
print('countries:', survey_countries)

In [None]:
# Columns carried forward from the filtered survey: identifiers, timing, country,
# wave, the two weight columns, and the two mask-norm questions.
survey_columns = [
    'id',
    'finished',
    'progress',
    'start_date',
    'geoip_country',
    'wave',
    'weight_demo',
    'weight_full_survey',
    'distancing_norms_wear_a_face_mask_or_covering',
    'community_norms_mask',
]

# `.copy()` makes this an independent frame. Later cells convert `start_date` on
# `simplified_survey`; writing to a bare selection of `selected_survey` is what
# triggers pandas' SettingWithCopyWarning.
simplified_survey = selected_survey[survey_columns].copy()

print(simplified_survey.shape)
simplified_survey.head()

# mobility data

In [None]:
# Read the mobility report CSV with pandas' default parsing options.
mobility_df = pd.read_csv(
    filepath_or_buffer='data/google_mobility/Global_Mobility_Report.csv'
)

# Preview the first rows.
mobility_df.head()

In [None]:
# Print the distinct country labels from both sources so they can be compared by eye;
# the later merge joins `geoip_country` against `country_region`.
survey_country_names = pd.unique(selected_survey['geoip_country'])
mobility_country_names = pd.unique(mobility_df['country_region'])

print('survey countries', survey_country_names)
print('\nmobility countries', mobility_country_names)

In [None]:
# Keep only the country/date keys and the six percent-change-from-baseline metrics.
mobility_columns = [
    'country_region',
    'date',
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
]
mobility_df = mobility_df[mobility_columns]

mobility_df.head()

In [None]:
# The mobility data is at the state/county level, so collapse it to one row per
# country and date by averaging every metric column.
# NOTE(review): this is an unweighted mean over all rows sharing a country/date --
# confirm that is the intended country-level figure.
# `as_index=False` keeps the grouping keys as ordinary columns and `.mean()` keeps the
# original metric names, so there is no MultiIndex to flatten and no positional
# renaming that could silently mislabel columns.
mobility_agg = mobility_df.groupby(['country_region', 'date'], as_index=False).mean()

print(mobility_agg.shape)
mobility_agg.head()

In [None]:
# Parse both join keys to datetimes so the merge compares timestamps, not raw values.
# `.assign` returns new frames instead of writing columns in place, so this cell never
# mutates a frame that may be a selection of another one (the source of pandas'
# SettingWithCopyWarning) and is safe to re-run.
# NOTE(review): the join requires identical timestamps on both sides; if `start_date`
# carries a time-of-day component it will only match mobility rows with that exact
# timestamp -- confirm the granularity of both columns.
simplified_survey = simplified_survey.assign(
    start_date=pd.to_datetime(simplified_survey['start_date'])
)
mobility_agg = mobility_agg.assign(date=pd.to_datetime(mobility_agg['date']))

# Inner join: respondents with no mobility row for their country and start date are dropped.
# `validate='many_to_one'` makes pandas raise MergeError if a (country_region, date)
# key repeats in `mobility_agg`, which would otherwise silently duplicate respondents.
survey_mobility = simplified_survey.merge(
    mobility_agg,
    left_on=['geoip_country', 'start_date'],
    right_on=['country_region', 'date'],
    how='inner',
    validate='many_to_one',
)

# Shapes of the merged frame and of both inputs, to see how many rows survived the join.
print(survey_mobility.shape, simplified_survey.shape, mobility_agg.shape)

survey_mobility.head()

# Adding COVID-19 case, death, and related data

In [None]:
# COVID data from Oxford (Our World in Data):
#   https://ourworldindata.org/covid-cases
# Data dictionary describing each column:
#   https://github.com/owid/covid-19-data/blob/330c09376bb5035c6c35495c8ab3f603451c53d9/public/data/owid-covid-codebook.csv
cases_df = pd.read_csv(filepath_or_buffer='data/cases/owid-covid-data.csv')

# Preview the first rows.
cases_df.head()

In [None]:
# Show the available column names of the cases table (a subset is selected in the next cell).
cases_df.columns

In [None]:
# Keep the location/date keys plus the per-capita case, death, hospital, ICU and
# testing series and two country-level attributes.
cases_columns = [
    'location',
    'date',
    'new_cases_smoothed_per_million',
    'new_deaths_smoothed_per_million',
    'hosp_patients_per_million',
    'icu_patients_per_million',
    'new_tests_smoothed_per_thousand',
    'population_density',
    'human_development_index',
]
cases_df = cases_df[cases_columns]

cases_df.head()

In [None]:
# Parse the cases date to datetime so it is comparable with the `date` column of
# `survey_mobility`. `.assign` returns a new frame rather than writing the column in
# place, which keeps the cell free of in-place mutation and safe to re-run.
cases_df = cases_df.assign(date=pd.to_datetime(cases_df['date']))

# Inner join on country name and date: rows with no matching cases record are dropped.
# `validate='many_to_one'` makes pandas raise MergeError if a (location, date) key
# repeats in `cases_df`, which would otherwise silently duplicate respondents.
survey_mobility_cases = survey_mobility.merge(
    cases_df,
    left_on=['country_region', 'date'],
    right_on=['location', 'date'],
    how='inner',
    validate='many_to_one',
)

# Shapes of the merged frame and of both inputs, to see how many rows survived the join.
print(survey_mobility_cases.shape, survey_mobility.shape, cases_df.shape)

survey_mobility_cases.head()

In [None]:
# Write the merged survey + mobility + cases table to CSV without the index column.
# The target directory is created first because `to_csv` does not create missing
# parent directories and fails when the folder is absent.
output_path = Path('data/processed/survey_mobility_cases_new.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
survey_mobility_cases.to_csv(output_path, index=False)