## Imports / Global Variables

In [1]:
# Required Imports
import os
import pandas as pd
import numpy as np
import zipfile
from urllib.request import urlopen
from sqlalchemy import create_engine
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global Path Variables
PARENT_DIRECTORY = os.pardir

# Global Paths to Data Folders
RAW_DATA_FOLDER = os.path.join(PARENT_DIRECTORY, 'raw')
PROCESSED_DATA_FOLDER = os.path.join(PARENT_DIRECTORY, 'processed')

## Download / Load the Data

In [3]:
# Download the data function
def download_google_mobility_data(output_folder=''):
    """Downloads the google mobility data and saves it as a CSV.
    data source: https://www.google.com/covid19/mobility/
       
    Args:
        output_folder (str, optional): the folder where the data will be output. Defaults to ''.
    """
    
    # Get URL for the data
    url = 'https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip'
    
    # Download and extract the zipfolder
    with urlopen(url) as zipResponse:
        with zipfile.ZipFile(BytesIO(zipResponse.read())) as zipFolder:
            # Identify the United States Data
            US_files = [file for file in zipFolder.namelist() if(file[5:7] == 'US')]
            
            # Extract the US data
            for file in US_files:
                zipFolder.extract(file, path=output_folder)

In [4]:
# Function to load the data
def load_google_mobility_data(data_folder=''):
    """Loads the google mobility data into a dataframe and returns the dataframe. 
    Any data that is missing a state or county is removed. 

    Args:
        data_folder (str, optional): path to the folder containing the data. Defaults to ''.

    Returns:
        google_mobility_data (pd.DataFrame): The dataframe containing the google mobility data
    """
    
    # Combine all the data
    google_data = []
    for file in os.listdir(data_folder):
        if file[-3:] == "csv":
            file_path = os.path.join(data_folder, file)
            google_data.append(pd.read_csv(file_path))
            
    google_mobility_data = pd.concat([data for data in google_data])
    
    # Rename the columns
    google_mobility_data = google_mobility_data.rename(columns={
                                                'country_region_code':'country_code',
                                                'country_region':'country',
                                                'sub_region_1':'state',
                                                'sub_region_2':'county',
                                                'retail_and_recreation_percent_change_from_baseline':'retail_and_recreation_percent_change',
                                                'grocery_and_pharmacy_percent_change_from_baseline':'grocery_and_pharmacy_percent_change',
                                                'parks_percent_change_from_baseline':'parks_percent_change',
                                                'transit_stations_percent_change_from_baseline':'transit_stations_percent_change',
                                                'workplaces_percent_change_from_baseline':'workplaces_percent_change',
                                                'residential_percent_change_from_baseline':'residential_percent_change',
                                                })
    
    # Keep only the important columns
    google_mobility_data = google_mobility_data[['date', 'country_code', 'country', 'state', 'county', \
                                                'retail_and_recreation_percent_change', 'grocery_and_pharmacy_percent_change', \
                                                'parks_percent_change', 'transit_stations_percent_change', 'workplaces_percent_change', \
                                                'residential_percent_change', 'place_id', 'census_fips_code']]
    
    # Remove data where state or county info is null
    google_mobility_data = google_mobility_data[google_mobility_data['state'].isna() == False]
    google_mobility_data = google_mobility_data[google_mobility_data['county'].isna() == False]

    # Sort by country, state, county, and date
    google_mobility_data = google_mobility_data.sort_values(by=['country', 'state', 'county', 'date'], ascending=True, inplace=False).reset_index(drop=True)
        
    # Return the data
    return google_mobility_data

In [5]:
# Download the data
output_folder = os.path.join(RAW_DATA_FOLDER, 'google_mobility_data')
download_google_mobility_data(output_folder=output_folder)

In [6]:
# Load the data
google_mobility_data = load_google_mobility_data(data_folder=output_folder)
display(google_mobility_data)

Unnamed: 0,date,country_code,country,state,county,retail_and_recreation_percent_change,grocery_and_pharmacy_percent_change,parks_percent_change,transit_stations_percent_change,workplaces_percent_change,residential_percent_change,place_id,census_fips_code
0,2020-02-15,US,United States,Alabama,Autauga County,5.0,7.0,,,-4.0,,ChIJg9z7ewWPjogRA_8QrB0va7o,1001.0
1,2020-02-16,US,United States,Alabama,Autauga County,0.0,1.0,-23.0,,-4.0,,ChIJg9z7ewWPjogRA_8QrB0va7o,1001.0
2,2020-02-17,US,United States,Alabama,Autauga County,8.0,0.0,,,-27.0,5.0,ChIJg9z7ewWPjogRA_8QrB0va7o,1001.0
3,2020-02-18,US,United States,Alabama,Autauga County,-2.0,0.0,,,2.0,0.0,ChIJg9z7ewWPjogRA_8QrB0va7o,1001.0
4,2020-02-19,US,United States,Alabama,Autauga County,-2.0,0.0,,,2.0,0.0,ChIJg9z7ewWPjogRA_8QrB0va7o,1001.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2461341,2022-10-10,US,United States,Wyoming,Weston County,,,,,-26.0,,ChIJd4Rqhed3YocR7ubT5-HgoJg,56045.0
2461342,2022-10-11,US,United States,Wyoming,Weston County,,,,,-20.0,,ChIJd4Rqhed3YocR7ubT5-HgoJg,56045.0
2461343,2022-10-12,US,United States,Wyoming,Weston County,,,,,-17.0,,ChIJd4Rqhed3YocR7ubT5-HgoJg,56045.0
2461344,2022-10-13,US,United States,Wyoming,Weston County,,,,,-15.0,,ChIJd4Rqhed3YocR7ubT5-HgoJg,56045.0


## Clean the Data

In [7]:
# Get the NY data
google_mobility_data_NY = google_mobility_data[google_mobility_data['state'] == 'New York']

# Drop some columns
google_mobility_data_NY.drop(columns=['country_code', 'country', 'state', 'place_id'], inplace=True)

# Convert the date to a datetime type
google_mobility_data_NY['date'] = pd.to_datetime(google_mobility_data_NY['date'])

# Forward-fill missing values for each county
for county in np.unique(google_mobility_data_NY['county']):
    google_mobility_data_NY[google_mobility_data_NY['county'] == county] = google_mobility_data_NY[google_mobility_data_NY['county'] == county].ffill(inplace=False)

# Replace the remaining NaN with zero
google_mobility_data_NY = google_mobility_data_NY.replace(np.nan, 0)

# Average using sunday to next saturday
google_mobility_data_NY = google_mobility_data_NY.groupby([pd.Grouper(key='date', freq='W-SAT'), 'county', 'census_fips_code']).mean().reset_index()

# Make fips codes integers
google_mobility_data_NY['census_fips_code'] = google_mobility_data_NY['census_fips_code'].astype(int)

# Ensure every county has data for all the dates
county_data_list = []
for county in google_mobility_data_NY['county'].unique():
    if(google_mobility_data_NY[google_mobility_data_NY['county'] == county]['date'].shape[0] < 140):
        county_data = google_mobility_data_NY[google_mobility_data_NY['county'] == county].merge(google_mobility_data_NY[google_mobility_data_NY['county'] == 'Albany County']['date'], how='right', on='date').ffill()
    else:
        county_data = google_mobility_data_NY[google_mobility_data_NY['county'] == county]
    county_data_list.append(county_data)
# Concatenate the county specific data together again
google_mobility_data_NY = pd.concat([x for x in county_data_list])

# Save as CSV
google_mobility_data_NY.to_csv(os.path.join(PROCESSED_DATA_FOLDER, 'google_mobility_data.csv'))

# Display the data
display(google_mobility_data_NY.head())

Unnamed: 0,date,county,census_fips_code,retail_and_recreation_percent_change,grocery_and_pharmacy_percent_change,parks_percent_change,transit_stations_percent_change,workplaces_percent_change,residential_percent_change
0,2020-02-15,Albany County,36001.0,8.0,-4.0,45.0,4.0,0.0,0.0
61,2020-02-22,Albany County,36001.0,3.857143,-3.571429,-1.285714,-1.0,-9.857143,2.428571
122,2020-02-29,Albany County,36001.0,6.428571,1.857143,28.428571,5.285714,3.285714,-0.571429
183,2020-03-07,Albany County,36001.0,8.428571,12.285714,29.0,5.571429,4.0,-0.571429
244,2020-03-14,Albany County,36001.0,0.285714,19.571429,54.0,-1.571429,-0.857143,1.428571


## Upload to the Database

In [8]:
# Set path to local MySQL password file
sql_pw_filepath = os.path.join(PARENT_DIRECTORY, os.path.join('..', 'sql_password.txt'))

# Raise exception if key file not found
if not os.path.exists(sql_pw_filepath):
    raise FileNotFoundError('Local MySQL password file not found! Please check directory.')

# Read in MySQL username and password as environment variable
with open(sql_pw_filepath, 'r') as f:
    os.environ['sql_username'] = f.readline().strip()
    os.environ['sql_password'] = f.readline().strip()

In [9]:
# Connect to the Database
db_connection_str = f'mysql+pymysql://{os.environ.get("sql_username")}:{os.environ.get("sql_password")}@aipi510.mysql.database.azure.com:3306/project'
db_connection_args = {'ssl': {'enable_tls': True}}
sql_engine = create_engine(db_connection_str, connect_args=db_connection_args)
db_connection= sql_engine.connect()

In [10]:
# Create a new google mobility data and try loading the data
mobility_table = 'google_mobility'
try:
    google_mobility_data_NY.to_sql(mobility_table, db_connection, if_exists='replace')
except ValueError as vx:
    print(vx)
except Exception as ex:
    print(ex)
else:
    print(f'Table {mobility_table} created successfully!');   

Table google_mobility created successfully!


In [11]:
# Test if it worked
mobility_table = 'google_mobility'

test_sql_query = f'SELECT * FROM {mobility_table} LIMIT 10'
df_test = pd.read_sql(test_sql_query, db_connection)
display(df_test)

Unnamed: 0,index,date,county,census_fips_code,retail_and_recreation_percent_change,grocery_and_pharmacy_percent_change,parks_percent_change,transit_stations_percent_change,workplaces_percent_change,residential_percent_change
0,0,2020-02-15,Albany County,36001.0,8.0,-4.0,45.0,4.0,0.0,0.0
1,61,2020-02-22,Albany County,36001.0,3.857143,-3.571429,-1.285714,-1.0,-9.857143,2.428571
2,122,2020-02-29,Albany County,36001.0,6.428571,1.857143,28.428571,5.285714,3.285714,-0.571429
3,183,2020-03-07,Albany County,36001.0,8.428571,12.285714,29.0,5.571429,4.0,-0.571429
4,244,2020-03-14,Albany County,36001.0,0.285714,19.571429,54.0,-1.571429,-0.857143,1.428571
5,305,2020-03-21,Albany County,36001.0,-33.0,9.857143,32.428571,-30.857143,-31.142857,13.285714
6,366,2020-03-28,Albany County,36001.0,-55.285714,-18.857143,11.142857,-54.714286,-49.142857,20.285714
7,427,2020-04-04,Albany County,36001.0,-53.142857,-17.0,8.714286,-58.142857,-51.428571,20.857143
8,488,2020-04-11,Albany County,36001.0,-60.0,11.714286,18.857143,-56.714286,-52.571429,21.142857
9,549,2020-04-18,Albany County,36001.0,-66.285714,-22.428571,-0.857143,-59.571429,-52.142857,21.285714


In [12]:
# Close the connection to the DB
db_connection.close()