In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
from scipy.stats import linregress

In [2]:
# Import the air data for the 85 countries
world_air = pd.read_csv("output_csv/clean_world_air.csv", parse_dates=True)
world_air.head()

Unnamed: 0,Date,Country_code,City,Specie,count,min,max,median,variance,Country_name
0,2019-01-16,AE,Abu Dhabi,pm10,24,86.0,99.0,97.0,179.4,United Arab Emirates
1,2019-01-22,AE,Abu Dhabi,pm10,24,51.0,57.0,55.0,23.75,United Arab Emirates
2,2019-01-26,AE,Abu Dhabi,pm10,24,136.0,173.0,160.0,941.96,United Arab Emirates
3,2019-01-07,AE,Abu Dhabi,pm10,24,60.0,91.0,72.0,1006.88,United Arab Emirates
4,2019-01-10,AE,Abu Dhabi,pm10,24,82.0,93.0,87.0,57.97,United Arab Emirates


In [3]:
# Check the data types and any missing values
world_air.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977319 entries, 0 to 977318
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Date          977319 non-null  object 
 1   Country_code  977319 non-null  object 
 2   City          977319 non-null  object 
 3   Specie        977319 non-null  object 
 4   count         977319 non-null  int64  
 5   min           977319 non-null  float64
 6   max           977319 non-null  float64
 7   median        977319 non-null  float64
 8   variance      977319 non-null  float64
 9   Country_name  977319 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 74.6+ MB


In [4]:
# Change the data type of the Date column into datetime
world_air['Date'] = world_air['Date'].astype('datetime64[ns]')

In [5]:
world_air.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977319 entries, 0 to 977318
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Date          977319 non-null  datetime64[ns]
 1   Country_code  977319 non-null  object        
 2   City          977319 non-null  object        
 3   Specie        977319 non-null  object        
 4   count         977319 non-null  int64         
 5   min           977319 non-null  float64       
 6   max           977319 non-null  float64       
 7   median        977319 non-null  float64       
 8   variance      977319 non-null  float64       
 9   Country_name  977319 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(4)
memory usage: 74.6+ MB


In [6]:
# Import the covid data total active cases for the 85 countries 
active_world_covid = pd.read_csv("output_csv/active_world_covid.csv", parse_dates=True)
active_world_covid.head()

Unnamed: 0,Date,Total active cases
0,2020-01-22,510
1,2020-01-23,605
2,2020-01-24,876
3,2020-01-25,1346
4,2020-01-26,2002


In [8]:
# Check the data types and any missing values
active_world_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Date                520 non-null    object
 1   Total active cases  520 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.2+ KB


In [9]:
# Change the data type of the Date column into datetime
active_world_covid['Date'] = active_world_covid['Date'].astype('datetime64[ns]')

In [10]:
# Double check the data types
active_world_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                520 non-null    datetime64[ns]
 1   Total active cases  520 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 8.2 KB


In [17]:
# Define a function to do the following steps:
    ## Transpose an air dataframe from long to wide: for each specie, turn the country_code into columns
    ## For the world air data, get the median of specie values of the 88 countries to represent the world
    ## Merge the covid data with the air data on Date.
    
def world_air_covid_each_specie(air_df, specie, covid_df):
    air_median_specie_transposed = air_df[air_df["Specie"] == specie].pivot_table(
        index="Date", columns="Country_code", values="median", aggfunc=np.median)

    air_median_specie_transposed[f"Median {specie}"] = air_median_specie_transposed.median(axis=1)

    final_air_median_specie_transposed = air_median_specie_transposed.iloc[:, -1].to_frame().reset_index()

    air_covid_median_specie = pd.merge(
        covid_df, final_air_median_specie_transposed, how="inner", on="Date")
    return air_covid_median_specie

In [20]:
# Create a merged data frame for median PM2.5 data and covid active case data for the world based on shared date

world_air_covid_median_pm25 = world_air_covid_each_specie(world_air, "pm25", active_world_covid)
world_air_covid_median_pm25.head()

Unnamed: 0,Date,Total active cases,Median pm25
