In [184]:
import pandas as pd
import numpy as np
import utils.util as ut

In [185]:
pd.options.display.float_format = '{:.2f}'.format

In [186]:
df = pd.read_csv("../data/data.csv")

## Data inspection

In [187]:
df

Unnamed: 0,Entity,Continent,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Date,Daily tests,Cases,Deaths
0,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-25,8.00,,
1,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-26,5.00,,
2,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-27,4.00,,
3,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-28,1.00,,
4,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-29,8.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-24,1804.00,35960.00,1456.00
38468,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-25,2965.00,35994.00,1458.00
38469,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-26,,36044.00,1463.00
38470,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-27,,36058.00,1463.00


In [188]:
df.describe()

Unnamed: 0,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
count,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,30577.0,38218.0,34862.0
mean,23.74,20.21,17.72,3.17,2.09,19002.33,48969829.03,32.75,10.66,39440.59,287902.66,8090.5
std,26.06,61.07,8.13,2.56,1.52,22271.11,142725118.68,8.47,6.77,150184.66,1405242.87,29548.75
min,-40.9,-106.35,-2.0,0.2,0.02,411.6,341284.0,16.0,1.0,-239172.0,1.0,1.0
25%,8.62,-3.44,11.0,1.4,0.82,3659.0,4793900.0,27.0,5.0,1505.0,2074.0,77.0
50%,27.51,21.82,20.0,2.5,1.89,8821.8,11484636.0,32.0,8.0,5520.0,21431.0,527.0
75%,45.94,47.48,25.0,4.49,3.21,25946.2,42862958.0,41.0,16.0,20382.0,137377.0,3480.5
max,64.96,179.41,29.0,13.05,7.52,114704.6,1339180127.0,48.0,28.0,2945871.0,28605669.0,513091.0


In [189]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38472 entries, 0 to 38471
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           38472 non-null  object 
 1   Continent                        38472 non-null  object 
 2   Latitude                         38472 non-null  float64
 3   Longitude                        38472 non-null  float64
 4   Average temperature per year     38472 non-null  int64  
 5   Hospital beds per 1000 people    38472 non-null  float64
 6   Medical doctors per 1000 people  38472 non-null  float64
 7   GDP/Capita                       38472 non-null  float64
 8   Population                       38472 non-null  int64  
 9   Median age                       38472 non-null  int64  
 10  Population aged 65 and over (%)  38472 non-null  int64  
 11  Date                             38472 non-null  object 
 12  Daily tests       

### Observations

From an initial inspection we can observe that:

* The `Daily tests` column has a lot of missing values.
* The `Cases` and `Deaths` columns have some missing values.
* The `Daily tests` column has some negative values, which would be impossible.

## Handle Negative values

In the context of this dataset, negative values in the `Daily tests` column are impossible. Therefore, we'll have to
deal with them. As a first step, we'll replace them with NaN values, and then we'll handle them later along with the
other missing values.

In [190]:
df.loc[df['Daily tests'] < 0, 'Daily tests'] = np.nan

## Handle missing values

### Replace with 0

In the `Deaths` and `Cases` columns there the missing values, but they take up only the first few rows of each 
country. Therefore, we can safely assume that the missing values actually represent 0, and we can replace them.

In the `Daily tests` column there are a lot more missing values, and they're scattered throughout the dataset for 
each country. Assuming they're all 0 is therefore not an option.

In [191]:
df['Cases'].fillna(0, inplace=True)
df['Deaths'].fillna(0, inplace=True)

In [192]:
df['Cases'] = df.groupby('Entity')['Cases'].transform('cummax')
df['Deaths'] = df.groupby('Entity')['Deaths'].transform('cummax')

### Add columns

Since we've dealt with the missing values in the `Cases` and `Deaths` columns, we can now calculate and add the
`Daily cases` and `Daily deaths` columns, which will be useful later.

In [193]:
df['Daily cases'] = df.groupby('Entity')['Cases'].diff().fillna(0)
df['Daily deaths'] = df.groupby('Entity')['Deaths'].diff().fillna(0)

### Interpolate

For the `Daily tests` column, we'll first try to fill in the smaller gaps with linear interpolation. To do this, we'll
group the data by country, and then interpolate the missing values for each country separately, applying a limit of 3
consecutive missing values.

In [194]:
df['Daily tests'] = (df.groupby('Entity')['Daily tests']
                     .transform(lambda x: x.interpolate(method='linear', limit_area='inside', limit=3)))

df

Unnamed: 0,Entity,Continent,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Date,Daily tests,Cases,Deaths,Daily cases,Daily deaths
0,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-25,8.00,0.00,0.00,0.00,0.00
1,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-26,5.00,0.00,0.00,0.00,0.00
2,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-27,4.00,0.00,0.00,0.00,0.00
3,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-28,1.00,0.00,0.00,0.00,0.00
4,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-29,8.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-24,1804.00,35960.00,1456.00,50.00,8.00
38468,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-25,2965.00,35994.00,1458.00,34.00,2.00
38469,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-26,,36044.00,1463.00,50.00,5.00
38470,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-27,,36058.00,1463.00,14.00,0.00


### Find edge-cases

Some countries might be missing a lot of values in the `Daily tests` column, so it wouldn't be accurate enough to try
and guess them. It's better to drop any edge-cases we find before we proceed.

In [195]:
nan_df = (df[['Entity', 'Daily tests']]
          .groupby('Entity')
          .apply(lambda x: x.isna().mean() * 100)
          .sort_values(by='Daily tests', ascending=False))
nan_df

Unnamed: 0_level_0,Entity,Daily tests
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,0.00,97.57
Oman,0.00,83.56
Vietnam,0.00,76.67
Armenia,0.00,74.25
Belarus,0.00,74.11
...,...,...
Israel,0.00,0.80
Slovenia,0.00,0.80
Denmark,0.00,0.80
Switzerland,0.00,0.75


### Drop the edge-cases

We will treat all countries with more than 40% missing values in the `Daily tests` column as edge-cases and drop them.

In [196]:
countries_to_drop = nan_df[(nan_df['Daily tests'] > 40)].index.tolist()

df = df[~df['Entity'].isin(countries_to_drop)].copy()

df.reset_index(drop=True, inplace=True)

### Replace with average ratio

For the remaining missing values in the `Daily tests` column, we'll try to fill them in based on the average ratio
between the `Daily tests` and `Daily cases` columns for each country. We'll first calculate the average ratio for each
country, and then we'll use it to fill in the missing values. This method should leave no missing values behind.

In [197]:
avg_ratios = df.groupby('Entity').apply(ut.compute_avg_ratio)

df['Daily tests'] = df.apply(lambda x: ut.fill_na_based_on_ratio(x, avg_ratios), axis=1)

## Add columns 

We will add some additional columns that we will need later.

The additional columns are:
1. 'Tests' - The total number of tests.
2. 'Daily cases' - The number of new cases per day.
3. 'Daily deaths' - The number of new deaths per day.
4. 'Daily positivity rate (%)' - The positivity rate of the tests per day.
5. 'Daily mortality rate (%)' - The mortality rate of the cases per day.
6. 'Positivity rate (%)' - The positivity rate of the tests.
7. 'Mortality rate (%)' - The mortality rate of the cases.

In [198]:
df['Daily tests'] = df['Daily tests'].round(0)
df['Tests'] = df.groupby('Entity')['Daily tests'].cumsum()
df['Positivity rate (%)'] = (df['Cases'] / df['Tests'] * 100).round(2).fillna(0)
df['Mortality rate (%)'] = (df['Deaths'] / df['Cases'] * 100).round(2).fillna(0)

In [0]:
df.to_csv("../data/pp_data.csv", index=False)