In [308]:
import pandas as pd
import numpy as np

In [309]:
pd.options.display.float_format = '{:.2f}'.format

In [310]:
df = pd.read_csv("../data/data.csv")

## Data inspection

In [311]:
df

Unnamed: 0,Entity,Continent,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Date,Daily tests,Cases,Deaths
0,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-25,8.00,,
1,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-26,5.00,,
2,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-27,4.00,,
3,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-28,1.00,,
4,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-29,8.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-24,1804.00,35960.00,1456.00
38468,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-25,2965.00,35994.00,1458.00
38469,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-26,,36044.00,1463.00
38470,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-27,,36058.00,1463.00


In [312]:
df.describe()

Unnamed: 0,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
count,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,38472.0,30577.0,38218.0,34862.0
mean,23.74,20.21,17.72,3.17,2.09,19002.33,48969829.03,32.75,10.66,39440.59,287902.66,8090.5
std,26.06,61.07,8.13,2.56,1.52,22271.11,142725118.68,8.47,6.77,150184.66,1405242.87,29548.75
min,-40.9,-106.35,-2.0,0.2,0.02,411.6,341284.0,16.0,1.0,-239172.0,1.0,1.0
25%,8.62,-3.44,11.0,1.4,0.82,3659.0,4793900.0,27.0,5.0,1505.0,2074.0,77.0
50%,27.51,21.82,20.0,2.5,1.89,8821.8,11484636.0,32.0,8.0,5520.0,21431.0,527.0
75%,45.94,47.48,25.0,4.49,3.21,25946.2,42862958.0,41.0,16.0,20382.0,137377.0,3480.5
max,64.96,179.41,29.0,13.05,7.52,114704.6,1339180127.0,48.0,28.0,2945871.0,28605669.0,513091.0


In [313]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38472 entries, 0 to 38471
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Entity                           38472 non-null  object 
 1   Continent                        38472 non-null  object 
 2   Latitude                         38472 non-null  float64
 3   Longitude                        38472 non-null  float64
 4   Average temperature per year     38472 non-null  int64  
 5   Hospital beds per 1000 people    38472 non-null  float64
 6   Medical doctors per 1000 people  38472 non-null  float64
 7   GDP/Capita                       38472 non-null  float64
 8   Population                       38472 non-null  int64  
 9   Median age                       38472 non-null  int64  
 10  Population aged 65 and over (%)  38472 non-null  int64  
 11  Date                             38472 non-null  object 
 12  Daily tests       

### Observations

From an initial inspection we can observe that:

* The `Daily tests` column has a lot of missing values.
* The `Cases` and `Deaths` columns have some missing values.
* The `Daily tests` column has some negative values, which would be impossible.

## Handle Negative values

To handle the negative values we'll simply replace them with `NaN` for the time being.

In [314]:
df.loc[df['Daily tests'] < 0, 'Daily tests'] = np.nan

## Handle missing values

### Replace with 0

In the `Deaths` and `Cases` columns there the missing values, but they take up only the first few rows of each 
country. Therefore, we can safely assume that the missing values are 0, so we'll replace them. 

In the `Daily tests` column there are a lot more missing values, and they're scattered throughout the dataset for 
each country. Assuming they're all 0 is therefore not an option.

In [315]:
df['Cases'].fillna(0, inplace=True)
df['Deaths'].fillna(0, inplace=True)

### Add columns

We will add some additional columns that we will need later.

In [316]:
df['Daily cases'] = df.groupby('Entity')['Cases'].diff().fillna(0)
df['Daily deaths'] = df.groupby('Entity')['Deaths'].diff().fillna(0)

### Interpolate



First we'll try to interpolate some of the missing values. We'll use a linear interpolation, and we'll limit the
interpolation to 3 consecutive missing values.

In [317]:
df['Daily tests'] = (df.groupby('Entity')['Daily tests']
                     .transform(lambda x: x.interpolate(method='linear', limit_area='inside', limit=5)))

df

Unnamed: 0,Entity,Continent,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Date,Daily tests,Cases,Deaths,Daily cases,Daily deaths
0,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-25,8.00,0.00,0.00,0.00,0.00
1,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-26,5.00,0.00,0.00,0.00,0.00
2,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-27,4.00,0.00,0.00,0.00,0.00
3,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-28,1.00,0.00,0.00,0.00,0.00
4,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.20,2873457,38,14,2020-02-29,8.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-24,1804.00,35960.00,1456.00,50.00,8.00
38468,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-25,2965.00,35994.00,1458.00,34.00,2.00
38469,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-26,,36044.00,1463.00,50.00,5.00
38470,Zimbabwe,Africa,-19.02,29.15,20,1.70,0.08,1464.00,16529904,19,3,2021-02-27,,36058.00,1463.00,14.00,0.00


### Find edge-cases

We want to find the percentage of missing values for each country, so we can find the edge-cases.

In [318]:
nan_df = (df[['Entity', 'Daily tests']]
          .groupby('Entity')
          .apply(lambda x: x.isna().mean() * 100)
          .sort_values(by='Daily tests', ascending=False))
nan_df

Unnamed: 0_level_0,Entity,Daily tests
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,0.00,96.49
Oman,0.00,83.02
Vietnam,0.00,73.45
Mongolia,0.00,70.22
Belarus,0.00,69.21
...,...,...
Slovenia,0.00,0.80
Denmark,0.00,0.80
United Arab Emirates,0.00,0.76
Switzerland,0.00,0.75


### Drop the edge-cases

We will treat all countries with more than 40% missing values in the `Daily tests` column as edge-cases and drop them.

In [319]:
countries_to_drop = nan_df[(nan_df['Daily tests'] > 40)].index.tolist()

df = df[~df['Entity'].isin(countries_to_drop)]

In [320]:
def compute_avg_ratio(group):
    return group['Daily tests'].mean() / group['Daily cases'].mean()

In [321]:
# Compute the average ratio for each 'Entity'
avg_ratios = df.groupby('Entity').apply(compute_avg_ratio)

# Define a function to fill NaN values based on the computed average ratio
def fill_na_based_on_ratio(row):
    if pd.isna(row['Daily tests']):
        multiplier = row['Daily cases'] if row['Daily cases'] > 1 else 1
        return avg_ratios[row['Entity']] * multiplier
    return row['Daily tests']

## Add columns 

We will add some additional columns that we will need later.

The additional columns are:
1. 'Tests' - The total number of tests.
2. 'Daily cases' - The number of new cases per day.
3. 'Daily deaths' - The number of new deaths per day.
4. 'Daily positivity rate (%)' - The positivity rate of the tests per day.
5. 'Daily mortality rate (%)' - The mortality rate of the cases per day.
6. 'Positivity rate (%)' - The positivity rate of the tests.
7. 'Mortality rate (%)' - The mortality rate of the cases.

In [322]:
df['Daily tests'] = df['Daily tests'].round(0)

df['Tests'] = df.groupby('Entity')['Daily tests'].cumsum()
df['Daily positivity rate (%)'] = (df['Daily cases'] / df['Daily tests'] * 100).round(2).fillna(0)
df['Daily mortality rate (%)'] = (df['Daily deaths'] / df['Daily cases'] * 100).round(2).fillna(0)
df['Positivity rate (%)'] = (df['Cases'] / df['Tests'] * 100).round(2).fillna(0)
df['Mortality rate (%)'] = (df['Deaths'] / df['Cases'] * 100).round(2).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Daily tests'] = df['Daily tests'].round(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tests'] = df.groupby('Entity')['Daily tests'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Daily positivity rate (%)'] = (df['Daily cases'] / df['Daily tests'] * 100).round(2).fillna(0)


In [323]:
df.to_csv("../data/pp_data.csv", index=False)