# Processing Raw Data Collected From NCDC
Steps used to clean and process the raw data collected from [NCDC](https://covid19.ncdc.gov.ng/)

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [25]:
# Load the raw export and discard the stray trailing column
data = pd.read_csv('./raw_data/data.csv').drop(columns='Unnamed: 10')

# Strip surrounding whitespace from every column header
data = data.rename(columns=str.strip)

# Drop unnecessary columns; they can either be derived from the remaining
# ones or were reported inconsistently
data = data.drop(
    columns=[
        'confirmed new',
        'discharged new',
        'death new',
        'active cases',
        'days since last reported case',
    ]
)

# Strip surrounding whitespace from the values of the text columns
for col in ('state', 'discharged case', 'death case'):
    data[col] = data[col].str.strip()

# Turn the placeholders used for unknown values into real NaNs
data = (
    data.replace('Nan', np.nan)
    .replace('nan', np.nan)
    .replace('-', np.nan)
)

# Parse the count columns as numbers
num_cols = ['confirmed case', 'discharged case', 'death case']
for col in num_cols:
    data[col] = pd.to_numeric(data[col])

# Parse the day-month-year strings into datetimes
data = data.assign(date=pd.to_datetime(data['date'], format='%d-%m-%Y'))

# Expand the truncated state names to their full names
data = data.replace('Akwa', 'Akwa Ibom').replace('Cross', 'Cross River')

# Remove the first row whose state is 'Abuja'
data = data.drop(index=data[data['state'] == 'Abuja'].index[0])

In [27]:
# Dates missing from the raw data -> '2020-03-20', '2020-03-27'

from datetime import datetime, timedelta

# Empty DataFrame indexed by every calendar day from 2020-02-29 up to
# yesterday; joining against it makes up for the missing dates.
# The end date is formatted as ISO YYYY-MM-DD on purpose: a DD-MM-YYYY string
# is ambiguous, and pandas reads it month-first whenever the day is <= 12,
# which would end the range on the wrong date.
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

dates = pd.date_range('2020-02-29', yesterday)
date_df = pd.DataFrame(index=dates)

In [28]:
# Sum the counts per (date, state), then pivot the states into columns so
# that each row is one date.
# NOTE(review): sum() returns 0 for a group whose values are all NaN, so an
# unknown count becomes 0 here instead of being forward-filled later --
# confirm this is intended (sum(min_count=1) would keep it as NaN).
covid = (
    data.groupby(['date', 'state'])
    .sum()
    .sort_values('date')
    .unstack('state')
    .sort_index()
)

# Seed the first row with zeros so the forward fill below has a starting value
covid.iloc[0] = covid.iloc[0].fillna(0)

# Combine with the date dataframe to get a row for every date, then forward
# fill NaNs with the previous value. DataFrame.ffill() is used because
# fillna(method='ffill') is deprecated in recent pandas releases.
confirmed = date_df.join(covid['confirmed case']).ffill()
deaths = date_df.join(covid['death case']).ffill()
discharged = date_df.join(covid['discharged case']).ffill()

In [29]:
def save_to_csv(df: pd.DataFrame, name: str) -> None:
    """Save a DataFrame to ``<name>.csv`` with integer values and a date column.

    Every column is cast to ``int`` and the index is written out as a trailing
    ``date`` column; the index itself is not written. The work is done on a
    copy, so the caller's DataFrame is left untouched and the function can be
    called again on the same frame.

    Params:
        df: DataFrame, dataframe to save; its index supplies the ``date`` column.
        name: str, name of the csv to save DataFrame to, without the ``.csv``
            extension.

    Raises:
        Whatever ``DataFrame.astype(int)`` raises when a column cannot be
        converted to int (for example when it still contains NaN).
    """
    # astype returns a new frame, so the int conversion never leaks back into df
    out = df.astype(int)
    out['date'] = out.index
    out.to_csv(f'{name}.csv', index=False)

In [30]:
# Write each table to a CSV file of the same name
for frame, label in (
    (deaths, 'deaths'),
    (confirmed, 'confirmed'),
    (discharged, 'discharged'),
):
    save_to_csv(frame, label)

In [33]:
# Reload one of the saved files, parsing the 'date' column and using it as the index
pd.read_csv('discharged.csv', parse_dates=['date'], index_col='date')

Unnamed: 0_level_0,Abia,Adamawa,Akwa Ibom,Anambra,Bauchi,Bayelsa,Benue,Borno,Cross River,Delta,...,Ogun,Ondo,Osun,Oyo,Plateau,Rivers,Sokoto,Taraba,Yobe,Zamfara
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-07,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-08,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-03-09,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
