In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import urllib.request as ur
import json
import datetime
%matplotlib inline

In [2]:
# Load the cleaned Zika report table. The path is relative, so the notebook
# must be run from the directory that contains `Data/`.
# low_memory=False: per the pandas docs, parse the file in one pass instead
# of in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('Data/zika_cleaned_data.csv', low_memory=False)

In [8]:
# Give the two raw columns the shorter names used by every later cell.
df = df.rename(columns=dict(report_date="date", value="zika_cases"))

In [9]:
# Preview the first rows to confirm the renamed columns.
df.head()

Unnamed: 0,date,location,zika_cases
0,2015-11-28,El_Salvador,854
1,2015-11-28,El_Salvador-Ahuachapan,4
2,2015-11-28,El_Salvador-Cabanas,3
3,2015-11-28,El_Salvador-Chalatenango,8
4,2015-11-28,El_Salvador-Cuscatlan,4


In [16]:
# Per-location summary statistics. Intended to be applied to each group of
# `df.groupby('location')`, producing one output row per location.
def divide_groups(df):
    """Summarise the reported Zika cases of a single location.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one location. Must be non-empty and contain the
        columns ``date`` and ``zika_cases``.

    Returns
    -------
    pandas.Series
        cases_first_date    : cases on the earliest report date
        date_first_date     : earliest report date
        cases_first_nonzero : cases on the earliest date with more than zero
                              cases (NaN if the location never had any)
        date_first_nonzero  : earliest date with more than zero cases
                              (NaN if the location never had any)
        cases_max           : largest case count reported
        date_max            : date of the first row that reaches ``cases_max``
        cases_last          : cases on the latest report date
        date_last           : latest report date
        cases_total         : sum of all case counts

    When several rows share the selected date (or the maximum), the first
    matching row in the frame's row order is used.
    """
    # Compute each extreme once and reuse it for both the value and the lookup.
    date_first_date = df.date.min()
    date_last = df.date.max()
    cases_max = df.zika_cases.max()

    cases_first_date = df.loc[df.date == date_first_date, 'zika_cases'].values[0]
    date_max = df.loc[df.zika_cases == cases_max, 'date'].values[0]
    cases_last = df.loc[df.date == date_last, 'zika_cases'].values[0]

    cases_total = df.zika_cases.sum()

    # Restrict to reports with at least one case to find the first occurrence.
    nonzero_reports = df.loc[df.zika_cases > 0]

    if not nonzero_reports.empty:
        date_first_nonzero = nonzero_reports.date.min()
        cases_first_nonzero = nonzero_reports.loc[
            nonzero_reports.date == date_first_nonzero, 'zika_cases'
        ].values[0]
    else:
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
        # AttributeError there.
        cases_first_nonzero = np.nan
        date_first_nonzero = np.nan

    return pd.Series({'cases_first_date': cases_first_date,
                      'date_first_date': date_first_date,
                      'cases_first_nonzero': cases_first_nonzero,
                      'date_first_nonzero': date_first_nonzero,
                      'cases_max': cases_max,
                      'date_max': date_max,
                      'cases_last': cases_last,
                      'date_last': date_last,
                      'cases_total': cases_total})

In [17]:
# Collapse repeated (date, location) rows into one row by summing their
# case counts; only the three columns needed downstream are kept.
df = (
    df.loc[:, ['date', 'location', 'zika_cases']]
      .groupby(by=['date', 'location'], as_index=False)
      .sum()
)

In [18]:
# Build one summary row per location (index = location).
# Only `date` and `zika_cases` are handed to divide_groups: it reads nothing
# else, and selecting them explicitly keeps the grouping column out of the
# applied frames, which pandas >= 2.2 warns about (DeprecationWarning).
df_groups = (df.groupby('location')[['date', 'zika_cases']]
               .apply(divide_groups))

In [19]:
# Locations whose earliest report recorded zero cases.
# NOTE(review): the output lists both 'Argentina-Tierra_Del_Fuego' and
# 'Argentina-Tierra_del_Fuego'; these look like one place split across two
# spellings in the source data. Confirm and normalise upstream if so.
df_groups.loc[df_groups['cases_first_date'].eq(0)].head()

Unnamed: 0_level_0,cases_first_date,date_first_date,cases_first_nonzero,date_first_nonzero,cases_max,date_max,cases_last,date_last,cases_total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Argentina-San_Juan,0,2016-03-19,1.0,2016-04-29,2,2016-05-07,1,2016-06-26,11
Argentina-Tierra_Del_Fuego,0,2016-04-29,,,0,2016-04-29,0,2016-06-26,0
Argentina-Tierra_del_Fuego,0,2016-03-19,,,0,2016-03-19,0,2016-04-16,0
Colombia-Amazonas-El_Encanto,0,2016-01-09,,,0,2016-01-09,0,2016-06-25,0
Colombia-Amazonas-La_Chorrera,0,2016-01-09,,,0,2016-01-09,0,2016-06-25,0


In [20]:
# Build `df_clusters`: one row per location with a `zika_confirmed` flag.
#   * zika_confirmed = 1: the location reported at least one case;
#     `date` is the first date on which the case count was above zero.
#   * zika_confirmed = 0: the location never reported a case;
#     `date` is the first date on which the location was observed.
mask = df_groups.cases_max > 0

clusters_confirmed = (
    df_groups.loc[mask, ['date_first_nonzero']]
    .rename(columns={'date_first_nonzero': 'date'})
    .assign(zika_confirmed=1)
)
clusters_unconfirmed = (
    df_groups.loc[~mask, ['date_first_date']]
    .rename(columns={'date_first_date': 'date'})
    .assign(zika_confirmed=0)
)

# Recombine, order by location, then turn the location index into a column.
df_clusters = (
    pd.concat([clusters_confirmed, clusters_unconfirmed])
    .sort_index()
    .reset_index()
)


In [21]:
# Preview the combined per-location table (location, date, zika_confirmed).
df_clusters.head()

Unnamed: 0,location,date,zika_confirmed
0,Argentina-Buenos_Aires,2016-03-19,1
1,Argentina-CABA,2016-03-19,1
2,Argentina-Catamarca,2016-03-19,1
3,Argentina-Chaco,2016-03-19,1
4,Argentina-Chubut,2016-03-19,1


In [22]:
# Save the per-location table for later use. The path is relative, so the
# file is written to the current working directory; index=False leaves the
# row index out of the CSV.
df_clusters.to_csv('zika_clusters.csv', encoding='utf-8', index=False)

In [23]:
# Sanity check: row count, per-column non-null counts and dtypes.
df_clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1673 entries, 0 to 1672
Data columns (total 3 columns):
location          1673 non-null object
date              1673 non-null object
zika_confirmed    1673 non-null int64
dtypes: int64(1), object(2)
memory usage: 39.3+ KB
