# Rickettsiosis - Data Wrangling

### Import libraries

In [3]:
import numpy as np 
import pandas as pd 

### Data Wrangling

**2016**

In [81]:
DATA2016 = '../data/2016.xlsx'
xls = pd.ExcelFile(DATA2016)

# Every sheet in the workbook is processed; the week label is taken from the
# sheet name (the text that follows 'sem').
weeks = xls.sheet_names

dataframes = []    # cleaned per-sheet frames, collected for concatenation

# Positional columns kept from each headerless sheet -> output column names.
col_names = {0: 'state', 5: 'week_cases', 6: 'm_accum', 7: 'f_accum'}

for week in weeks:
    df = (
        pd.read_excel(xls, sheet_name=week, header=None)
        .drop([1, 2, 3, 4, 8], axis=1)    # delete unnecessary columns
        .drop(32)                         # delete "Total" row
        # '-' marks a missing value. Use np.nan: the np.NaN alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        .replace('-', np.nan)
        .rename(columns=col_names)
    )
    df['week'] = week.split('sem')[-1]
    df['year'] = 2016
    # Fix the column order shared by all yearly CSV files.
    df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
    dataframes.append(df)

In [82]:
# Stack the weekly frames row-wise into one table for 2016, write it to CSV
# without the row index, and preview the first rows.
combined_dataframes = pd.concat(dataframes, axis=0)
combined_dataframes.to_csv(path_or_buf='../data/csv/2016.csv', index=False)
combined_dataframes.head(5)

Unnamed: 0,state,year,week,week_cases,m_accum,f_accum
0,Aguascalientes,2016,1,,,
1,Baja California,2016,1,,,
2,Baja California Sur,2016,1,,,
3,Campeche,2016,1,,,
4,Coahuila,2016,1,,,


**2017**

In [83]:
DATA2017 = '../data/2017.xlsx'
xls = pd.ExcelFile(DATA2017)

# Every sheet in the workbook is processed; the week label is taken from the
# sheet name (the text that follows 'sem').
weeks = xls.sheet_names

dataframes = []    # cleaned per-sheet frames, collected for concatenation

# Positional columns kept from each headerless sheet -> output column names.
col_names = {0: 'state', 5: 'week_cases', 6: 'm_accum', 7: 'f_accum'}

for week in weeks:
    df = (
        pd.read_excel(xls, sheet_name=week, header=None)
        .drop([1, 2, 3, 4, 8], axis=1)    # delete unnecessary columns
        .drop(32)                         # delete "Total" row
        # '-' marks a missing value. Use np.nan: the np.NaN alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        .replace('-', np.nan)
        .rename(columns=col_names)
    )
    df['week'] = week.split('sem')[-1]
    df['year'] = 2017
    # Fix the column order shared by all yearly CSV files.
    df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
    dataframes.append(df)

In [84]:
# Stack the weekly frames row-wise into one table for 2017, write it to CSV
# without the row index, and preview the first rows.
combined_dataframes = pd.concat(dataframes, axis=0)
combined_dataframes.to_csv(path_or_buf='../data/csv/2017.csv', index=False)
combined_dataframes.head(5)

Unnamed: 0,state,year,week,week_cases,m_accum,f_accum
0,Aguascalientes,2017,1,,,
1,Baja California,2017,1,,,
2,Baja California Sur,2017,1,,,
3,Campeche,2017,1,,,
4,Coahuila,2017,1,,,


**2015**

**Semana 1 (week 1) - equivalent to the last week of 2014, stored as week 53 of 2014**

In [85]:
DATA2015 = '../data/2015.xlsx'
xls = pd.ExcelFile(DATA2015)

# Positional columns kept from the headerless sheet -> output column names.
# The positions used here (4, 5, 6) differ from the 5, 6, 7 used for the
# other 2015 sheets.
col_names = {0: 'state', 4: 'week_cases', 5: 'm_accum', 6: 'f_accum'}

# No sheet_name is given, so the first sheet of the workbook is read.
# Reading through the already-open `xls` avoids parsing the file twice.
df = (
    pd.read_excel(xls, header=None)
    .drop([1, 2, 3, 7], axis=1)    # drop the columns that are not kept
    .drop(32)                      # presumably the "Total" row, as in the other cells
    # '-' marks a missing value. Use np.nan: the np.NaN alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    .replace('-', np.nan)
    .rename(columns=col_names)
)
# These rows are recorded as week 53 of 2014 rather than week 1 of 2015.
df['week'] = 53
df['year'] = 2014
df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
# NOTE(review): the 2014 section of this notebook writes its own frame to
# this same path, which replaces this file when the cells run top to bottom.
# Confirm the week-53 rows are merged into the final data as intended.
df.to_csv('../data/csv/2014.csv', index=False)

**Semana 15 - no data** 

**Las demás semanas (all remaining weeks)**

In [95]:
DATA2015 = '../data/2015.xlsx'
xls = pd.ExcelFile(DATA2015)

# Process every sheet except the two excluded here. list.remove raises
# ValueError if a name is missing, which guards against a renamed sheet.
weeks = xls.sheet_names
weeks.remove('sem1+')     # handled separately (recorded as week 53 of 2014)
weeks.remove('sem15+')    # no data for this week

dataframes = []    # cleaned per-sheet frames, collected for concatenation

# Positional columns kept from each headerless sheet -> output column names.
col_names = {0: 'state', 5: 'week_cases', 6: 'm_accum', 7: 'f_accum'}

for week in weeks:
    df = (
        pd.read_excel(xls, sheet_name=week, header=None)
        .drop([1, 2, 3, 4, 8], axis=1)    # delete unnecessary columns
        .drop(32)                         # delete "Total" row
        # '-' marks a missing value. Use np.nan: the np.NaN alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        .replace('-', np.nan)
        .rename(columns=col_names)
    )
    df['week'] = week.split('sem')[-1]
    df['year'] = 2015
    # Fix the column order shared by all yearly CSV files.
    df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
    dataframes.append(df)

In [96]:
# Stack the weekly frames row-wise into one table for 2015, write it to CSV
# without the row index, and preview the first rows.
combined_dataframes = pd.concat(dataframes, axis=0)
combined_dataframes.to_csv(path_or_buf='../data/csv/2015.csv', index=False)
combined_dataframes.head(5)

Unnamed: 0,state,year,week,week_cases,m_accum,f_accum
0,Aguascalientes,2015,2,,,
1,Baja California,2015,2,,,
2,Baja California Sur,2015,2,,,
3,Campeche,2015,2,,,
4,Coahuila,2015,2,,,


**2014**

**Semana 1 - no data** 

In [7]:
DATA2014 = '../data/2014.xlsx'
xls = pd.ExcelFile(DATA2014)

# Process every sheet except 'sem1+', which has no data. list.remove raises
# ValueError if the name is missing, which guards against a renamed sheet.
weeks = xls.sheet_names
weeks.remove('sem1+')

dataframes = []    # cleaned per-sheet frames, collected for concatenation

# Positional columns kept from each headerless sheet -> output column names.
# The 2014 workbook uses positions 4, 5, 6 (not 5, 6, 7 as in 2016/2017).
col_names = {0: 'state', 4: 'week_cases', 5: 'm_accum', 6: 'f_accum'}

for week in weeks:
    df = (
        pd.read_excel(xls, sheet_name=week, header=None)
        .drop([1, 2, 3, 7], axis=1)    # drop the columns that are not kept
        .drop(32)                      # presumably the "Total" row, as in the other years
        # '-' marks a missing value. Use np.nan: the np.NaN alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        .replace('-', np.nan)
        .rename(columns=col_names)
    )
    df['week'] = week.split('sem')[-1]
    df['year'] = 2014
    # Fix the column order shared by all yearly CSV files.
    df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
    dataframes.append(df)

In [8]:
# Stack the weekly frames row-wise into one table for 2014, write it to CSV
# without the row index, and preview the first rows.
# NOTE(review): the 2015 "Semana 1" cell earlier in this notebook writes its
# week-53 rows to this same path; this write replaces that file. Confirm
# whether those rows should be included here.
combined_dataframes = pd.concat(dataframes, axis=0)
combined_dataframes.to_csv(path_or_buf='../data/csv/2014.csv', index=False)
combined_dataframes.head(5)

Unnamed: 0,state,year,week,week_cases,m_accum,f_accum
0,Aguascalientes,2014,2,,,
1,Baja California,2014,2,2.0,1.0,1.0
2,Baja California Sur,2014,2,,,
3,Campeche,2014,2,,,
4,Coahuila,2014,2,,,


## Append all CSV files

In [9]:
# Load the four per-year CSV files, in chronological order.
csv2014, csv2015, csv2016, csv2017 = map(
    pd.read_csv,
    (
        '../data/csv/2014.csv',
        '../data/csv/2015.csv',
        '../data/csv/2016.csv',
        '../data/csv/2017.csv',
    ),
)

In [15]:
# Stack the yearly tables in chronological order. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same
# result (original row labels are kept, column order unchanged).
completecsv = pd.concat([csv2014, csv2015, csv2016, csv2017])

In [16]:
# Persist the combined table; the row index is not written to the file.
completecsv.to_csv(path_or_buf='../data/csv/data.csv', index=False)