# Rickettsiosis - Data Wrangling

### Import libraries

In [7]:
import numpy as np 
import pandas as pd 

### Data Wrangling

**2016**

In [64]:
# Build one cleaned DataFrame per sheet of the 2016 workbook and collect them
# in `dataframes` (concatenated and written to CSV in a later cell).
DATA2016 = '../data/2016.xlsx'
xls = pd.ExcelFile(DATA2016)

weeks = xls.sheet_names    # one entry per sheet; presumably named like 'sem<N>' -- TODO confirm

dataframes = []    # collects the cleaned per-sheet DataFrames

# Positional column index -> name for the columns that are kept.
col_names = {0: 'state', 5: 'week_cases', 6: 'm_accum', 7: 'f_accum'}

for week in weeks:
    df = (
        pd.read_excel(xls, sheet_name=week, header=None)
        .drop(columns=[1, 2, 3, 4, 8])                     # delete unnecessary columns
        .drop(index=32)                                    # delete "Total" row
        .assign(week=week.split('sem')[-1], year=2016)     # text after 'sem' in the sheet name
        .rename(columns=col_names)
        # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
        .replace('-', np.nan)                              # '-' marks a missing value
    )
    df = df[['state', 'year', 'week', 'week_cases', 'm_accum', 'f_accum']]
    dataframes.append(df)

In [65]:
# Stack the per-sheet frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index instead of repeating each sheet's row labels.
combined_dataframes = pd.concat(dataframes, ignore_index=True)
combined_dataframes.to_csv('../data/csv/2016.csv', index=False)
# Preview only the first rows instead of dumping up to 500 rows into the output.
combined_dataframes.head(10)

Unnamed: 0,state,year,week,week_cases,m_accum,f_accum
0,Aguascalientes,2016,1,,,
1,Baja California,2016,1,,,
2,Baja California Sur,2016,1,,,
3,Campeche,2016,1,,,
4,Coahuila,2016,1,,,
5,Colima,2016,1,,,
6,Chiapas,2016,1,,,
7,Chihuahua,2016,1,,,
8,Ciudad de México,2016,1,,,
9,Durango,2016,1,,,


**2017**

In [60]:
# Build one cleaned DataFrame per sheet of the 2017 workbook and collect them
# in `listings`.
DATA2017 = '../data/2017.xlsx'
xls = pd.ExcelFile(DATA2017)

semanas = xls.sheet_names

# Positional column index -> name for the columns that are kept.
col_names = {0: 'estado', 5: 'sem', 6: 'm_acum', 7: 'f_acum', 8: '2016_acum'}

listings = []    # one cleaned DataFrame per sheet

for semana in semanas:
    listing = (
        pd.read_excel(xls, sheet_name=semana, header=None)       # read a sheet
        .drop(columns=[1, 2, 3, 4])                              # delete unnecessary columns
        .assign(no_sem=semana.split('sem')[-1], anio=2017)       # add 'no_sem' and 'anio' columns
        .rename(columns=col_names)                               # rename columns
        .replace('-', 0)                                         # replace '-' with 0
    )
    listings.append(listing)

In [61]:
# Stack the per-sheet frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index instead of repeating each sheet's row labels.
combined_listings = pd.concat(listings, ignore_index=True)
combined_listings.to_csv('../data/csv/2017.csv', index=False)
combined_listings.head()

Unnamed: 0,estado,sem,m_acum,f_acum,2016_acum,no_sem,anio
0,Aguascalientes,0,0,0,0,1,2017
1,Baja California,0,0,0,0,1,2017
2,Baja California Sur,0,0,0,0,1,2017
3,Campeche,0,0,0,0,1,2017
4,Coahuila,0,0,0,0,1,2017


**2015**

**Semana 1 - hasta la semana epidemiológica 53 del 2014**

In [83]:
# Clean the 'sem1+' sheet of the 2015 workbook and save it as the 2014 CSV
# (tagged as week 53 of 2014).
DATA2015 = '../data/2015.xlsx'
xls = pd.ExcelFile(DATA2015)

# Positional column index -> name for the columns that are kept.
col_names = {0: 'estado', 4: 'sem', 5: 'm_acum', 6: 'f_acum', 7: '2013_acum'}

# Select the sheet by name from the already-open workbook rather than relying
# on it being the first sheet of the file.
dfw1 = (
    pd.read_excel(xls, sheet_name='sem1+', header=None)    # DataFrame week 1
    .drop(columns=[1, 2, 3])
    .rename(columns=col_names)
    .assign(no_sem=53, anio=2014)
    .replace('-', 0)                                       # replace '-' with 0
)
dfw1.to_csv('../data/csv/2014.csv', index=False)

**Semana 15 - No hay información** 

**Las demás semanas**

In [84]:
# Build one cleaned DataFrame per remaining sheet of the 2015 workbook and
# collect them in `dataframes`.
DATA2015 = '../data/2015.xlsx'
xls = pd.ExcelFile(DATA2015)

semanas = xls.sheet_names
semanas.remove('sem1+')     # handled separately (different column layout)
semanas.remove('sem15+')    # skipped: no information for this week

dataframes = []                    # create empty list to collect DataFrames

# Column 8 is labelled '2014_acum' here, matching the header of the saved output.
col_names = {0: 'estado', 5: 'sem', 6: 'm_acum', 7: 'f_acum', 8: '2014_acum'}

for semana in semanas:
    dataframe = pd.read_excel(xls, sheet_name=semana, header=None)    # read a sheet
    dataframe = dataframe.drop(columns=[1, 2, 3, 4])                  # delete unnecessary columns
    dataframe['no_sem'] = semana.split('sem')[-1]                     # add 'no_sem' column
    dataframe['anio'] = 2015                                          # add 'anio' column
    dataframe = dataframe.rename(columns=col_names)                   # rename columns
    dataframe = dataframe.replace('-', 0)                             # replace '-' with 0
    # Append the frame built in THIS iteration (previously a leftover variable
    # from another cell was appended instead).
    dataframes.append(dataframe)

In [85]:
# Stack the per-sheet frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index instead of repeating each sheet's row labels.
combined_dataframes = pd.concat(dataframes, ignore_index=True)
combined_dataframes.to_csv('../data/csv/2015.csv', index=False)
combined_dataframes.head()

Unnamed: 0,estado,sem,m_acum,f_acum,2014_acum,no_sem,anio
0,Aguascalientes,0,0,0,0,52,2015
1,Baja California,0,7,7,0,52,2015
2,Baja California Sur,0,1,8,0,52,2015
3,Campeche,0,0,3,0,52,2015
4,Coahuila,0,4,2,0,52,2015


In [None]:
# estado sem year incidencia male female