In [1]:
import os
import glob
import itertools

import numpy as np
import pandas as pd

from pptx import Presentation

In [2]:
# Municipality reference table. process_tables() reads its `municipio` and
# `departamento` columns to work out which departamento a slide belongs to.
muni_df = pd.read_csv(filepath_or_buffer='other/sdsn/sdsn.gen.csv')

In [3]:
# Only pick up PowerPoint files: any other file dropped into ppt/ (notes,
# .DS_Store, exports, ...) would otherwise be handed to Presentation() later
# and abort the whole run. Sorting keeps the processing order deterministic
# (names start with the date, so this is also chronological).
file_names = sorted(glob.glob('ppt/*.pptx'))

file_names

['ppt/2021.10.23-1ra.pptx',
 'ppt/2021.10.23-2da.pptx',
 'ppt/2021.10.30-1ra.pptx',
 'ppt/2021.10.30-2da.pptx',
 'ppt/2021.11.13-1ra.pptx',
 'ppt/2021.11.13-2da.pptx',
 'ppt/2021.11.20-1ra.pptx',
 'ppt/2021.11.20-2da.pptx',
 'ppt/2021.11.27-1ra.pptx',
 'ppt/2021.11.27-2da.pptx']

In [4]:
def get_tables(doc_data):
    """Collect the percentage rows found on each slide of a presentation.

    A table row is kept only when it has exactly two cells and the second
    cell's text contains a '%' sign; every other row (headers, wider tables)
    is ignored. Rows from all tables on the same slide are merged into one
    list, in shape order.

    Parameters
    ----------
    doc_data : object exposing ``.slides``; each slide exposes ``.shapes``,
        and table shapes expose ``.table.rows`` / ``row.cells`` /
        ``cell.text_frame.text`` (a python-pptx ``Presentation`` in this
        notebook).

    Returns
    -------
    list[list[list[str]]]
        One entry per slide that produced at least one kept row; each entry
        is a list of ``[first_cell_text, second_cell_text]`` pairs.
    """
    per_slide_rows = []

    for slide in doc_data.slides:
        kept_rows = []

        for shape in slide.shapes:
            if not shape.has_table:
                continue

            for row in shape.table.rows:
                cell_texts = [cell.text_frame.text for cell in row.cells]
                if len(cell_texts) == 2 and '%' in cell_texts[1]:
                    kept_rows.append(cell_texts)

        # Slides without any qualifying row contribute nothing.
        if kept_rows:
            per_slide_rows.append(kept_rows)

    return per_slide_rows

def process_tables(doc_tables, municipios=None):
    """Build one frame from per-slide row lists, tagging each with a departamento.

    For every slide, the municipality names are looked up in the reference
    table and the most frequent ``departamento`` among the matches is
    assigned to all rows of that slide.

    Parameters
    ----------
    doc_tables : list of list of [municipio, porcentaje] pairs
        Output of ``get_tables``: one inner list per slide.
    municipios : pandas.DataFrame, optional
        Reference table with ``municipio`` and ``departamento`` columns.
        Defaults to the notebook-level ``muni_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``municipio``, ``porcentaje``, ``departamento``. Each slide
        keeps its own 0..n-1 index (indices repeat across slides). An empty
        frame is returned when ``doc_tables`` is empty.
    """
    reference = muni_df if municipios is None else municipios

    frames = []
    for rows in doc_tables:
        if not rows:
            continue

        table = pd.DataFrame(rows, columns=['municipio', 'porcentaje'])

        matched = reference.loc[
            reference['municipio'].isin(table['municipio']), 'departamento'
        ]
        # value_counts() sorts by descending frequency, so the first label is
        # the dominant departamento for this slide.
        counts = matched.value_counts()

        # When no name on the slide is known to the reference table there is
        # nothing to pick from; leave the column empty instead of raising
        # IndexError and aborting the whole document.
        table['departamento'] = counts.index[0] if len(counts) else None

        frames.append(table)

    if not frames:
        return pd.DataFrame([])

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(frames)

In [5]:
# One frame per presentation; they are concatenated once after the loop
# instead of re-copying a growing frame on every iteration.
doc_frames = []

for file_name in file_names:
    base_name = os.path.basename(file_name)

    # The dose marker is looked up in the file name only, so a directory
    # component can never be mistaken for it. Checked before parsing so a
    # badly named file fails fast.
    if '1ra' in base_name:
        dosis = 1
    elif '2da' in base_name:
        dosis = 2
    else:
        # Must raise an exception instance: raising a bare string is itself
        # a TypeError and hides the real problem.
        raise ValueError('nombre sin formato')

    doc_table = process_tables(get_tables(Presentation(file_name)))
    if doc_table.empty:
        # Presentation without any percentage table: nothing to add.
        continue

    doc_frames.append(
        doc_table.assign(
            # File names look like '<date>-<dose>.pptx'; the date is the part
            # before the first '-'.
            fecha=pd.to_datetime(base_name.split('-')[0]),
            dosis=dosis,
        )
    )

# pd.concat raises ValueError here if no presentation yielded any rows.
df = (
    pd.concat(doc_frames)
    # departamento is left out on purpose: it is provided by the merge with
    # edu_merger further down.
    .loc[:, ['fecha', 'municipio', 'dosis', 'porcentaje']]
    .assign(
        # Drop the percent sign wherever it sits and trim stray whitespace,
        # rather than assuming '%' is always the very last character.
        porcentaje=lambda frame: frame['porcentaje']
        .str.replace('%', '', regex=False)
        .str.strip()
    )
)
df

Unnamed: 0,fecha,municipio,dosis,porcentaje
0,2021-10-23,Aucapata,1,10
1,2021-10-23,Caquiaviri,1,11
2,2021-10-23,Tito Yupanqui (Parquipujio),1,11
3,2021-10-23,Ancoraimes,1,12
4,2021-10-23,Calacoto,1,13
...,...,...,...,...
30,2021-11-27,Machacamarca,2,61
31,2021-11-27,Curahuara De Carangas,2,63
32,2021-11-27,Oruro,2,66
33,2021-11-27,Antequera,2,82


In [6]:
# Municipality lookup (c_ut code, municipio, departamento) read from a gist
# URL that includes a specific revision hash; the first CSV column is used as
# the index.
edu_merger = pd.read_csv(
    "https://gist.githubusercontent.com/pr0nstar/df9829a59214fea0c7f9b70913c0af2c/raw/d872c1c53fc6e1ae7ff64c6fa06756b6ff2ae06a/vac.muni.csv",
    index_col=0,
)
edu_merger

Unnamed: 0,c_ut,municipio,departamento
0,10101,Sucre,Chuquisaca
1,10102,Yotala,Chuquisaca
2,10103,Poroma,Chuquisaca
3,10201,Azurduy,Chuquisaca
4,10202,Tarvita,Chuquisaca
...,...,...,...
334,90401,Nacebe (Santa Rosa Del Abuna),Pando
335,90402,Ingavi (Humaita),Pando
336,90501,Nuevo Manoa (Nueva Esperanza),Pando
337,90502,Villa Nueva (Loma Alta),Pando


In [7]:
# Attach the municipality code and departamento by matching on the name,
# expose the code as `cod_ine`, and fix the final column order.
# NOTE(review): this is an inner join on `municipio` alone - rows whose name
# is absent from edu_merger are dropped, and a name listed more than once in
# edu_merger would duplicate rows. Confirm both are acceptable.
df = (
    df.merge(edu_merger, on="municipio")
    .rename(columns={"c_ut": "cod_ine"})
    .loc[:, ["fecha", "cod_ine", "municipio", "departamento", "dosis", "porcentaje"]]
)
df

Unnamed: 0,fecha,cod_ine,municipio,departamento,dosis,porcentaje
0,2021-10-23,20503,Aucapata,La Paz,1,10
1,2021-10-23,20503,Aucapata,La Paz,2,9
2,2021-10-30,20503,Aucapata,La Paz,1,10
3,2021-10-30,20503,Aucapata,La Paz,2,9
4,2021-11-13,20503,Aucapata,La Paz,1,13
...,...,...,...,...,...,...
3340,2021-11-13,70706,Camiri,Santa Cruz,2,92
3341,2021-11-20,70706,Camiri,Santa Cruz,1,97
3342,2021-11-20,70706,Camiri,Santa Cruz,2,92
3343,2021-11-27,70706,Camiri,Santa Cruz,1,98


In [8]:
# def get_codines(data, mundict):
#     municipios = pd.read_csv(mundict)
#     # df = pd.read_csv(data, index_col=0)
#     # df = df
#     df['dm'] = df.apply(lambda x: '{} - {}'.format(x['departamento'].lower().strip(), x['municipio'].lower().strip()), axis=1)
#     df['cod_ine'] = df.dm.map(municipios.set_index('dm').cod_ine.to_dict())
#     df['porcentaje'] = df['porcentaje'].apply(lambda x: float(x.replace('%', '')) / 100)
#     return df.drop(columns=['dm'])

# dfx = get_codines(data=df, mundict="other/mundict.csv")

# display(dfx)
# dfx[dfx.municipio=="Santivañez"]

In [9]:
# Display the merged frame once more (same content as the merge cell's output).
df

Unnamed: 0,fecha,cod_ine,municipio,departamento,dosis,porcentaje
0,2021-10-23,20503,Aucapata,La Paz,1,10
1,2021-10-23,20503,Aucapata,La Paz,2,9
2,2021-10-30,20503,Aucapata,La Paz,1,10
3,2021-10-30,20503,Aucapata,La Paz,2,9
4,2021-11-13,20503,Aucapata,La Paz,1,13
...,...,...,...,...,...,...
3340,2021-11-13,70706,Camiri,Santa Cruz,2,92
3341,2021-11-20,70706,Camiri,Santa Cruz,1,97
3342,2021-11-20,70706,Camiri,Santa Cruz,2,92
3343,2021-11-27,70706,Camiri,Santa Cruz,1,98


In [10]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs("datos", exist_ok=True)

# The index is written too (no index=False), so the file starts with an
# unnamed index column.
df.to_csv("datos/vacunacion_municipios_bo.csv")