In [1]:
import os
import glob
import itertools

import numpy as np
import pandas as pd

from pptx import Presentation

In [2]:
# Municipality lookup table. process_tables() reads its 'municipio' and
# 'departamento' columns to decide which department each slide belongs to.
muni_df = pd.read_csv('./data/sdsn.gen.csv')

In [3]:
# Every entry under ./ppt, in lexicographic order. The names start with a
# YYYY.MM.DD date, so this is also chronological order.
file_names = sorted(glob.glob('./ppt/*'))

file_names

['./ppt/2021.10.23-1ra.pptx',
 './ppt/2021.10.23-2da.pptx',
 './ppt/2021.10.30-1ra.pptx',
 './ppt/2021.10.30-2da.pptx',
 './ppt/2021.11.13-1ra.pptx',
 './ppt/2021.11.13-2da.pptx',
 './ppt/2021.11.20-1ra.pptx',
 './ppt/2021.11.20-2da.pptx']

In [4]:
def get_tables(doc_data):
    """Collect the percentage rows found in the tables of each slide.

    Parameters
    ----------
    doc_data
        Presentation-like object: ``doc_data.slides`` yields slides, each
        slide has ``shapes``, and shapes with ``has_table`` set expose
        ``table.rows`` whose ``cells`` carry ``text_frame.text``.

    Returns
    -------
    list[list[list[str]]]
        One entry per slide that has at least one kept row. Each entry is
        the flat list of kept rows from every table on that slide, in
        shape/row order. A row is kept only when it has exactly two cells
        and the second cell's text contains ``'%'`` (this drops header rows
        and tables with a different layout).
    """
    per_slide = []

    for slide in doc_data.slides:
        kept_rows = []

        for shape in slide.shapes:
            # Pictures, text boxes, etc. carry no table to read.
            if not shape.has_table:
                continue

            for row in shape.table.rows:
                texts = [cell.text_frame.text for cell in row.cells]
                if len(texts) == 2 and '%' in texts[1]:
                    kept_rows.append(texts)

        # Slides without any usable row are left out of the result entirely.
        if kept_rows:
            per_slide.append(kept_rows)

    return per_slide

def process_tables(doc_tables, municipalities=None):
    """Combine the per-slide row lists of one presentation into a DataFrame.

    Each slide's rows become a frame with columns ``municipio`` and
    ``vacunacion``. A ``departamento`` column is then filled with the single
    most frequent department among the lookup rows whose ``municipio``
    appears on that slide, so a few unmatched or misspelled names do not
    change the assignment.

    Parameters
    ----------
    doc_tables : list[list[list[str]]]
        Output of ``get_tables``: one list of ``[municipio, vacunacion]``
        rows per slide.
    municipalities : pandas.DataFrame, optional
        Lookup table with ``municipio`` and ``departamento`` columns.
        Defaults to the notebook-level ``muni_df``.

    Returns
    -------
    pandas.DataFrame
        All slides stacked in order. Each slide keeps its own 0-based row
        labels, so index values repeat across slides. An empty
        ``doc_tables`` yields an empty DataFrame.

    Raises
    ------
    IndexError
        If none of the municipalities on a slide is present in the lookup
        table, so no department can be inferred.
    """
    if municipalities is None:
        # Fall back to the lookup table loaded at the top of the notebook.
        municipalities = muni_df

    frames = []

    for slide_rows in doc_tables:
        current_table = pd.DataFrame(
            slide_rows, columns=['municipio', 'vacunacion']
        )

        known = municipalities['municipio'].isin(current_table['municipio'])
        department_counts = municipalities.loc[
            known, 'departamento'
        ].value_counts(ascending=False)

        if department_counts.empty:
            # Same exception type the bare ``.index[0]`` would raise, but
            # with a message that points at the offending slide content.
            sample = current_table['municipio'].head(3).tolist()
            raise IndexError(
                f'tabla sin municipios conocidos: {sample}'
            )

        current_table['departamento'] = department_counts.index[0]
        frames.append(current_table)

    if not frames:
        return pd.DataFrame([])

    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration.
    return pd.concat(frames)

In [5]:
# Build one long table with a row per (date, dose, municipality) from every
# presentation. File names are expected to look like
# 'YYYY.MM.DD-1ra.pptx' / 'YYYY.MM.DD-2da.pptx'.
doc_frames = []

for file_name in file_names:
    base_name = os.path.basename(file_name)

    # Read the dose from the file's own name, not the whole path, so a
    # directory called e.g. '1ra' cannot mislabel a second-dose file. Doing
    # this before parsing makes a badly named file fail fast.
    if '1ra' in base_name:
        dosis = 1
    elif '2da' in base_name:
        dosis = 2
    else:
        # An exception instance is required here: raising a bare string
        # fails with "TypeError: exceptions must derive from BaseException".
        raise ValueError(f'nombre sin formato: {file_name}')

    # Explicit format: a malformed date prefix raises instead of being
    # guessed at by the parser.
    doc_date = base_name.split('-')[0]
    doc_date = pd.to_datetime(doc_date, format='%Y.%m.%d')

    doc_data = Presentation(file_name)

    doc_tables = get_tables(doc_data)
    doc_table = process_tables(doc_tables)

    doc_table['fecha'] = doc_date
    doc_table['dosis'] = dosis

    doc_frames.append(doc_table)

# Concatenate once; growing a DataFrame inside the loop re-copies all the
# previously accumulated rows on every iteration.
df = pd.concat(doc_frames) if doc_frames else pd.DataFrame([])

In [6]:
# Put the identifying columns first and the measured value last.
df = df.loc[
    :,
    ['fecha', 'departamento', 'municipio', 'dosis', 'vacunacion'],
]

In [7]:
# Show the combined table; pandas elides the middle rows of a long frame.
df

Unnamed: 0,fecha,departamento,municipio,dosis,vacunacion
0,2021-10-23,La Paz,Aucapata,1,10%
1,2021-10-23,La Paz,Caquiaviri,1,11%
2,2021-10-23,La Paz,Tito Yupanqui (Parquipujio),1,11%
3,2021-10-23,La Paz,Ancoraimes,1,12%
4,2021-10-23,La Paz,Calacoto,1,13%
...,...,...,...,...,...
30,2021-11-20,Oruro,Machacamarca,2,61%
31,2021-11-20,Oruro,Curahuara De Carangas,2,63%
32,2021-11-20,Oruro,Oruro,2,66%
33,2021-11-20,Oruro,Antequera,2,82%
