In [9]:
import os, re
import pandas as pd
import cchardet as chardet
from tqdm import tqdm

In [3]:
# Directory that holds the raw report files.
path = '../data/'

# Full path of every entry found directly inside `path`.
filenames = list(map(lambda entry: os.path.join(path, entry), os.listdir(path)))

In [16]:
def open_file(filepath):
    """Read a report file and return the pages that contain a 'FECHA' marker.

    The file is read once as bytes, its encoding is guessed with ``chardet``,
    and the decoded text is split on ``Página(<number>)`` markers.

    Parameters
    ----------
    filepath : str or path-like
        File to read.

    Returns
    -------
    list of str
        The chunks between page markers that contain the substring
        ``'FECHA'``. Empty when no chunk qualifies.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    UnicodeDecodeError
        If the bytes cannot be decoded with the detected encoding.
    """
    with open(filepath, 'rb') as f:
        raw = f.read()

    # The detector can report an encoding of None (e.g. for an empty file).
    # Fall back to an explicit codec instead of the platform-dependent default.
    encoding = chardet.detect(raw).get('encoding') or 'utf-8'

    # Decode the bytes already in memory rather than reading the file again.
    text = raw.decode(encoding)
    # Normalise '\r\n' and lone '\r' to '\n', as text-mode reading does, so
    # that later splitting on '\n' does not leave stray carriage returns.
    text = re.sub(r'\r\n?', '\n', text)

    pages = re.split(r'Página\(\d+\)', text)

    return [page for page in pages if 'FECHA' in page]

In [30]:
class Page:
    """Parse one fixed-width, pipe-delimited report page into a DataFrame.

    The page text is split into lines. The line matching ``TIPO NUMERO`` is
    treated as the header, and the positions of its ``|`` characters define
    the column boundaries that are applied to every line. Lines containing a
    ``NN-<digits>-N`` pattern (e.g. ``20-14609607-8``) are the records that
    ``get_page_df`` returns. Two extra fields are read from the line that
    follows each record.

    Parameters
    ----------
    page : str
        Raw text of a single page.
    """

    # Names assigned, in order, to the columns cut from each line.
    # NOTE: 'TELEFEONO' is kept exactly as-is because it ends up in the
    # header of the output written by callers; renaming it would change
    # that output.
    COLUMN_NAMES = (
        'M', 'CUIT', 'DOCUMENTO', 'NOMBRE', 'SEXO', 'I V', 'S E', 'NACIMIENTO',
        'DOMICILIO', 'TELEFEONO', 'A C', 'C A', 'S I', 'CODIGO', 'USUARIO',
        'FECHA',
    )

    # Columns cut from the line *after* each record, and their final names.
    OFFSET_NAMES = {'offset_8': 'EMAIL', 'offset_9': 'CELULAR'}

    def __init__(self, page):
        self.page = page

    def _create_df(self):
        """Build ``self.df`` with one row per line of the page.

        Columns: ``text`` (the line), ``main`` (True when the line contains
        the ``NN-<digits>-N`` pattern) and ``offset`` (the following line).
        """
        page = pd.DataFrame(self.page.split('\n'), columns=['text'])
        page['main'] = page['text'].str.contains(r'\d{2}\-\d+\-\d{1}')
        page['offset'] = page['text'].shift(-1)
        self.df = page

    def _find_index_title(self):
        """Locate the header line and store it in ``self.index``/``self.title``.

        Raises
        ------
        IndexError
            If no line matches ``TIPO NUMERO``.
        """
        header = self.df.loc[self.df['text'].str.contains(r'TIPO\sNUMERO')]
        if header.empty:
            raise IndexError("page has no 'TIPO NUMERO' header line")
        self.index = header.index[0]
        self.title = header['text'].iloc[0]

    def _find_slices(self):
        """Store in ``self.slices`` the ``[start, stop]`` span of each column.

        A column runs from just after one ``|`` of the header up to (not
        including) the next ``|``.
        """
        bars = [match.start() for match in re.finditer(r'\|', self.title)]
        self.slices = [[left + 1, right] for left, right in zip(bars, bars[1:])]

    def _parse_columns(self):
        """Cut every line into columns and give the kept columns their names."""
        # The last line has no following line; treat it as empty text.
        next_line = self.df['offset'].fillna('')
        for i, (start, finish) in enumerate(self.slices):
            self.df[f'text_{i}'] = self.df['text'].str.slice(start, finish)
            self.df[f'offset_{i}'] = next_line.str.slice(start, finish)

        text_cols = [col for col in self.df.columns if 'text_' in col]
        self.texts = dict(zip(text_cols, self.COLUMN_NAMES))
        self.offset = dict(self.OFFSET_NAMES)

        # Only the offset columns listed in OFFSET_NAMES are kept.
        unused = [
            col for col in self.df.columns
            if 'offset_' in col and col not in self.offset
        ]
        self.df = (
            self.df
            .rename(columns=self.texts)
            .drop(columns=unused)
            .rename(columns=self.offset)
        )

    def get_page_df(self):
        """Run the full parse and return the record rows.

        Returns
        -------
        pandas.DataFrame
            One row per record line, with the named text columns followed by
            ``EMAIL`` and ``CELULAR``. Also stored in ``self.result``.

        Raises
        ------
        IndexError
            If the page has no ``TIPO NUMERO`` header line.
        KeyError
            If the header defines too few columns for ``EMAIL``/``CELULAR``
            to exist.
        """
        self._create_df()
        self._find_index_title()
        self._find_slices()
        self._parse_columns()
        cols = list(self.texts.values()) + list(self.offset.values())

        self.result = self.df.loc[self.df['main'], cols]

        return self.result

In [31]:
def main(origin, destination, overwrite=False):
    """Parse every file in ``origin`` and append the records to a CSV.

    Each file is loaded with ``open_file``, every page is parsed with
    ``Page(...).get_page_df()``, and the pages of one file are concatenated
    and appended to ``destination``. A file that fails to load or parse is
    reported on stdout and skipped; none of its pages are written.

    Parameters
    ----------
    origin : str
        Directory containing the input files. Sub-directories are ignored.
    destination : str
        Path of the CSV to write. Rows are appended when it already exists,
        so calling this twice with the same destination duplicates rows
        unless ``overwrite`` is set.
    overwrite : bool, default False
        When True, delete an existing ``destination`` before processing.

    Returns
    -------
    None
    """
    if overwrite and os.path.exists(destination):
        os.remove(destination)

    # Sort for a reproducible row order, and skip anything that is not a
    # regular file (e.g. a checkpoint directory created next to the data).
    candidates = sorted(os.path.join(origin, name) for name in os.listdir(origin))
    filenames = [candidate for candidate in candidates if os.path.isfile(candidate)]

    for file in filenames:
        try:
            data = open_file(file)
        except Exception as e:
            print(f"Failed to load {file}. Reason:\n{e}")
            continue

        if not data:
            # Nothing to concatenate; say so instead of surfacing a
            # "No objects to concatenate" error from pandas.
            print(f"Skipped {file}: no pages found")
            continue

        try:
            pages = pd.concat([Page(page).get_page_df() for page in data])

            # Write the header only when starting a new (or empty) file.
            write_header = (
                not os.path.exists(destination)
                or os.path.getsize(destination) == 0
            )
            pages.to_csv(destination, mode='a', index=False, header=write_header)
        except Exception as e:
            print(f"Failed to parse {file}. Reason:\n{e}")
    
    

In [32]:
# Parse everything under `path`; rows are appended to the CSV if it already exists.
main(origin=path, destination='test2.csv')

In [33]:
# Load the CSV written above back into a DataFrame.
df = pd.read_csv(filepath_or_buffer='test2.csv')

In [34]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,M,CUIT,DOCUMENTO,NOMBRE,SEXO,I V,S E,NACIMIENTO,DOMICILIO,TELEFEONO,A C,C A,S I,CODIGO,USUARIO,FECHA,EMAIL,CELULAR
0,M,20-14609607-8,DU 14.609.607,PAZCEL ROBERTO WASHINGTON,M,4,1,29/08/1961,TELESFORA S. DE BENAVIDEZ ESTE 5400 SAN JUAN ...,,813,2,,G00111,,01/02/2021,ROBERTOWAS29@GMAIL.COM ...,0549264-5476575
1,A,20-21362724-5,DU 21.362.724,ARACENA SANDRO MARCELO,M,2,1,21/06/1970,MISIONES NORTE 1672 5413 CHIMBAS ...,,813,2,,G00111,,01/02/2021,agustinaaracena8@gmail.com ...,0549264-5433620
2,A,20-22359553-8,DU 22.359.553,FLORES BERNARDO LUIS,M,4,1,14/07/1971,LA RIOJA 144 5419 ALBARDON ...,,813,2,,G00111,,01/02/2021,kevinpena314@gmail.com ...,0549264-4051182
3,M,23-31081500-4,DU 31.081.500,PELETAY GRISELDA ELIZABET,F,4,1,21/11/1984,SALTA (SUR) 510 5400 SAN JUAN ...,,821,2,1,G00111,,01/02/2021,GRISPELETAY@GMAIL.COM ...,054911-65210907
4,M,27-12610526-1,DU 12.610.526,MESTRE ETHEL GRACIELA,F,4,1,29/04/1958,JACARANDA SUR 1355 5400 SAN JUAN ...,0510264-4216491,822,2,1,G00111,,01/02/2021,ETHEL_MESTRE@HOTMAIL.COM ...,0549264-5827309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923,M,20-39650660-3,DU 39.650.660,MERCADO LUCAS JOEL,M,4,1,16/09/1996,RICARDO COLOMBO 1 5419 ALBARDON ...,5400264-4911943,821,2,,N77930,"MILLÁN,MAITÉ",31/08/2020,mercadojoel820@gmail.com ...,5490264-5437263
924,M,20-07949544-2,LE 07.949.544,GATTI ENRIQUE ANGEL,M,1,1,15/10/1946,SAAVEDRA NORTE V LOPEZ MANSILL 5400 RIVADAVIA...,02644230363,652,2,,PFWEB,,31/08/2020,GATTIFEDERICO@HOTMAIL.COM ...,0264154428222
925,M,20-12333727-2,DU 12.333.727,LEIVA ANTONIO,M,7,1,04/05/1956,TUCUMAN 0 5419 ALBARDON ...,264 154406708,821,2,1,PFWEB,,31/08/2020,ANTONIOLEIVAFERRER1956@GMAIL.COM,
926,M,27-17923605-8,DU 17.923.605,LEVEQUE MARIANA,F,4,1,23/08/1964,AV. CORDOBA ESTE 446 5400 SAN JUAN ...,54 264 4084037,821,2,1,PFWEB,,31/08/2020,DANILEVEQUE@HOTMAIL.COM ...,54264 5055783
