# AFIP Data Cleansing 

In [1]:
import re
import cchardet as chardet
import pandas as pd
import dask.dataframe as dd

In [2]:
def get_encoding_type(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and passed to ``chardet.detect``
    (``cchardet`` is imported under that alias in this notebook).

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of the detection
        result, or ``None`` when that key is missing.
    """
    # Use a context manager so the handle is closed as soon as the bytes are
    # read, instead of leaking it until garbage collection.
    with open(file_path, 'rb') as raw_file:
        rawdata = raw_file.read()
    result = chardet.detect(rawdata)
    return result.get('encoding')

In [3]:
# Input file to clean; the path is relative to the notebook's working directory.
file_dir = '../../data/estatutos/external_sources/SELE-SAL-CONSTA.p20out1.20200509.tmp'
# Detect the file's encoding; the result is shown as this cell's output.
get_encoding_type(file_dir)

'ISO-8859-4'

In [4]:
# Lazily read the file with dask in small (6400-byte) blocks, every value as str.
# header=None: the file is treated as headerless (the column is renamed by hand
# in a later cell and the raw-text pass treats every line as a record), so the
# first line must be kept as data rather than consumed as column labels.
ddf = dd.read_csv(file_dir, encoding='ISO-8859-4', blocksize=6400, dtype=str, header=None)

In [5]:
# Materialize only the last dask partition so the cleaning steps can be
# tried on a small in-memory frame.
df = ddf.partitions[-1].compute()

In [6]:
# Group 1: a run of 11 digits; group 2: the 8 digits in the middle of that run
# (i.e. without the 2 leading digits and the final digit).
CUIT_PATTERN = r'(\d{2}(\d{8})\d)'
# "basura" (junk): uppercase letters immediately followed by digits at the very
# end of the string.
BASURA_PATTERN = r'[A-Z]+\d+$'

cuit = re.compile(CUIT_PATTERN)
basura = re.compile(BASURA_PATTERN)

In [7]:
# Give the frame's single column a fixed, known name ("concatenado" = concatenated).
df.columns = ['concatenado']

In [8]:
# Extract the first match of the `cuit` regex from each row; the result has
# one column per capture group.
cuits = df.concatenado.str.extract(cuit)

In [9]:
# Name the two capture groups: the full 11-digit match and its middle 8 digits.
cuits.columns = ['cuit', 'dni']

In [10]:
# Display the extracted columns for a visual check.
cuits

Unnamed: 0,cuit,dni
0,34600983158,60098315
1,34602466150,60246615
2,34603819499,60381949
3,34604104035,60410403
4,34604160741,60416074
...,...,...
82,34688233358,68823335
83,34999032089,99903208
84,34999230573,99923057
85,34999257560,99925756


In [11]:
# Clean the text column in one chain: drop the 11-digit match first, then the
# trailing letters+digits junk that is left at the end of the string.
df['concatenado'] = (
    df.concatenado
    .str.replace(cuit, '', regex=True)
    .str.replace(basura, '', regex=True)
)
df

Unnamed: 0,concatenado
0,CONSORCIO DE PROPIETARIOS
1,FIORDANI DOUGLAS Y FIORDANI
2,CONSORCIO DE PROPIETARIOS AV
3,CONSORCIO DE PROPIETARIOSAVDA
4,JUNCALITO SOCIEDAD COLECTIVA
...,...
82,FANCON SA
83,GOBIERNO DE LA CIUDAD DE
84,DIRECCION GENERAL DE
85,MUNICIPALIDAD DE LA MATANZA


In [12]:
# Put the extracted cuit/dni columns next to the cleaned text (aligned on index).
# NOTE: this rebinds `df`, so re-running the cell appends the columns again.
df = pd.concat([df, cuits], axis=1)

In [13]:
# Display the combined frame for a visual check.
df

Unnamed: 0,concatenado,cuit,dni
0,CONSORCIO DE PROPIETARIOS,34600983158,60098315
1,FIORDANI DOUGLAS Y FIORDANI,34602466150,60246615
2,CONSORCIO DE PROPIETARIOS AV,34603819499,60381949
3,CONSORCIO DE PROPIETARIOSAVDA,34604104035,60410403
4,JUNCALITO SOCIEDAD COLECTIVA,34604160741,60416074
...,...,...,...
82,FANCON SA,34688233358,68823335
83,GOBIERNO DE LA CIUDAD DE,34999032089,99903208
84,DIRECCION GENERAL DE,34999230573,99923057
85,MUNICIPALIDAD DE LA MATANZA,34999257560,99925756


In [14]:
# Read the entire file into memory as one string, decoding with the encoding
# reported by the detection cell above.
with open(file_dir, 'r', encoding='ISO-8859-4') as f:
    text = f.read()

In [15]:
# One list item per line of the file.
# NOTE(review): if the file ends with a newline the last item is an empty
# string, which would become an extra empty row — confirm whether to drop it.
entries = text.split('\n')

In [23]:
# Rebind `df` to a one-column frame holding every line of the file
# (it previously held only the last dask partition).
df = pd.DataFrame(entries, columns=['concatenado'])

In [24]:
# Convert to a dask DataFrame partitioned in chunks of up to 10000 rows.
df = dd.from_pandas(df, chunksize=10000)

In [25]:
# Capture the 'cuit' and 'dni' groups before the text column is rewritten.
cuits = df.concatenado.str.extract(cuit)
cuits.columns = ['cuit', 'dni']
# Same cleaning as on the sample frame, as a single chain: remove the 11-digit
# match, then the trailing letters+digits junk.
df['concatenado'] = (
    df.concatenado
    .str.replace(cuit, '', regex=True)
    .str.replace(basura, '', regex=True)
)
# Column-wise join of the cleaned text with the extracted identifiers.
df = dd.concat([df, cuits], axis=1)

In [27]:
%%time
# Compute the dask graph and write all partitions into one UTF-8 CSV file.
# NOTE(review): no `index=` argument is passed, so the row index is presumably
# written as an extra first column — confirm that downstream readers expect it.
df.to_csv('../../data/estatutos/external_sources/afip_names_cuits.csv', encoding='UTF-8', single_file=True)

CPU times: user 52.1 s, sys: 1.01 s, total: 53.1 s
Wall time: 52.5 s


['/home/cpega/projects/ai/jupyter-tests/NLPTools/nlptools/data/estatutos/external_sources/afip_names_cuits.csv']