In [1]:
import sklearn
import numpy as np
import pandas as pd

In [2]:
# Widen pandas' display limits so long frames and wide rows are easier to read.
display_options = {
    'display.max_rows': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Step 1. Cleaning and Preparing Data

In [54]:
# The first CSV column is used as the row index (index_col=0).
invoices_csv = './data/data-sample-invoices.csv'
df = pd.read_csv(invoices_csv, index_col=0)

In [55]:
# Preview the freshly loaded invoices frame (pandas truncates the display).
df

Unnamed: 0,counterparty_name,counterparty_alias,counterparty_rfc,descriptions,text,id,prepayment,nature,cost_center
0,OSCAR ROBERTO POLANCO CARRILLO,,POCO630718D99,ARRENDAMIENTO CORRESPONDIENTE AL MES DE AGOST...,,86359320,FALSO,3090.0,10056.0
1,AMJ EQUIPO INDUSTRIAL,,AEI100412ACA,BOTA HULE JOMART SUELA ROJA C/C No.25 BOTA HU...,,64005390,FALSO,,10032.0
2,JORGE ARCE JIMENEZ,,AEJJ540327TV6,RENTA DE TRACTOCAMION CON LOWBOY DE 80 TONS. ...,,75922900,FALSO,,10032.0
3,DSV AIR SEA,,DAA020218JY1,Asesoria Aduanal,,76614700,FALSO,,13001.0
4,REMAPA,,REM120119US5,HONORARIOS POR GESTIÓN PARA DERECHO DE PASO ...,,78995570,FALSO,3461.0,10032.0
...,...,...,...,...,...,...,...,...,...
26035,DALCRISE,,DAL1612019S0,DIESEL ULSD FECHA DE OC 26/10/21 OC 330055640...,,343032420,FALSO,,10073.0
26036,Comunicación Vial,,CVI970519C90,"Estimación No. 3, correspondiente al Suminis...",,338541240,FALSO,,10032.0
26037,Centro Nacional de Control de Energia,,CNC140828PQ4,20210203C017001B25182000000000003374000000000...,,321294480,FALSO,135.0,10001.0
26038,Centro Nacional de Control de Energia,,CNC140828PQ4,20210205C017012B11182000000000051785000000000...,,321391480,FALSO,135.0,10001.0


- Integer columns with at least one `NaN` are automatically converted by pandas to float64.
- To allow these columns to hold integers and still have null values, we convert them to the 'Int64' dtype (nullable integer array).
- Finally, we convert them to the 'category' dtype.

In [80]:
# Cast each target to nullable 'Int64' first (so missing labels stay missing
# without the float upcast), then to 'category'.
for target_col in ('nature', 'cost_center'):
    df[target_col] = df[target_col].astype('Int64').astype('category')

### Targets: `nature`, `cost_center`, `prepayment` values in extra_data

In [7]:
# 172 classes over 26,040 examples (dropna=False: NaN is counted as a value too)
df['nature'].nunique(dropna=False)

172

In [8]:
# 274 classes over 26,040 examples (dropna=False: NaN is counted as a value too)
df['cost_center'].nunique(dropna=False)

274

In [9]:
#df["prepayment"].replace({"FALSO": False, "VERDADERO": True}, inplace=True)  # highly skewed; 26,005 is False: 99.8%

#### Benchmark con Logistic regression o Multinomial?

In [67]:
# Inspect a single invoice by index label; the list selector keeps the result
# as a one-row DataFrame.
df.loc[[1344]]
# In this sample the `text` column is not provided (it is empty).

Unnamed: 0,counterparty_name,counterparty_alias,counterparty_rfc,descriptions,text,id,prepayment,nature,cost_center
1344,REMAPA,,REM120119US5,HONORARIOS POR GESTI&Oacute;N PARA TRABAJOS E...,,45765920,FALSO,3461.0,10031.0


In [13]:
# `text` is not provided in this sample, so it is dropped.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it after the column is already gone no longer raises
# KeyError.
df = df.drop(columns='text', errors='ignore')

### Data cleaning and extraction

In [42]:
# Build a single free-text feature per invoice (to be tokenized and vectorized).
# Missing pieces are replaced with '' before concatenating: `str + NaN` yields
# NaN, which astype(str) would turn into the literal 'nan' and silently discard
# the fields that were present on that row.
text_columns = ['counterparty_name', 'counterparty_rfc', 'descriptions']
text_parts = df[text_columns].fillna('').astype(str)
df['all_text'] = (
    text_parts['counterparty_name']
    + ' ' + text_parts['counterparty_rfc']
    + ' ' + text_parts['descriptions']
)

In [43]:
import re
def convertAccented(text, pattobj):
    '''
    Replace HTML acute-accent entities in `text` with the accented vowel,
    e.g. "&oacute;" -> "ó" and "&Oacute;" -> "Ó".

    Parameters
    ----------
    text : str
        Text that may contain "&<vowel>acute;" entities.
    pattobj : re.Pattern
        Compiled pattern whose group 1 captures the vowel of the entity,
        e.g. ``re.compile(r'&([aeiou])acute;')``.

    Returns
    -------
    str
        `text` with every recognised entity replaced. A match whose captured
        letter is not in the vowel table is left untouched instead of raising
        ``KeyError``.
    '''
    accented = {
        'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú',
        'A': 'Á', 'E': 'É', 'I': 'Í', 'O': 'Ó', 'U': 'Ú',
    }

    def accentRepl(matchobj):
        # Fall back to the full matched entity when the captured letter is unknown.
        return accented.get(matchobj.group(1), matchobj.group(0))

    return pattobj.sub(accentRepl, text)

In [44]:
def normalizeTextColumn(dataframe):
    """Lowercase `all_text` and decode "&<vowel>acute;" entities.

    The `all_text` column of `dataframe` is overwritten; nothing is returned.
    """
    # Group 1 captures the vowel. Text is lowercased before matching, so only
    # lowercase entities need to be recognised.
    acute_entity = re.compile(r'&([aeiou])acute;')

    def _normalize(raw_text):
        return convertAccented(raw_text.lower(), acute_entity)

    dataframe['all_text'] = dataframe['all_text'].apply(_normalize)

In [46]:
# Spot-check the entity conversion on row 1344, whose description contains
# an "&Oacute;" entity.
patt = r'&([aeiou])acute;'  # group 1 captures the vowel
rgx = re.compile(patt)

sample_text = df.loc[1344, 'all_text']
convertAccented(sample_text.lower(), rgx)

'remapa rem120119us5  honorarios por gestión para trabajos especializados cc 10031 proveedor: 326824'

In [47]:
# Lowercase `all_text` and decode accent entities; the column is overwritten.
normalizeTextColumn(df)

Para este punto tenemos las features x1, x2,..,xp concatenadas como un solo texto.
Tenemos que vectorizar el texto de cada observación antes de pasarlo a un algoritmo de clasificación.

En scikit-learn, los vectorizers implementan tokenización. 

##### Limpieza previa del Texto
1. Una buena práctica es quitar la puntuación primero.
2. En español, quizá no deberíamos quitar acentos (aunque a veces ni siquiera vienen con ellos)
3. Queremos obtener únicamente palabras relevantes que existen en el español? 
4. Sin limpieza, ¿cuáles son los tokens más frecuentes? ¿Qué tanto poder de predicción tiene un RFC?
5. Usar una función previa o aprovechar los parámetros de un Vectorizer?

https://scikit-learn.org/0.15/modules/feature_extraction.html#text-feature-extraction

In [86]:
# Rows whose `cost_center` label is missing.
missing_cost_center = df['cost_center'].isna()
df.loc[missing_cost_center]

Unnamed: 0,counterparty_name,counterparty_alias,counterparty_rfc,descriptions,text,id,prepayment,nature,cost_center
21,CENTRO INTEGRAL DE CALIBRACION Y CONSULTORIAS,,CIC150211FP1,Semana del 19 al 24 de Mar. Sup. SST Coordina...,,62429290,FALSO,,
34,ING&CONSTRUCCION,,ING0906279X2,"RECIBÍ DE MOTA-ENGIL MEXICO, S.A. DE C.V. POR...",,24139780,FALSO,,
103,ANGELICA ROMERO HERNANDEZ,,ROHA730807IC7,HOSPEDAJE DEL 21 AL 27 DE MAYO DEL 2018,,74667870,FALSO,3372.0,
267,Consultores Profesionales en Soporte a T.I.,,CPS130326CA0,MONITOR,,102978820,FALSO,6887.0,
330,JORGE LOPEZ SALAZAR,,LOSJ7708257M9,LIBERACI&Oacute;N DE PIPAS Y MAQUINAR&Iacute;...,,36430870,FALSO,3390.0,
362,CORPORACION SANCHEZ,,CSA110614QG1,PASTILLA SANIT HARPIC FRESC ACT AZUL 35G LIMP...,,132515270,FALSO,,
373,QUID LUCERNA,,QLU1403136Y8,SUMINISTRO PERSONAL ESPECIALIZADO EN REG. DE ...,,36133220,FALSO,1.0,
414,JUAN MANUEL AMADOR JUAREZ,,AAJJ900706C20,ING. ADALBERTO RODRIGUEZ CENTRO DE COSTOS C.H...,,24950190,FALSO,3372.0,
465,TELEFONOS DE MEXICO,,TME840315KT6,SERVICIOS ESPECIALES UNINET,,34972170,FALSO,3333.0,
525,Radiomóvil Dipsa,,RDI841003QJ4,Servicios de Telecomunicaciones Cargo de equipo,,115426690,FALSO,3332.0,
