In [74]:
import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb
import os
import pandas as pd
from scipy import sparse
from sklearn import metrics, model_selection
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer
from tqdm import tqdm
from classif import PartSelector
import re

In [75]:
df = pd.read_csv('./data/data-sample-invoices.csv')

# Convert codes as floats into category types
df['nature'] = df['nature'].astype('Int64')
df['nature'] = df['nature'].astype('category')

df['cost_center'] = df['cost_center'].astype('Int64')
df['cost_center'] = df['cost_center'].astype('category')

In [76]:
df['counterparty_rfc'] = df['counterparty_rfc'].astype('str')
df['counterparty_name'] = df['counterparty_name'].astype('str')
df['descriptions'] = df['descriptions'].astype('str')

In [77]:
feature_names = ['counterparty_name', 'counterparty_rfc', 'descriptions']
target_names = ['nature', 'cost_center']
# subset the working dataset columns
data = df[feature_names + target_names]

In [78]:
data.dtypes

counterparty_name      object
counterparty_rfc       object
descriptions           object
nature               category
cost_center          category
dtype: object

#### Create an `all_text` column for vectorization
Transform NA values (NaN) into empty strings for the current target and concatenate nto an `all_text` col

In [79]:
def _convertAccented(text, pattobj):
        '''
        Restore characters from lowercase text, like "&oacute;" into "ó"
        '''
        accented = {
            'a':'á',
            'e':'é',
            'i':'í',
            'o': 'ó',
            'u':'ú'
        }

        def accentRepl(matchobj):
            letter = matchobj.group(1)
            return accented[letter]

        return pattobj.sub(accentRepl, text)

In [80]:
patt = r'&([aeiou])acute;'  # vowel is captured by group 1
accent_rgx = re.compile(patt)      # compiled beforehand for performance
    
def f(row):
    '''
    Creates a value for the column all_text with the concatenation of string values, 
    emmpty strings if nan.
    '''
    notna = row.notna()
    
    name = row['counterparty_name'] if notna['counterparty_name'] else ''
    rfc = row['counterparty_rfc'] if notna['counterparty_rfc'] else ''
    descriptions = row['descriptions'] if notna['descriptions'] else ''
    
    all_text = ' '.join([
                    name,
                    rfc,
                    descriptions,
                ]).replace('\n', ' ')
    
    all_text = _convertAccented(all_text.lower(), accent_rgx)
    return all_text

In [81]:
data['all_text'] = data.apply(lambda x: f(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['all_text'] = data.apply(lambda x: f(x), axis=1)


In [82]:
data['all_text'][1]

'amj equipo industrial aei100412aca  bota hule jomart suela roja c/c no.25 bota hule jomart suela roja c/c no.29 bota hule jomart suela roja c/c no.30 casco iga cachucha matraca c/e naranja chaparrera de carnaza guante sivsa carnaza corto t/l respirador 3m 8210 polaina suricata chaleco de malla c/reflejante verde mr ut'

## `nature` classifier
1. Remove rows with Na in target
2. Separate training and validation
3. Run model
4. Assessment

#### Remove rows that have NA in target

In [102]:
na_nature = data['nature'].isna().sum()
print("Total", len(data))
print(na_nature)
print("Without na", len(data) - na_nature)

Total 26040
9844
Without na 16196


In [103]:
nature_data = data.dropna(axis=0, subset=['nature'], how='any', inplace=False)

In [104]:
len(nature_data)

16196

#### Vectorize: strip accents, hashing vectorizer

In [105]:
# just like in current implementation
vect = HashingVectorizer(n_features=262144, ngram_range=(1, 2),
                        binary=True, strip_accents='ascii')

In [106]:
# Add the Pipeline Steps manually FOR NATURE
X = nature_data['all_text']  # name, rfc, descriptions
vectorized_X = vect.transform(X)

#### Split the data

In [107]:
y = np.array(nature_data['nature']).astype('int64')

X_train, X_test, y_train, y_test = model_selection.train_test_split(
                                    vectorized_X, y, test_size=0.3, random_state=0
                                    )

### Instantiate `DMatrix` 

In [109]:
X_train.shape[0]==len(y_train) and  X_test.shape[0]==len(y_test)  # subset to 18,228 obs. for training

True

In [110]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest =  xgb.DMatrix(X_test, label=y_test)

# obj_df["body_style_cat"] = obj_df["body_style"].cat.codes
# obj_df.head() to make categories available as numbers

#### Init a boosted classifier

[Softmax!](https://en.wikipedia.org/wiki/Softmax_function)

In [118]:
objective_func = 'multi:softmax' # does it work with multi class?  | multi:softmax | multi:softprob

# we need to set the number of classes in num_class.
params = {
    'max_depth':2,  # the deeper the more complex. default=6
    'eta': 1, # step size shrinkage in every update to prevent overfitting. 
    'objective':objective_func,
    'num_class': len(set(y))
}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2 # number of boosting rounds

In [119]:
bst = xgb.train(params, dtrain, num_round, watchlist)

XGBoostError: [12:04:24] C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/multiclass_obj.cu:120: SoftmaxMultiClassObj: label must be in [0, num_class).