In [1]:
import re
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from skrub import GapEncoder
from skrub import Cleaner, TableReport
from skrub import StringEncoder, MinHashEncoder, TableVectorizer, TextEncoder

# The project package is imported from the repository's src/ directory,
# so that directory must be on sys.path before the import below.
sys.path.append('../src/')
from qto_categorizer_ml.io import datasets

Ideas:
- Créer des embeddings à partir d'un modèle sur étagère : https://skrub-data.org/stable/auto_examples/02_text_with_string_encoders.html
  - Gap Encoder:
    from skrub import GapEncoder
    gap = GapEncoder(n_components=30)
    X_trans = gap.fit_transform(X["text"])
    X_trans.insert(0, "text", X["text"])
    TableReport(X_trans)
  - MinHashEncoder 
  - TextEncoder
  - StringEncoder
  - TFidf?
- Quelles metriques?
  - Multiclass classification problem
    - accuracy score: accuracy(y_true, y_pred) -> simple à interpréter, biaisé avec des classes déséquilibrées
    - precision, recall, f1score : classification_report(y_true, y_pred)

In [2]:
# Column dtypes handed to the project's CSV reader; DATE_EMITTED is listed
# separately so the reader can parse it as a date.
dtypes = dict(
    TRANSACTION_ID=str,
    AMOUNT=float,
    TYPE_OF_PAYMENT=str,
    MERCHANT_NAME=str,
    DESCRIPTION=str,
    SIDE=int,
    CATEGORY=str,
)
parse_dates = ['DATE_EMITTED']

path = "../data/data-products.csv"
reader = datasets.CSVReader(path=path, dtypes=dtypes, parse_dates=parse_dates)
df = reader.read()

# Replace missing text values with an explicit placeholder per column.
placeholders = {
    'MERCHANT_NAME': "No marchant",
    'DESCRIPTION': "No Description",
    'TYPE_OF_PAYMENT': "No type payment",
}
for column, placeholder in placeholders.items():
    df[column] = df[column].fillna(placeholder)

features = ['AMOUNT', 'TYPE_OF_PAYMENT', 'MERCHANT_NAME', 'DESCRIPTION']
target = 'CATEGORY'

# Drop rows that are identical across the selected features and the target,
# then pop the target out of X and encode it as integer class labels.
X = df[[*features, target]].drop_duplicates()
le = LabelEncoder()
y = le.fit_transform(X.pop(target))

# Stratified 80/20 split with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [4]:
# Summary statistics of the transaction amounts.
df["AMOUNT"].describe()

count    3.372370e+05
mean     3.411381e+03
std      3.968469e+04
min      1.000000e-02
25%      3.390000e+00
50%      4.864000e+01
75%      6.466300e+02
max      7.511743e+06
Name: AMOUNT, dtype: float64

In [3]:
from sklearn.pipeline import make_pipeline

def _tfidf_svd(vocabulary_size, n_components):
    """Build a TF-IDF vectorizer followed by a truncated SVD reduction.

    Parameters
    ----------
    vocabulary_size : int
        Passed to ``TfidfVectorizer(max_features=...)``.
    n_components : int
        Passed to ``TruncatedSVD(n_components=...)``.
    """
    return make_pipeline(
        TfidfVectorizer(max_features=vocabulary_size),
        TruncatedSVD(n_components=n_components, random_state=42),
    )


# Free-text columns: TF-IDF reduced with SVD.
desc_pipe = _tfidf_svd(1000, 50)
merch_pipe = _tfidf_svd(500, 30)

# Payment type: one-hot encoded, categories unseen at fit time are ignored.
type_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# The text columns are selected with a scalar string (not a list) so the
# vectorizers receive a 1-D sequence of documents.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['AMOUNT']),
    ('cat', type_pipe, ['TYPE_OF_PAYMENT']),
    ('desc', desc_pipe, "DESCRIPTION"),
    ('merchant', merch_pipe, 'MERCHANT_NAME'),
])

classifier = RandomForestClassifier(
    n_estimators=200, max_depth=30, n_jobs=-1, random_state=42
)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", classifier),
])

pipeline

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,50
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,30
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42
,tol,0.0

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Choosing the right hyperparameters can significantly improve model performance. scikit-learn offers several methods for automated hyperparameter tuning: GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, and HalvingRandomSearchCV.
Best practices I used are: 
- Always combine these methods with cross-validation (cv parameter)
- Use n_jobs=-1 to parallelize the search
- Choose scoring metric based on the problem we want to solve (e.g. accuracy, f1_macro, roc_auc)

For the categorizer, I selected RandomizedSearchCV because it efficiently explores a wide hyperparameter space with fewer computations, making it well suited to time-constrained searches.

In [224]:
# Candidate values for the random-forest step of `pipeline`
# (3 x 4 = 12 possible combinations).
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [10, 20, 30, None],
}

# Randomized search: evaluates n_iter=10 sampled candidates with 3-fold CV.
# random_state pins which candidates are sampled, so a re-run explores the
# same combinations and the reported best parameters are reproducible.
grid_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
)

grid_search.fit(X_train, y_train)

grid_search.best_params_, grid_search.best_score_

({'classifier__n_estimators': 200, 'classifier__max_depth': 30},
 0.598390937434056)

In [240]:
# Fit the pipeline on the training split, then predict the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

CPU times: user 5 μs, sys: 2 μs, total: 7 μs
Wall time: 14.1 μs


In [None]:
# Evaluate the held-out predictions.
# confusion_matrix / accuracy_score / classification_report are imported from
# sklearn.metrics in the notebook's import cell.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

# Accuracy alone is biased when classes are imbalanced, so also print the
# per-class precision / recall / F1. zero_division=0 reports 0 (instead of
# warning) for classes that were never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

acc

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Columns
numeric_features = ["AMOUNT"]
categorical_features = ["TYPE_OF_PAYMENT"]
text_features = ["MERCHANT_NAME", "DESCRIPTION"]

# Missing text is replaced with "" on the DataFrame itself. It cannot be done
# inside the text pipeline: the text columns are selected with a scalar
# string, so the pipeline receives 1-D input, which SimpleImputer rejects.
model_columns = numeric_features + categorical_features + text_features
X_model = df[model_columns].fillna({column: "" for column in text_features})

le = LabelEncoder()
X_train, X_test, y_train, y_test = train_test_split(
    X_model,
    le.fit_transform(df.CATEGORY),
    test_size=0.2,
    random_state=42,
    stratify=df.CATEGORY,
)

# Preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Text pipeline: TF-IDF only (missing values were already filled above).
# Kept as a Pipeline so the "tfidf" step name stays addressable.
text_transformer = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(max_features=100))
])

# Combine all preprocessors. The text columns are given as scalar strings so
# TfidfVectorizer receives a 1-D sequence of documents.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("merchant_tfidf", text_transformer, "MERCHANT_NAME"),
    ("desc_tfidf", text_transformer, "DESCRIPTION"),
])

# Full pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

In [None]:
# Separate features and target
X = df.drop(columns=['CATEGORY'])
y = df['CATEGORY']

# The numeric features below include YEAR / MONTH / DAY, which no earlier
# cell creates. Derive any that are absent from DATE_EMITTED.
# NOTE(review): assumes DATE_EMITTED holds the transaction date - confirm.
missing_date_parts = [
    column for column in ('YEAR', 'MONTH', 'DAY') if column not in X.columns
]
if missing_date_parts:
    date_emitted = pd.to_datetime(X['DATE_EMITTED'])
    X = X.assign(**{
        column: getattr(date_emitted.dt, column.lower())
        for column in missing_date_parts
    })

# Split the data into training and testing sets (stratified on the target,
# like the other splits in this notebook)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define preprocessing for numeric columns
numeric_features = ['AMOUNT', 'YEAR', 'MONTH', 'DAY', 'SIDE']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns
categorical_features = ['TYPE_OF_PAYMENT', 'MERCHANT_NAME', 'DESCRIPTION']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps; columns not listed here are dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the pipeline with a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the held-out split
y_pred = pipeline.predict(X_test)