In [None]:
import sys

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, f1_score, classification_report, accuracy_score

# Reaching this line means every import above resolved without raising.
# NOTE(review): "sucessfully" is misspelled in the message below.
print('Modules imported sucessfully')

### Utility methods

In [None]:
# Just to identify different steps
def step(msg=''):
    """Print a two-line banner used to separate notebook steps.

    The first line is ``[msg]`` immediately followed by 40 dashes; the second
    line is a run of ``40 + len(msg) + 2`` dashes.

    Parameters
    ----------
    msg : str, optional
        Label displayed between the square brackets (empty by default).
    """
    dash = 40
    print(f'[{msg}]' + '-' * dash)
    print('-' * (len(msg) + dash + 2))

def show(X, nb_row):
    """Print a step banner followed by the first ``nb_row`` rows of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to preview.
    nb_row : int
        Number of leading rows to print.
    """
    step(f"Showing '{nb_row}' first line of X variables")
    preview = X.head(nb_row)
    # to_string() renders the preview as plain text without column truncation.
    print(preview.to_string())

### Extraction

In [None]:
def load_data(path='dataset_pretraitement_fraude.xlsx'):
    """Read the raw dataset from an Excel workbook.

    Parameters
    ----------
    path : str, optional
        Location of the workbook. Defaults to the file name the notebook has
        always used, so calling ``load_data()`` with no argument is unchanged.

    Returns
    -------
    pandas.DataFrame
        The content returned by ``pd.read_excel``.

    Raises
    ------
    Exception
        Whatever ``pd.read_excel`` raised. The error is printed and then
        re-raised so that the notebook stops at the loading cell instead of
        carrying a ``None`` into the following cells.
    """
    try:
        return pd.read_excel(path)
    except Exception as e:
        print(f'Erreur de chargement {e}')
        # Propagate: returning None here would only postpone the failure to
        # the first attribute access on the result.
        raise

# Load the dataset once; `df` is the raw frame consumed by the cells below.
df = load_data()
print('Data loaded successfully')
# Last expression of the cell: displayed as (n_rows, n_columns).
df.shape

### Data Cleaning & Dataset Preparation

In [None]:
def remove_columns(df):
    """Return ``df`` without the columns that are not used as features.

    Drops ``TransactionID``, ``ClientID`` and ``Commentaire``, then hands the
    result to ``remove_TODOs_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset; must contain the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Remove TransactionID, ClientID and Commentaire.
    dropped = ['TransactionID', 'ClientID', 'Commentaire']
    trimmed = df.drop(columns=dropped)
    return remove_TODOs_columns(trimmed)


def remove_TODOs_columns(df):
    """Return ``df`` without the columns whose name starts with ``TODO_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, in their original order.
    """
    is_todo = df.columns.str.startswith('TODO_')
    keep = ~is_todo
    return df.loc[:, keep]


def build_X_y(df):
    """Split a frame into the feature matrix and the ``Fraude`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Fraude`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``Fraude`` and ``y`` is the
        ``Fraude`` column as a Series.
    """
    target = 'Fraude'
    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

# Drop the unused columns first, then separate the features (X) from the 'Fraude' target (y).
X, y = build_X_y(remove_columns(df))

### Feature Engineering Functions

In [None]:
def fix_dates(X):
    """Derive calendar features from the first column of ``X``.

    Values are stringified, trimmed and upper-cased; the token ``INCONNU`` is
    treated as missing. Parsing uses ``format='mixed'`` and anything that
    cannot be parsed becomes ``NaT``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw dates.

    Returns
    -------
    pandas.DataFrame
        Columns ``jour_semaine`` (``dt.weekday``), ``mois`` (``dt.month``) and
        ``date_invalide`` (1 where the date could not be parsed, else 0).
    """
    source = X[X.columns[0]]

    cleaned = source.astype("string").str.strip().str.upper()
    cleaned = cleaned.replace("INCONNU", pd.NA)

    # NOTE(review): no dayfirst hint is given, so ambiguous day/month strings
    # follow the pandas default — confirm this matches the data.
    parsed = pd.to_datetime(cleaned, format='mixed', errors='coerce')

    features = pd.DataFrame({
        "jour_semaine": parsed.dt.weekday,
        "mois": parsed.dt.month,
    })
    features["date_invalide"] = parsed.isna().astype(int)
    return features


def fix_montant(X):
    """Convert the free-text amounts in the first column of ``X`` to numbers.

    Values are stringified, trimmed and upper-cased. Letters ``A-Z``, commas
    and whitespace are then stripped before numeric conversion; anything still
    not numeric becomes missing. Amounts whose text contained ``USD`` are
    multiplied by 130, and amounts whose text contained ``K`` by 1000.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw amounts.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column, with the numeric amounts.
    """
    text = X[X.columns[0]].astype('string').str.strip().str.upper()

    # Record the needed conversions before the markers are stripped away.
    in_thousands = text.str.contains('K', na=False)
    in_dollars = text.str.contains('USD', na=False)

    # Strip unit letters, thousands separators and spaces, then parse.
    digits = text.str.replace(r'[A-Z,\s]', '', regex=True)
    amount = pd.to_numeric(digits, errors='coerce')

    # 130 is the hard-coded multiplier applied to USD amounts.
    amount = amount.mask(in_dollars, amount * 130)
    amount = amount.mask(in_thousands, amount * 1000)

    return pd.DataFrame(amount)

def normalize_devise(X):
    """Normalise the currency labels in the first column of ``X``.

    Labels are stringified, trimmed and upper-cased, and every occurrence of
    ``USD`` is rewritten to ``HTG``. Missing values stay missing.

    NOTE(review): relabelling USD as HTG presumably relies on the amount
    having been converted elsewhere — confirm against the amount cleaning.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the currency labels.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column.
    """
    labels = X[X.columns[0]].astype("string")
    labels = labels.str.strip().str.upper()
    labels = labels.str.replace('USD', 'HTG', regex=False)
    return labels.to_frame()

def fix_anciennete(X):
    """Convert the free-text account age in the first column of ``X`` to days.

    Values are stringified, lower-cased and trimmed. The tokens ``inconnu``,
    ``nan`` and ``na`` are treated as missing. Entries containing ``year`` are
    multiplied by 365, entries containing the letter ``m`` by 30; every other
    numeric entry is kept as is.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw account age.

    Returns
    -------
    pandas.DataFrame
        Columns ``anciennete_jours`` (numeric, missing where unparseable) and
        ``anciennete_invalide`` (1 where ``anciennete_jours`` is missing).
    """
    text = X[X.columns[0]].astype("string").str.lower().str.strip()

    # Blank out the explicit "unknown" tokens.
    unknown = text.isin(["inconnu", "nan", "na"])
    text = text.mask(unknown, pd.NA)

    # Detect the unit before the letters are removed.
    in_years = text.str.contains("year", na=False)
    in_months = text.str.contains("m", na=False)

    # Keep digits and dots only, then parse.
    days = pd.to_numeric(text.str.replace(r"[^\d.]", "", regex=True), errors="coerce")

    # Years and months are converted to days with fixed factors.
    days = days.mask(in_years, days * 365)
    days = days.mask(in_months, days * 30)

    return pd.DataFrame({
        "anciennete_jours": days,
        "anciennete_invalide": days.isna().astype(int),
    })

def fix_revenu(X):
    """Convert the free-text income in the first column of ``X`` to numbers.

    Values are stringified, trimmed and upper-cased. ``INCONNU`` and ``NAN``
    are treated as missing. Every character other than a digit or a dot is
    removed before parsing, and unparseable values become missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw income.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column, with the numeric income.
    """
    text = (
        X[X.columns[0]]
        .astype("string")
        .str.strip()
        .str.upper()
        # Explicit "unknown" markers become missing values.
        .replace(["INCONNU", "NAN"], pd.NA)
    )

    # Keep digits and dots only, then parse.
    digits_only = text.str.replace(r"[^\d.]", "", regex=True)
    income = pd.to_numeric(digits_only, errors="coerce")

    return pd.DataFrame(income)

def normalize_ville(X):
    """Map the messy city spellings in the first column of ``X`` to canonical names.

    Values are stringified, lower-cased and trimmed. The tokens ``inconnu``,
    ``nan``, ``none`` and ``<na>`` become missing. Known aliases are replaced
    by the canonical city name; any other value is returned in its
    lower-cased, trimmed form.

    Alias matching ignores accents and treats hyphens and runs of whitespace
    as a single space, so ``cap-haitien``, ``cap haïtien`` and ``cap haitien``
    all resolve to ``Cap-Haïtien``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw city names.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column.
    """
    col = X.columns[0]
    s = X[col].astype('str').str.lower().str.strip()

    is_na = s.isin(['inconnu', 'nan', 'none', '<na>'])
    s = s.where(~is_na, np.nan)

    names = {
        'Port-au-Prince': ['port-au-prince', 'p-au-p', 'port au prince', 'pap'],
        'Gonaïves': ['gonaïves', 'gonaives'],
        'Hinche': ['hin', 'hinche'],
        'Cap-Haïtien': ['cap-haïtien', 'cap', 'cap haitien'],
        'Les Cayes': ['les cayes', 'cayes'],
        'Jacmel': ['jacmel'],
    }

    def _fold(values):
        # Comparison key: accents dropped, hyphens/whitespace collapsed to one space.
        return (
            values.str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('ascii')
            .str.replace(r'[\s\-]+', ' ', regex=True)
            .str.strip()
        )

    aliases = [(alias, city) for city, messy_names in names.items() for alias in messy_names]
    alias_keys = _fold(pd.Series([alias for alias, _ in aliases], dtype=object))
    lookup = dict(zip(alias_keys, [city for _, city in aliases]))

    # Values with no matching alias keep their cleaned (unfolded) spelling.
    canonical = _fold(s).map(lookup)
    s = canonical.where(canonical.notna(), s)

    return pd.DataFrame(s)

def clean_age(X):
    """Convert the first column of ``X`` to numeric ages limited to [16, 100].

    Non-numeric entries become missing. Values below 16 are raised to 16 and
    values above 100 are lowered to 100.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw ages (any column name).

    Returns
    -------
    pandas.DataFrame
        One column named ``Age``, with the same index as ``X``.
    """
    col = X.columns[0]
    ages = pd.to_numeric(X[col], errors="coerce").clip(lower=16, upper=100)
    # to_frame(name=...) renames the column. Building the frame with
    # columns=["Age"] would instead select by label and lose the data whenever
    # the input column is not already called "Age".
    return ages.to_frame(name="Age")

def clean_dette(X):
    """Convert the first column of ``X`` to numbers.

    Entries that cannot be parsed as numbers become missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw debt values.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column.
    """
    first_column = X.columns[0]
    debt = pd.to_numeric(X[first_column], errors="coerce")
    return debt.to_frame()

def clean_employe(X):
    """Normalise the yes/no employment flag in the first column of ``X``.

    Values are stringified, trimmed and upper-cased. ``YES``/``OUI`` become
    ``Oui`` and ``NO``/``NON`` become ``Non``. ``INCONNU`` and the stringified
    forms of missing values (``NAN``, ``NONE``, ``<NA>``) become ``np.nan`` so
    that a downstream imputer can fill them. Any other value is returned in
    its upper-cased form.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw flag.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column.
    """
    col = X.columns[0]
    s = X[col].astype("str").str.strip().str.upper()

    labels = {
        "YES": "Oui",
        "OUI": "Oui",
        "NO": "Non",
        "NON": "Non",
    }
    # astype("str") turns NaN / None / pd.NA into these literal tokens; they
    # must be converted back to real missing values, otherwise they survive as
    # categories of their own.
    missing_tokens = ["INCONNU", "NAN", "NONE", "<NA>"]

    is_missing = s.isin(missing_tokens)
    s = s.replace(labels)
    s = s.where(~is_missing, np.nan)

    return pd.DataFrame(s)

def clean_niveau_etude(X):
    """Normalise the education level in the first column of ``X``.

    Values are stringified, trimmed and then capitalised (first letter upper,
    the rest lower). ``Inconnu`` and the stringified forms of missing values
    (``Nan``, ``None``, ``<na>``) become ``np.nan``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the raw education level.

    Returns
    -------
    pandas.DataFrame
        One column, named like the input column.
    """
    col = X.columns[0]
    # Strip before capitalising: with a leading space, capitalize() would act
    # on the space and leave the word fully lower-cased.
    s = X[col].astype('str').str.strip().str.capitalize()

    missing_tokens = ["Inconnu", "Nan", "None", "<na>"]
    s = s.where(~s.isin(missing_tokens), np.nan)

    return pd.DataFrame(s)

# Columns routed together to the shared categorical pipeline by the preprocessor.
categorical_nominal_cols = [
    'Canal', 'Device', 'TypeMarchand', 'StatutMarital'
]

# Explicit category order handed to the OrdinalEncoder (one inner list per encoded column).
niveau_etude_order = [["Primaire", "Secondaire", "Licence", "Master", "Doctorat"]]


### Preprocessing

In [None]:
# Date: extract weekday / month / invalid-flag features, fill gaps with the
# column median, then standardise. All three derived columns go through the
# same imputer and scaler.
date_pipeline = Pipeline([
    ('extract_date_features', FunctionTransformer(fix_dates)),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Amount: parse the free-text amount, fill gaps with the median, standardise.
montant_pipeline = Pipeline([
    ('convert', FunctionTransformer(fix_montant)),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Currency: normalise the label, then one-hot encode it.
# handle_unknown='ignore' matches the other one-hot pipelines in this cell: a
# label absent from the fitted data is encoded as all zeros instead of raising
# at transform time (which can happen inside cross-validation folds).
# NOTE(review): there is no imputation step here, so missing labels reach the
# encoder as-is — confirm the column has no missing values.
devise_pipeline = Pipeline([
    ('normalize', FunctionTransformer(normalize_devise)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# City: canonicalise the spelling, fill gaps with the most frequent city,
# one-hot encode (categories unseen at fit time are ignored).
ville_pipeline = Pipeline([
    ('normalize', FunctionTransformer(normalize_ville)),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

# Age: coerce to numeric and clip, fill gaps with the median, standardise.
age_pipeline = Pipeline([
    ('clean', FunctionTransformer(clean_age)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Debt: coerce to numeric, fill gaps with the median, standardise.
dette_pipeline = Pipeline([
    ('clean', FunctionTransformer(clean_dette)),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Employment flag: normalise to Oui/Non, fill gaps with the most frequent
# value, one-hot encode.
# handle_unknown='ignore' matches the other one-hot pipelines in this cell: a
# value absent from the fitted data is encoded as all zeros instead of raising
# at transform time (which can happen inside cross-validation folds).
employe_pipeline = Pipeline([
    ('clean', FunctionTransformer(clean_employe)),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

# Education level: normalise the label, fill gaps with the most frequent
# level, encode as an ordered integer following niveau_etude_order.
# NOTE(review): the encoder is given a fixed category list and no
# handle_unknown option, so a cleaned label outside that list is expected to
# raise — confirm the data only contains the five listed levels.
etude_pipeline = Pipeline([
    ('clean', FunctionTransformer(clean_niveau_etude)),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(categories=niveau_etude_order))
])

# Transaction count: no cleaning step; median imputation then standardisation.
nb_transaction_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Account age: convert to days (plus an invalid flag), median-impute, standardise.
anciennete_pipeline = Pipeline([
    ('convert', FunctionTransformer(fix_anciennete)),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Income: parse the free-text value, median-impute, standardise.
revenu_pipeline = Pipeline([
    ('convert', FunctionTransformer(fix_revenu)),
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Shared pipeline for the columns in categorical_nominal_cols: fill gaps with
# the most frequent value, one-hot encode (unseen categories are ignored).
categorical_nominal_variables_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each raw column (or column group) to its dedicated pipeline and
# concatenate the outputs. No `remainder` is passed, so the ColumnTransformer
# default applies to columns that are not listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('date', date_pipeline, ['DateTransaction_raw']),
        ('montant', montant_pipeline, ['Montant_raw']),
        ('devise', devise_pipeline, ['Devise_indiquee']),
        ('ville', ville_pipeline, ['Ville_raw']),
        ('age', age_pipeline, ['Age']),
        ('dette', dette_pipeline, ['Dette_raw']),
        ('employe', employe_pipeline, ['Employe']),
        ('etude', etude_pipeline, ['NiveauEtude']),
        ('cat', categorical_nominal_variables_pipeline, categorical_nominal_cols),
        ('nb_transaction', nb_transaction_pipeline, ['NbTrans_24h']),
        ('anciennete', anciennete_pipeline, ['AncienneteCompte_raw']),
        ('revenu', revenu_pipeline, ['RevenuMensuel_raw']),
    ],  # verbose=True
)

# End-to-end estimator: preprocessing followed by a logistic regression, so
# that every fit (including each cross-validation fold) re-fits the imputers,
# scalers and encoders on that fit's training data only.
full_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=100))
])

### Train/Test Split

In [None]:
# Hold out 20% of the rows as a test set. stratify=y keeps the class
# proportions of y in both parts; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

### Cross-Validation

In [None]:
# 5-fold stratified cross-validation of the full pipeline on the training
# part only; the test set is not touched here.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): the metric is accuracy. If the 'Fraude' classes are imbalanced
# it can look high for a model that rarely predicts fraud — consider recall or
# F1 (both imported at the top of the notebook).
cv_scores = cross_val_score(full_model, X_train, y_train, cv=skf, scoring='accuracy')

step("Crossed validation")
# Mean and standard deviation of the per-fold accuracy.
print(f"Précision moyenne : {cv_scores.mean():.2%}")
print(f"Écart-type : {cv_scores.std():.4f}")

### Train