In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pandas import read_csv
from sklearn.model_selection import train_test_split

train_df = read_csv("data/train.csv")

y_train_df = train_df["target"]
X_train_df = train_df.drop(columns=["target"])

X_train, X_val, y_train, y_val = train_test_split(
    X_train_df, y_train_df, test_size=0.2, random_state=42
)

print(X_train.shape, y_train.shape)
print(X_train.columns)
print(X_train.head())

(629144, 17) (629144,)
Index(['transaction_time', 'merch', 'cat_id', 'amount', 'name_1', 'name_2',
       'gender', 'street', 'one_city', 'us_state', 'post_code', 'lat', 'lon',
       'population_city', 'jobs', 'merchant_lat', 'merchant_lon'],
      dtype='object')
        transaction_time                               merch          cat_id  \
142970  2019-12-23 15:18                   fraud_Block Group        misc_pos   
3810    2019-10-19 15:03                  fraud_Hirthe-Beier  health_fitness   
491835  2019-08-12 19:55      fraud_Hauck, Dietrich and Funk       kids_pets   
293147  2019-07-16 22:18                    fraud_Herman Inc        misc_pos   
175453  2019-10-06 12:55  fraud_Streich, Dietrich and Barton    shopping_net   

        amount   name_1   name_2 gender                       street  \
142970    2.97    David   Miller      M   622 Bradley Knoll Apt. 758   
3810     76.03  William  Jenkins      M            50614 Kevin Point   
491835   81.58   Justin   Fowler     

In [3]:
# preprocess pipeline to standart scale, OHE categorical features and impute missing values

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])

categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()
categorical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
joblib.dump(preprocessor, "models/preprocessor.pkl")

X_val_processed = preprocessor.transform(X_val)

In [4]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.05,
    depth=6,
    eval_metric="AUC",
    random_seed=42,
    verbose=100
)

model.fit(X_train_processed, y_train)

0:	total: 211ms	remaining: 10.3s
49:	total: 9.39s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7477911bdca0>

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

predictions = np.argmax(model.predict_proba(X_val_processed), axis=1)
accuracy = accuracy_score(y_val, predictions)
roc_auc = roc_auc_score(y_val, model.predict_proba(X_val_processed)[:, 1])

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")

0.002956379103168094
Accuracy: 0.9963124733766936
ROC AUC: 0.9393041044758532


In [6]:
model.save_model("models/catboost_model.cbm")

In [None]:
X_test = read_csv("data/test.csv")
X_test_processed = preprocessor.transform(X_test)
test_predictions = np.argmax(model.predict_proba(X_test_processed), axis=1)

submission_df = read_csv("data/sample_submission.csv")
submission_df["prediction"] = test_predictions
submission_df.to_csv("data/submission.csv", index=False)