In [34]:
# Import necessary libraries
import logging
import pathlib
from typing import Tuple, List

import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Notebook-level logger. The root logger is configured to emit INFO and above
# (basicConfig does nothing if the root logger already has handlers).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [9]:
# Locations of the challenge CSVs, relative to the working directory:
# training features, training labels, and test features (in that order).
X_train_file, y_train_file, X_test_file = (
    pathlib.Path(csv_path)
    for csv_path in (
        "data/X_train_G3tdtEn.csv",
        "data/Y_train_2_XPXJDyy.csv",
        "data/X_test_8skS2ey.csv",
    )
)

In [4]:
# Define the Word2Vec transformer class
class W2V(BaseEstimator, TransformerMixin):
    """Replace every text cell by one number derived from pretrained Word2Vec vectors.

    Each cell is split on whitespace, the tokens found in the vocabulary of the
    pretrained ``word2vec-google-news-300`` vectors are looked up, and the cell
    becomes the mean over *all components of all matched token vectors* (a
    single scalar). Cells with no in-vocabulary token become 0.

    NOTE(review): averaging without an ``axis`` collapses each 300-dimensional
    embedding to one scalar, which discards most of its information. Keeping a
    vector per cell (``axis=0``) would change the output shape, so the scalar
    behaviour is intentionally preserved here; confirm whether it is intended.

    Parameters
    ----------
    num_words : int or None, default=None
        Forwarded to the Keras ``Tokenizer`` created in ``__init__``.
    **kwargs
        Additional keyword arguments forwarded to ``Tokenizer``. They are not
        reported by ``get_params``, so anything that rebuilds the estimator
        from ``get_params`` will not carry them over.

    Attributes
    ----------
    tokenizer : Tokenizer
        Built in ``__init__``; not used by ``fit`` or ``transform``.
    Word2 : gensim KeyedVectors
        Pretrained vectors, available after ``fit``.
    """

    def __init__(self, num_words=None, **kwargs):
        self.num_words = num_words
        self.tokenizer = Tokenizer(num_words=num_words, **kwargs)

    def fit(self, X, y=None):
        """Load the pretrained vectors; ``X`` and ``y`` are ignored.

        The vectors are loaded through gensim's downloader on every call.
        """
        self.Word2 = api.load("word2vec-google-news-300")
        return self

    def transform(self, X, y=None):
        """Map a 2-D table of text cells to a float array of the same shape.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Table whose cells are whitespace-separated text.
        y : ignored

        Returns
        -------
        numpy.ndarray of float64, same shape as ``X``
            Per-cell scalar embedding mean; 0.0 where a cell has no known token
            or is not a string (e.g. NaN).
        """
        cells = np.asarray(X, dtype=object)
        features = np.zeros(cells.shape, dtype=np.float64)

        vectors = self.Word2
        vocabulary = vectors.key_to_index

        # The same strings recur in many cells, so each distinct text is
        # embedded once and its score reused.
        score_by_text = {}
        for position, text in np.ndenumerate(cells):
            if not isinstance(text, str):
                # Missing / non-text cell: treated like an empty string (stays 0.0).
                continue
            score = score_by_text.get(text)
            if score is None:
                embeddings = [
                    vectors[token] for token in text.split() if token in vocabulary
                ]
                # Mean over every component of every matched vector (scalar).
                score = float(np.mean(embeddings)) if embeddings else 0.0
                score_by_text[text] = score
            features[position] = score
        return features

    def get_params(self, deep=True):
        """Return only ``num_words``; Tokenizer ``**kwargs`` are not included."""
        return {"num_words": self.num_words}

In [5]:
def load_train_df(path: pathlib.Path) -> pd.DataFrame:
    """Read a feature CSV, parsing the per-slot text columns as strings.

    The columns ``item1..item24``, ``make1..make24``, ``model1..model24`` and
    ``goods_code1..goods_code24`` are read with dtype ``str``; all other
    columns keep whatever dtype pandas infers.

    Parameters
    ----------
    path : pathlib.Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed table.
    """
    string_dtypes = {}
    for prefix in ("item", "make", "model", "goods_code"):
        for slot in range(1, 25):
            string_dtypes[f"{prefix}{slot}"] = str
    return pd.read_csv(path, dtype=string_dtypes)


def load_test_df(path: pathlib.Path) -> pd.Series:
    """Read a CSV and return its ``fraud_flag`` column as a Series.

    Parameters
    ----------
    path : pathlib.Path
        Location of a CSV file containing a ``fraud_flag`` column.

    Returns
    -------
    pd.Series
        The ``fraud_flag`` column.
    """
    labels_frame = pd.read_csv(path)
    return labels_frame["fraud_flag"]

In [6]:
def df_to_input(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Prepare a raw feature frame for the preprocessing pipeline.

    Drops ``ID`` and ``goods_code1..goods_code24``, then fills missing values:
    empty string for the text columns (``make``/``item``/``model`` slots 1-24)
    and 0 for every remaining column. The frame passed in is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame containing ``ID`` and the 24-slot column families.

    Returns
    -------
    tuple
        ``(cleaned frame, text column names, remaining column names)``.
    """
    slots = range(1, 25)

    # Remove the row identifier and every goods_code slot (drop returns a new frame).
    unwanted = ["ID"] + [f"goods_code{slot}" for slot in slots]
    df = df.drop(columns=unwanted)

    # Text columns, ordered make1..24, item1..24, model1..24.
    categorical_columns = [
        f"{base}{slot}" for base in ("make", "item", "model") for slot in slots
    ]

    # Everything that is not a text column is treated as numerical.
    text_names = set(categorical_columns)
    numerical_columns = [name for name in df.columns if name not in text_names]

    # Fill missing values group-wise: "" for text, 0 for numbers.
    df[categorical_columns] = df[categorical_columns].fillna("")
    if numerical_columns:
        df[numerical_columns] = df[numerical_columns].fillna(0)

    return df, categorical_columns, numerical_columns

In [19]:
# Load the training features and labels. load_test_df simply returns the
# "fraud_flag" column of a CSV, so it is used here for the label file.
X_train_df = load_train_df(X_train_file)
y_train_df = load_test_df(y_train_file)

# Rebind X_train_df to its cleaned version (ID / goods_code columns dropped,
# missing values filled) and keep the two column-name lists for the preprocessor.
X_train_df, categorical_columns, numerical_columns = df_to_input(X_train_df)

In [41]:
# Hold out 20% of the rows, stratified on the label.
# NOTE: X_val / y_val are not used by any of the cells shown in this notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_df,
    y_train_df,
    test_size=0.2,
    stratify=y_train_df,
    random_state=0,
)

# Preprocessing: Word2Vec-based scores for the text columns, standard scaling
# for the remaining (numerical) columns.
cat_pipeline = make_pipeline(W2V())
num_pipeline = make_pipeline(StandardScaler())
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_pipeline", cat_pipeline, categorical_columns),
        ("num_pipeline", num_pipeline, numerical_columns),
    ]
)

# Full estimator: preprocessing followed by a random forest.
rfc = RandomForestClassifier(random_state=0, verbose=True)
pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", rfc)],
    verbose=True,
)

# Hyper-parameters to search; the "model__" prefix addresses the pipeline's
# "model" step. Not searched (left commented out earlier): max_features, criterion.
param_grid = {
    "model__n_estimators": [400, 500, 700],
    "model__max_depth": [10, 20, 30],
}

# Five stratified shuffle splits, each validating on 20% of the rows.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Exhaustive search scored by average precision, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="average_precision",
    cv=sss,
    n_jobs=-1,
    verbose=True,
)

In [42]:
# Run the grid search on the 80% training split: 9 parameter combinations x 5
# stratified shuffle splits (45 pipeline fits); the log below then shows one
# further fit of the pipeline after the search.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  44.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  45.2s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  45.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  46.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  45.7s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.3s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  46.7s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.5s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.0s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   10.6s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  12.5s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   11.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ............. (step 2 of 2) Processing model, total=  13.4s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   11.6s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  11.8s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   11.5s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.4s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  11.9s
[Pipeline] ............. (step 2 of 2) Processing model, total=  16.3s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   11.9s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  12.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.3s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  14.6s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.0s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  14.3s
[Pipeline] ............. (step 2 of 2) Processing model, total=  14.4s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   14.4s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  14.6s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   19.5s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  19.8s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   19.7s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  20.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.7s finished


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  52.1s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  46.8s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  51.2s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  48.3s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  50.5s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  50.8s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  46.9s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  50.1s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.9s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  43.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  43.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   17.5s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   17.1s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  17.7s
[Pipeline] ............. (step 2 of 2) Processing model, total=  17.4s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   17.5s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  17.8s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   18.4s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   18.2s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  18.4s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   20.7s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  23.7s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   21.2s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   21.2s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  24.5s
[Pipeline] ............. (step 2 of 2) Processing model, total=  21.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.6s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   21.4s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   21.3s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.6s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   22.1s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  22.4s
[Pipeline] ............. (step 2 of 2) Processing model, total=  23.1s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   22.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  46.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  45.7s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  44.5s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  48.0s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  48.2s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  48.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.3s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  44.6s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  45.3s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  44.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.1s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  47.9s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   17.6s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  17.8s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   18.3s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  18.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   18.4s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  18.6s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   19.1s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   19.4s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  19.3s
[Pipeline] ............. (step 2 of 2) Processing model, total=  22.3s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   22.4s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  22.6s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   22.5s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  25.5s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   31.0s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   30.7s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  34.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=  33.8s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   31.0s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  34.0s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   31.7s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   31.5s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  34.8s
[Pipeline] ............. (step 2 of 2) Processing model, total=  34.6s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    1.2s finished


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  38.5s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  39.0s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  41.4s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  37.4s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  35.7s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  39.8s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  42.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  31.9s
[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  32.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   18.5s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  20.6s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   18.9s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.0s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   19.1s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.3s


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   19.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ............. (step 2 of 2) Processing model, total=  21.6s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   24.7s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  27.1s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   25.6s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  27.5s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   26.0s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  28.0s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   24.0s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  24.8s


[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:   24.2s finished


[Pipeline] ............. (step 2 of 2) Processing model, total=  25.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done 700 out of 700 | elapsed:    0.8s finished
INFO:gensim.models.keyedvectors:loading projection weights from /Users/charles/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/charles/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-11-13T22:34:20.864543', 'gensim': '4.3.0', 'python': '3.10.13 (main, Sep 11 2023, 08:24:56) [Clang 14.0.6 ]', 'platform': 'macOS-14.0-arm64-arm-64bit', 'event': 'load_word2vec_format'}


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  25.2s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ............. (step 2 of 2) Processing model, total=  12.9s


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   12.7s finished


In [45]:
# Selected hyper-parameters and the corresponding cross-validation score
# (the search was configured with scoring="average_precision").
print(grid.best_params_)
print(grid.best_score_)

{'model__max_depth': 20, 'model__n_estimators': 400}
0.18999914433325743


In [47]:
# Load the test features with the same string dtypes and cleaning as the
# training frame. Only the cleaned frame is needed; indexing the result (rather
# than unpacking into `_`) avoids overwriting IPython's last-output variable.
X_test_df = df_to_input(load_train_df(X_test_file))[0]

In [55]:
# Turn the pipeline-scoped keys found by the search ("model__<param>") into plain
# RandomForestClassifier keyword arguments. Only a leading "model__" is removed,
# so a key that merely contains that text elsewhere is left untouched.
_MODEL_PREFIX = "model__"
best_params = {
    (name[len(_MODEL_PREFIX):] if name.startswith(_MODEL_PREFIX) else name): value
    for name, value in grid.best_params_.items()
}

In [61]:
# Display the keyword arguments derived from the grid search result.
best_params

{'max_depth': 20, 'n_estimators': 400}

In [56]:
# Final estimator: the same preprocessor object used by the search pipeline,
# followed by a fresh random forest configured with the selected parameters.
final_model = RandomForestClassifier(random_state=0, **best_params)
full_pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", final_model)],
    verbose=True,
)

In [60]:
# Sanity check: the cleaned test and training frames should have the same
# number of columns.
print(X_test_df.shape)
print(X_train_df.shape)

(23198, 121)
(92790, 121)


In [64]:
# Fit on the whole cleaned training frame (not only the 80% split used for the
# search), then compute class probabilities for the test rows.
full_pipeline.fit(X_train_df, y_train_df)
out = full_pipeline.predict_proba(X_test_df)

INFO:gensim.models.keyedvectors:loading projection weights from /Users/charles/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/charles/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-11-13T22:50:01.476979', 'gensim': '4.3.0', 'python': '3.10.13 (main, Sep 11 2023, 08:24:56) [Clang 14.0.6 ]', 'platform': 'macOS-14.0-arm64-arm-64bit', 'event': 'load_word2vec_format'}


[Pipeline] ........ (step 1 of 2) Processing preprocess, total=  25.3s
[Pipeline] ............. (step 2 of 2) Processing model, total=  16.6s


In [68]:
# Second probability column.
# NOTE(review): assumes column 1 corresponds to fraud_flag == 1 — confirm
# against full_pipeline.classes_.
out_ = out[:, 1]

# Only the ID column is needed from the raw test file; usecols avoids parsing
# and type-inferring every other column (the full read emitted a pandas warning).
IDs = pd.read_csv(X_test_file, usecols=["ID"])["ID"]

df = pd.DataFrame({"ID": IDs, "fraud_flag": out_})
# reset_index() turns the 0..n-1 row index into an explicit "index" column, so
# the written file has the columns index, ID, fraud_flag.
df = df.reset_index()
df.to_csv("out.csv", index=False)
df.describe()

  IDs = pd.read_csv(X_test_file)["ID"]


Unnamed: 0,index,ID,fraud_flag
count,23198.0,23198.0,23198.0
mean,11598.5,58091.621605,0.014031
std,6696.830108,33465.131873,0.036607
min,0.0,3.0,0.0
25%,5799.25,29355.5,0.000515
50%,11598.5,58128.0,0.003159
75%,17397.75,87016.75,0.015065
max,23197.0,115987.0,0.968746


In [None]:
# Histogram of the predicted fraud probabilities written to out.csv.
df["fraud_flag"].hist()