In [2]:
# Import necessary libraries
import logging

import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Define the Word2Vec transformer class
class W2V(BaseEstimator, TransformerMixin):
    """Turn text cells into one numeric feature per cell using Word2Vec.

    Every cell is split on whitespace; each token present in the pretrained
    ``word2vec-google-news-300`` vocabulary is looked up, and the cell is
    replaced by the mean over *all* values of the looked-up embeddings, i.e.
    a single scalar. Cells without any known token (including empty strings
    and non-string values such as NaN) become ``0.0``.

    Parameters
    ----------
    num_words : int or None, default=None
        Forwarded to the Keras ``Tokenizer`` stored on ``self.tokenizer``.
    **kwargs
        Additional keyword arguments forwarded to ``Tokenizer``.
    """

    # Cache shared by all instances of this class in the current process, so
    # that repeated fits (e.g. one per fold/candidate in a grid search) do not
    # reload the pretrained vectors each time.
    _shared_vectors = None

    def __init__(self, num_words=None, **kwargs):
        self.num_words = num_words
        # fit/transform below do not use the tokenizer; it is kept because it
        # is a public attribute of the transformer.
        self.tokenizer = Tokenizer(num_words=num_words, **kwargs)

    def fit(self, X, y=None):
        """Load the pretrained vectors (once per process) and return ``self``.

        ``X`` and ``y`` are ignored; they exist for scikit-learn compatibility.
        """
        if W2V._shared_vectors is None:
            W2V._shared_vectors = api.load("word2vec-google-news-300")
        self.Word2 = W2V._shared_vectors
        return self

    def transform(self, X, y=None):
        """Return a float array shaped like ``X`` with one scalar per cell.

        Parameters
        ----------
        X : DataFrame or array-like of str
            Cells to embed. Non-string cells are treated as having no tokens.
        y : ignored

        Returns
        -------
        numpy.ndarray of float64, same shape as ``X``.
        """
        # np.asarray accepts DataFrames and plain arrays alike and leaves the
        # caller's data untouched because results go into a fresh array.
        cells = np.asarray(X, dtype=object)
        vectors = self.Word2
        features = np.zeros(cells.shape, dtype=np.float64)
        for index, text in np.ndenumerate(cells):
            if not isinstance(text, str):
                # NaN / None / numbers have no tokens -> keep the 0.0 default.
                continue
            embeddings = [
                vectors[token]
                for token in text.split()
                if token in vectors.key_to_index
            ]
            if embeddings:
                # NOTE(review): no axis is given, so this averages over tokens
                # AND embedding dimensions, yielding one scalar per cell.
                # Confirm this is intended rather than a per-dimension mean.
                features[index] = np.mean(embeddings)
        return features

    def get_params(self, deep=True):
        """Return the constructor parameters tracked for cloning."""
        return {"num_words": self.num_words}


# Load the dataset
X_train_file = "../data/X_train.csv"
y_train_file = "../data/Y_train.csv"

# Read every per-slot item/make/model/goods_code column (suffixes 1..24) as
# str instead of letting pandas infer a dtype for each of them.
mixed_columns = (
    ["item" + str(i) for i in range(1, 25)]
    + ["make" + str(i) for i in range(1, 25)]
    + ["model" + str(i) for i in range(1, 25)]
    + ["goods_code" + str(i) for i in range(1, 25)]
)
mixed_columns_dtype = {col: str for col in mixed_columns}

# Given a path, pandas opens and closes the file itself, so no explicit
# open() is needed; both files are read the same way.
X_train_df = pd.read_csv(X_train_file, dtype=mixed_columns_dtype)
y_train_df = pd.read_csv(y_train_file)

# Drop the row identifier and all goods_code slots from the features.
cols_base = ["goods_code"]
columns_to_drop = ["ID"] + [col + str(i) for col in cols_base for i in range(1, 25)]

X_train_df = X_train_df.drop(columns_to_drop, axis=1)
# Keep only the target column as a Series.
y_train_df = y_train_df["fraud_flag"]

# Text slots fed to the Word2Vec transformer: make1..make24, item1..item24,
# model1..model24 (extend the prefix tuple to add more).
rnn_columns = [
    f"{prefix}{slot}" for prefix in ("make", "item", "model") for slot in range(1, 25)
]

# Everything that is not a text slot is treated as numerical.
categorical_columns = rnn_columns
numerical_columns = [
    name for name in X_train_df.columns if name not in categorical_columns
]

# Replace missing values: empty string for text slots, 0 for numerical columns.
for columns, fill_value in ((categorical_columns, ""), (numerical_columns, 0)):
    for name in columns:
        X_train_df[name] = X_train_df[name].fillna(fill_value)

# Define transformers: Word2Vec features for the text slots, standardisation
# for the remaining columns.
cat_pipeline = make_pipeline(W2V())
num_pipeline = make_pipeline(StandardScaler())

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_pipeline", cat_pipeline, categorical_columns),
        ("num_pipeline", num_pipeline, numerical_columns),
    ]
)

rfc = RandomForestClassifier(random_state=42, verbose=True)
# make_pipeline takes the estimators themselves, not (name, estimator)
# tuples, and names each step after its lowercased class name.
pipeline = make_pipeline(preprocessor, rfc, verbose=True)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_df, y_train_df, test_size=0.3, random_state=42
)

# The search runs over the whole pipeline, so every key must be addressed as
# "<step name>__<parameter>"; the forest step is "randomforestclassifier".
param_grid = {
    "randomforestclassifier__n_estimators": [100, 500],
    # "randomforestclassifier__max_features": ["sqrt", "log2"],
    "randomforestclassifier__max_depth": [None, 13, 20, 50],
    # "randomforestclassifier__criterion": ["gini", "entropy"],
    # "randomforestclassifier__class_weight": [{1: 100}, None],
}

# Define the grid search over the pipeline's hyperparameters
grid = GridSearchCV(
    estimator=pipeline,
    scoring="average_precision",
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# Fit the grid search (each candidate fits a clone of the pipeline)
grid.fit(X_train, y_train)

# Evaluate using average_precision_score. The grid search is the object that
# was fitted (it works on clones, so `pipeline` itself stays unfitted);
# predict_proba on it delegates to the refitted best estimator.
y_pred = grid.predict_proba(X_val)
average_precision = average_precision_score(y_val, y_pred[:, 1]) * 100
logger.info(f"Average precision score: {average_precision}")

# Save the trained model
import joblib

model_filename = "trained_rf_classifier.pkl"
# Persist the fitted best pipeline rather than the unfitted template.
joblib.dump(grid.best_estimator_, model_filename)
logger.info(f"Trained model saved as {model_filename}")

# Load and use the trained model for predictions.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
loaded_pipeline = joblib.load(model_filename)
sample_input = X_val.iloc[:5, :]  # Take a sample input for prediction
sample_output = loaded_pipeline.predict_proba(sample_input)
logger.info(f"Sample input predictions: {sample_output}")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


FileNotFoundError: [Errno 2] No such file or directory: '../data/X_train.csv'

In [None]:
# Import necessary libraries
import logging

import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Define the Word2Vec transformer class
class W2V(BaseEstimator, TransformerMixin):
    """Turn text cells into one numeric feature per cell using Word2Vec.

    Every cell is split on whitespace; each token present in the pretrained
    ``word2vec-google-news-300`` vocabulary is looked up, and the cell is
    replaced by the mean over *all* values of the looked-up embeddings, i.e.
    a single scalar. Cells without any known token (including empty strings
    and non-string values such as NaN) become ``0.0``.

    Parameters
    ----------
    num_words : int or None, default=None
        Forwarded to the Keras ``Tokenizer`` stored on ``self.tokenizer``.
    **kwargs
        Additional keyword arguments forwarded to ``Tokenizer``.
    """

    # Cache shared by all instances of this class in the current process, so
    # that repeated fits (e.g. one per fold/candidate in a grid search) do not
    # reload the pretrained vectors each time.
    _shared_vectors = None

    def __init__(self, num_words=None, **kwargs):
        self.num_words = num_words
        # fit/transform below do not use the tokenizer; it is kept because it
        # is a public attribute of the transformer.
        self.tokenizer = Tokenizer(num_words=num_words, **kwargs)

    def fit(self, X, y=None):
        """Load the pretrained vectors (once per process) and return ``self``.

        ``X`` and ``y`` are ignored; they exist for scikit-learn compatibility.
        """
        if W2V._shared_vectors is None:
            W2V._shared_vectors = api.load("word2vec-google-news-300")
        self.Word2 = W2V._shared_vectors
        return self

    def transform(self, X, y=None):
        """Return a float array shaped like ``X`` with one scalar per cell.

        Parameters
        ----------
        X : DataFrame or array-like of str
            Cells to embed. Non-string cells are treated as having no tokens.
        y : ignored

        Returns
        -------
        numpy.ndarray of float64, same shape as ``X``.
        """
        # np.asarray accepts DataFrames and plain arrays alike and leaves the
        # caller's data untouched because results go into a fresh array.
        cells = np.asarray(X, dtype=object)
        vectors = self.Word2
        features = np.zeros(cells.shape, dtype=np.float64)
        for index, text in np.ndenumerate(cells):
            if not isinstance(text, str):
                # NaN / None / numbers have no tokens -> keep the 0.0 default.
                continue
            embeddings = [
                vectors[token]
                for token in text.split()
                if token in vectors.key_to_index
            ]
            if embeddings:
                # NOTE(review): no axis is given, so this averages over tokens
                # AND embedding dimensions, yielding one scalar per cell.
                # Confirm this is intended rather than a per-dimension mean.
                features[index] = np.mean(embeddings)
        return features

    def get_params(self, deep=True):
        """Return the constructor parameters tracked for cloning."""
        return {"num_words": self.num_words}


# Load the dataset
X_train_file = "../data/X_train.csv"
y_train_file = "../data/Y_train.csv"

# Read every per-slot item/make/model/goods_code column (suffixes 1..24) as
# str instead of letting pandas infer a dtype for each of them.
mixed_columns = (
    ["item" + str(i) for i in range(1, 25)]
    + ["make" + str(i) for i in range(1, 25)]
    + ["model" + str(i) for i in range(1, 25)]
    + ["goods_code" + str(i) for i in range(1, 25)]
)
mixed_columns_dtype = {col: str for col in mixed_columns}

# Given a path, pandas opens and closes the file itself, so no explicit
# open() is needed; both files are read the same way.
X_train_df = pd.read_csv(X_train_file, dtype=mixed_columns_dtype)
y_train_df = pd.read_csv(y_train_file)

# Drop the row identifier and all goods_code slots from the features.
cols_base = ["goods_code"]
columns_to_drop = ["ID"] + [col + str(i) for col in cols_base for i in range(1, 25)]

X_train_df = X_train_df.drop(columns_to_drop, axis=1)
# Keep only the target column as a Series.
y_train_df = y_train_df["fraud_flag"]

# Text slots fed to the Word2Vec transformer: make1..make24, item1..item24,
# model1..model24 (extend the prefix tuple to add more).
rnn_columns = [
    f"{prefix}{slot}" for prefix in ("make", "item", "model") for slot in range(1, 25)
]

# Everything that is not a text slot is treated as numerical.
categorical_columns = rnn_columns
numerical_columns = [
    name for name in X_train_df.columns if name not in categorical_columns
]

# Replace missing values: empty string for text slots, 0 for numerical columns.
for columns, fill_value in ((categorical_columns, ""), (numerical_columns, 0)):
    for name in columns:
        X_train_df[name] = X_train_df[name].fillna(fill_value)

# Column-wise preprocessing: Word2Vec features for the text slots,
# standardisation for the remaining columns.
cat_pipeline = make_pipeline(W2V())
num_pipeline = make_pipeline(StandardScaler())
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_pipeline", cat_pipeline, categorical_columns),
        ("num_pipeline", num_pipeline, numerical_columns),
    ]
)

# Hold out 30% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_df, y_train_df, test_size=0.3, random_state=42
)

# Random-forest hyperparameters explored by the grid search. Further
# candidates tried at some point: max_features, criterion, class_weight.
param_grid = dict(
    n_estimators=[100, 500],
    max_depth=[None, 13, 20, 50],
)
rfc = RandomForestClassifier(random_state=42, verbose=True)
grid = GridSearchCV(
    estimator=rfc,
    scoring="average_precision",
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# Create the pipeline: preprocessing followed by the grid search as the
# final estimator, then fit it on the training split.
pipeline = make_pipeline(preprocessor, grid, verbose=True)
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the validation split (average precision, in %).
y_pred = pipeline.predict_proba(X_val)
second_class_proba = y_pred[:, 1]
average_precision = 100 * average_precision_score(y_val, second_class_proba)
logger.info(f"Average precision score: {average_precision}")

# Persist the fitted pipeline to disk.
import joblib

model_filename = "trained_rf_classifier.pkl"
joblib.dump(pipeline, model_filename)
logger.info(f"Trained model saved as {model_filename}")

# Round-trip check: reload the saved pipeline and predict on the first five
# validation rows.
loaded_pipeline = joblib.load(model_filename)
sample_input = X_val.iloc[:5]
sample_output = loaded_pipeline.predict_proba(sample_input)
logger.info(f"Sample input predictions: {sample_output}")

In [None]:
# Display the hyperparameter combination selected by the fitted grid search.
grid.best_params_

In [None]:
# Apply the preprocessor to the training split so the grid search can be run
# directly on the transformed features.
# NOTE(review): `preprocessor` must already be fitted (e.g. by a preceding
# pipeline.fit call) for transform to succeed.
X_train_tranformed = preprocessor.transform(X_train)

In [None]:
# Wider hyperparameter search, run directly on the pre-transformed features.
param_grid = {
    "n_estimators": [100, 500, 1000],
    # "auto" is intentionally not listed: for RandomForestClassifier it was
    # an alias of "sqrt" (a duplicate candidate) and is rejected by
    # scikit-learn >= 1.3.
    "max_features": ["sqrt", "log2"],
    "max_depth": [None, 13, 20, 50],
    # "criterion": ["gini", "entropy"],
    "class_weight": [{1: 100}, None],
}

rfc = RandomForestClassifier(random_state=42, verbose=True)
# Define the grid search over the random forest's hyperparameters
grid = GridSearchCV(
    estimator=rfc,
    scoring="average_precision",
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# Fit the grid search and display the best cross-validated score
grid.fit(X_train_tranformed, y_train)
grid.best_score_

In [None]:
# Transform the complete training frame, then refit the best estimator found
# by the grid search on all of it.
# NOTE(review): only transform() is called on the preprocessor here, so the
# preprocessor itself is not refitted on the full data; confirm that is
# intended.
X_tranformed = preprocessor.transform(X_train_df)
grid.best_estimator_.fit(X_tranformed, y_train_df)


In [None]:
X_test_file = "../data/X_test_8skS2ey.csv"

# Read every per-slot item/make/model/goods_code column (suffixes 1..24) as
# str, matching how the training file is read.
mixed_columns = (
    ["item" + str(i) for i in range(1, 25)]
    + ["make" + str(i) for i in range(1, 25)]
    + ["model" + str(i) for i in range(1, 25)]
    + ["goods_code" + str(i) for i in range(1, 25)]
)
mixed_columns_dtype = {col: str for col in mixed_columns}
# Given a path, pandas opens and closes the file itself, so no explicit
# open() is needed.
X_test_df = pd.read_csv(X_test_file, dtype=mixed_columns_dtype)

# `columns_to_drop`, `rnn_columns` and the fitted `preprocessor` are defined
# by the training cell, which has to be run before this one.
X_test_df = X_test_df.drop(columns_to_drop, axis=1)

# Identify the categorical and numerical columns
categorical_columns = rnn_columns
numerical_columns = [col for col in X_test_df.columns if col not in categorical_columns]

# Clean data: empty string for missing text, 0 for missing numerical values
for col in categorical_columns:
    X_test_df[col] = X_test_df[col].fillna("")
for col in numerical_columns:
    X_test_df[col] = X_test_df[col].fillna(0)

X_test_tranformed = preprocessor.transform(X_test_df)

NameError: name 'columns_to_drop' is not defined

In [None]:
# Predict class probabilities for the transformed test set with the best
# estimator held by the grid search.
out = grid.best_estimator_.predict_proba(X_test_tranformed)

In [None]:
# Second column of the predict_proba output, used as the fraud score.
out_ = out[:, 1]

# Re-read the raw test file to recover the "ID" column, which is dropped from
# the feature frame before preprocessing. Given a path, pandas opens and
# closes the file itself, so no explicit open() is needed.
X_test_df = pd.read_csv(X_test_file, dtype=mixed_columns_dtype)
df = pd.DataFrame({"ID": X_test_df["ID"], "fraud_flag": out_})
df.to_csv("out.csv")

# Kept as the last top-level expression of the cell so the summary is
# actually rendered; nested inside a block its result would be discarded.
df.describe()

In [None]:
# Histogram of the predicted fraud_flag scores stored in the output frame.
df["fraud_flag"].hist()