In [None]:
# import modules for initial data analysis

import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import warnings

def _count_previous_attempts(df, minutes_window=5):
    """Count, for every row, the earlier attempts with equal amount and country.

    An earlier attempt of a row is any row with the same ``amount`` and
    ``country`` whose ``tmsp`` lies in the half-open interval
    ``[row.tmsp - minutes_window, row.tmsp)``. Rows sharing the exact same
    timestamp therefore do not count each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``amount``, ``country`` and ``tmsp``.
    minutes_window : int or float, default 5
        Length of the look-back window in minutes.

    Returns
    -------
    numpy.ndarray
        Integer counts, aligned positionally with the rows of ``df``.
    """
    window = pd.Timedelta(minutes=minutes_window).to_timedelta64()
    stamps = df["tmsp"].to_numpy()
    counts = np.zeros(len(df), dtype=np.int64)

    # One sorted pass per (amount, country) group replaces the original
    # row-by-row scan of the whole frame, which was quadratic in the row count.
    grouped_positions = df.groupby(["amount", "country"], sort=False).indices
    for positions in grouped_positions.values():
        group_stamps = stamps[positions]
        ordered = np.sort(group_stamps)
        # side="left" yields the number of group members strictly below the bound.
        before_row = np.searchsorted(ordered, group_stamps, side="left")
        before_window = np.searchsorted(ordered, group_stamps - window, side="left")
        counts[positions] = before_row - before_window

    return counts


def get_base_data(fp):
    """Read the raw Excel export and derive the additional analysis columns.

    Adds ``hour_of_day`` and ``day_of_week`` (both taken from ``tmsp``) and
    ``previous_attempts`` (see ``_count_previous_attempts``).

    Side effect: the enriched frame is pickled to ``df2.pkl`` in the current
    working directory.
    NOTE(review): the notebook's load cell reads its cache from the ``in``
    folder under the name ``df.pkl``, so this output has to be moved/renamed
    by hand before it is picked up - confirm whether that is intended.

    Parameters
    ----------
    fp : str or path-like
        Location of the Excel file; passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame including the derived columns.
    """
    df = pd.read_excel(fp)

    df["hour_of_day"] = df["tmsp"].dt.hour
    df["day_of_week"] = df["tmsp"].dt.dayofweek
    df['previous_attempts'] = _count_previous_attempts(df, minutes_window=5)

    df.to_pickle("df2.pkl")

    return df

def get_base_data_from_pickle(fp):
    """Return the DataFrame stored in the pickle file at ``fp``.

    NOTE(review): unpickling can execute arbitrary code - only load pickle
    files that were produced by this project.
    """
    cached_frame = pd.read_pickle(fp)
    return cached_frame

In [None]:
# Load the base DataFrame.
# Option 1 (slow): rebuild it from the raw Excel export and compute the additional fields.
#df = get_base_data(r"G:\My Drive\masterDataScience\ModelEngineering\use_case_1\PSP_Jan_Feb_2019.xlsx")

# Option 2 (default): load the cached frame that already holds the additional fields.
# A forward slash is used so the relative path resolves on Windows and POSIX alike
# (a backslash is only a path separator on Windows).
df = get_base_data_from_pickle("in/df.pkl")


In [None]:
# plot histograms for categorical columns

nrows, ncols = 3, 2
font_size = 20
categorical_columns = ["country", "success", "PSP", "3D_secured", "card"]

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 8))
axes = axes.flatten()

# Cast everything to string so each column is plotted as a set of categories.
df_str = df.astype(str)

for ax, column in zip(axes, categorical_columns):
    ax.hist(df_str[column])
    ax.set_title(f'Histogram of {column}', fontsize=font_size)
    ax.set_xlabel(column, fontsize=font_size)
    ax.set_ylabel('Frequency', fontsize=font_size)
    ax.tick_params(axis='both', labelsize=10)

    print(column)
    print(df_str[column].unique())

# Remove the grid cells that received no plot. The previous loop started at
# len(df_str.columns), i.e. the number of columns of the whole frame, so it
# never covered the unused panel and the empty axes stayed in the figure.
for unused_ax in axes[len(categorical_columns):]:
    fig.delaxes(unused_ax)

fig.tight_layout()

plt.show()


In [None]:
# plot kernel densities for numerical columns

nrows, ncols = 2, 1
font_size = 20
numeric_columns = ["amount", "numeric_tmsp"]

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 8))
axes = axes.flatten()

# Express the timestamp as seconds elapsed since the earliest timestamp so
# that a kernel density can be estimated on it.
df['numeric_tmsp'] = (df['tmsp'] - df['tmsp'].min()).dt.total_seconds()

for ax, column in zip(axes, numeric_columns):
    values = df[column]
    kde = gaussian_kde(values)
    x = np.linspace(values.min(), values.max(), 100)
    y = kde(x)
    ax.plot(x, y, linewidth=2)  # Plot the KDE
    ax.set_title(f'Kernel Density Estimation of {column}', fontsize=font_size)
    ax.set_xlabel(column, fontsize=font_size)
    ax.set_ylabel('Density', fontsize=font_size)
    ax.tick_params(axis='both', labelsize=font_size)

    print(column)
    if column == "amount":
        print(values.min())
        print(values.max())
        print(np.std(values))
    else:
        # Report the range on the original timestamps, not on the seconds offset.
        print(df["tmsp"].min())
        print(df["tmsp"].max())

# Remove grid cells without a plot. This is derived from the columns plotted
# in THIS cell; the previous version read `df_str` from another cell.
for unused_ax in axes[len(numeric_columns):]:
    fig.delaxes(unused_ax)

fig.tight_layout()

plt.show()


In [None]:
# plot histogram to show impact of PSP

def plot_hist_per_psp(df, col, ax, font_size = 20):
    """Draw grouped bars of the success percentage per ``col`` value and PSP.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``col``, ``PSP`` and ``success``.
    col : str
        Column whose values form the x-axis groups.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    font_size : int, default 20
        Font size for title, axis labels and tick labels.
    """
    # Mean of `success` per (col, PSP) pair, scaled to percent.
    grouped = df.groupby([col, 'PSP'])['success'].mean() * 100
    # One row per `col` value, one column (= one bar series) per PSP.
    pivot_table = grouped.unstack(level=1)

    pivot_table.plot(kind='bar', stacked=False, ax=ax)
    ax.set_title(f'Percentage of Successful Attempts by {col} and PSP', fontsize=font_size)
    ax.set_xlabel(col, fontsize=font_size)
    ax.set_ylabel('Success Percentage', fontsize=font_size)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=font_size)
    ax.legend(title='PSP', loc='upper right')
    # Follow the font_size argument; this was hard-coded to 20, which silently
    # overrode any non-default font_size for the tick labels.
    ax.tick_params(axis='both', labelsize=font_size)

# One figure with a stacked panel per grouping column
columns = ["country", "day_of_week"]
fig, axes = plt.subplots(2, 1, figsize=(18, 12))

# Flatten the axes array so panels can be paired with the columns
axes = axes.flatten()

for panel, col in zip(axes, columns):
    plot_hist_per_psp(df, col, panel)

plt.tight_layout()
plt.show()


In [None]:
# histogram for additional field "previous_attempts"

font_size = 12

# Left axis (green): how often each number of previous attempts occurs.
fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.hist(df['previous_attempts'], edgecolor='black', alpha=0.9, color='green')
ax1.set_xlabel('# previous_attempts', fontsize=font_size)
ax1.set_ylabel('Frequency', fontsize=font_size, color='green')
ax1.tick_params(axis='x', labelsize=font_size)
ax1.tick_params(axis='y', labelsize=font_size, labelcolor='green')

# Mean of `success` for every distinct number of previous attempts.
success_rate = df.groupby('previous_attempts')['success'].mean()

# Right axis (blue): success rate, sharing the x-axis with the histogram.
ax2 = ax1.twinx()
ax2.bar(success_rate.index, success_rate.values, align='center', alpha=0.7, color='blue')
ax2.set_ylabel('Success Rate', fontsize=font_size, color='blue')

# Style the tick labels through tick_params. The previous
# set_yticklabels(get_yticks()) call pinned label texts without pinning the
# tick locations and was overridden by the formatter below anyway.
ax2.tick_params(axis='y', labelsize=font_size, labelcolor='blue')
ax2.yaxis.set_major_formatter(FuncFormatter(lambda value, _: f'{value:.2f}'))

ax2.set_title('Success Rate vs. Previous Attempts')
ax2.grid(axis='y', linestyle='--', alpha=0.3)
plt.show()


In [None]:
# Kernel density of amount for different PSPs

# Silences every FutureWarning from here on (kept as in the original notebook).
warnings.simplefilter(action='ignore', category=FutureWarning)

def plot_kde_all(df, col):
    """Plot one filled kernel density curve of ``col`` per PSP in a new figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``col`` and ``PSP``.
    col : str
        Numeric column whose density is estimated.
    """
    sns.set(style="whitegrid")
    df_filtered = df[[col, 'PSP']]
    plt.figure(figsize=(10, 6))
    for psp_value in df_filtered['PSP'].unique():
        data_subset = df_filtered[df_filtered['PSP'] == psp_value]
        # `fill` replaces the deprecated `shade` argument of seaborn.kdeplot.
        sns.kdeplot(data_subset[col], label=f'PSP {psp_value}', fill=True)
    plt.xlabel(col)
    plt.ylabel("Kernel Density")
    plt.title(f'Kernel Density Estimation of {col} by PSP')
    plt.legend(title='PSP', loc='upper right')
    plt.show()

plot_kde_all(df, 'amount')


In [None]:
# define features and target

target_col = "success"

# numeric inputs
num_cols = ["amount", "previous_attempts"]
# categorical inputs
cat_cols = ["country", "PSP", "3D_secured", "card", "day_of_week"]

# full feature list: numeric columns first, categorical columns after
list_features = [*num_cols, *cat_cols]

In [None]:
# setup model

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

col_trans = ColumnTransformer([
    ('num', MinMaxScaler(), num_cols),
    ('cat', OneHotEncoder(drop='if_binary'), cat_cols)
])

X = df[list_features]
y = df[target_col]

# Train test split - done on the raw features, BEFORE the transformer is
# fitted, so the scaler ranges and encoder categories are learned from the
# training rows only (fitting on the full frame leaks test information).
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=10)

X_train_enc = col_trans.fit_transform(X_train_raw)
# NOTE: raises if the test rows contain a category that is absent from the
# training rows (OneHotEncoder default handle_unknown='error').
X_test = col_trans.transform(X_test_raw)

# See the initial model performance
# The oversampler runs inside the CV pipeline, so it is fitted on the training
# folds only. Oversampling before cross-validation would place copies of the
# same row in both the fitting and the validation fold and inflate the score.
clf = RandomForestClassifier(random_state=10)
baseline = ImbPipeline([
    ('oversample', RandomOverSampler(random_state=42)),
    ('classifier', clf),
])
print("Random parameters")
print('Acc:', cross_val_score(baseline, X_train_enc, y_train_raw, cv=StratifiedKFold(n_splits=5), scoring='accuracy').mean())

# oversampling is important, check difference!!
# Apply oversampling to the training set (used by the following cells)
oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train_enc, y_train_raw)

params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [None]:
# Grid search for optimal parameters
# Takes long!! Set RUN_GRID_SEARCH to True to run it again.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # NOTE(review): X_train has been oversampled in an earlier cell, so copies
    # of the same row can end up in both the fitting and the validation fold;
    # treat best_score_ as optimistic.
    clf = GridSearchCV(RandomForestClassifier(random_state=10), param_grid=params, 
                    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=10), scoring='f1')
    clf.fit(X_train, y_train)

    # Recorded output of an earlier run:
    # {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
    print(clf.best_params_)
    print(clf.best_score_)

In [None]:
# train the random forest classifier on the training set
rf_settings = {
    'n_estimators': 50,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 11,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_train, y_train)


In [None]:
# get the feature importance
# Look the fitted sub-transformers up by the names given in the ColumnTransformer.
scaler_step = col_trans.named_transformers_['num']
encoder_step = col_trans.named_transformers_['cat']

feat_names_num = list(scaler_step.get_feature_names_out())
feat_names_cat = list(encoder_step.get_feature_names_out())
# Same order as the transformer output: numeric columns, then encoded categories.
feature_names = [*feat_names_num, *feat_names_cat]

df_importance = pd.DataFrame({'feature': feature_names, 'importance': clf.feature_importances_})
df_importance.sort_values('importance', ascending=False)

In [None]:
# quantify performance

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, ConfusionMatrixDisplay, recall_score, precision_score
from sklearn.dummy import DummyClassifier

# Create a random classifier that makes random guesses
random_classifier = DummyClassifier(strategy="uniform", random_state=42)  # "uniform" strategy for random guessing

# Fit the random classifier on the training data
random_classifier.fit(X_train, y_train)

# Predictions and probabilities from the trained classifier
# (the probabilities are not used further in this cell)
y_train_pred = clf.predict(X_train)
y_train_proba = clf.predict_proba(X_train)
y_test_pred = clf.predict(X_test)
y_test_proba = clf.predict_proba(X_test)

# Predictions from the random classifier
y_random_train_pred = random_classifier.predict(X_train)
y_random_test_pred = random_classifier.predict(X_test)

# Calculate and compare performance metrics
# NOTE: X_train / y_train are the oversampled training data from the setup cell.
print("Train Acc (Original):", accuracy_score(y_train, y_train_pred))

print("Train Acc (Random Guessing):", accuracy_score(y_train, y_random_train_pred))

# Similar comparisons for test data
print("Test Acc (Original):", accuracy_score(y_test, y_test_pred))

print("Test Acc (Random Guessing):", accuracy_score(y_test, y_random_test_pred))

# Confusion matrices for both classifiers (original and random).
# The heading is attached to each figure as its title: in a notebook the
# printed headings all appear before the figures, which left the four
# matrices unlabeled.
# Train matrices show raw counts, test matrices are normalized over all cells.
confusion_plots = [
    ("Confusion Matrix (Original) - Train", clf, X_train, y_train, None),
    ("Confusion Matrix (Original) - Test", clf, X_test, y_test, "all"),
    ("Confusion Matrix (Random Guessing) - Train", random_classifier, X_train, y_train, None),
    ("Confusion Matrix (Random Guessing) - Test", random_classifier, X_test, y_test, "all"),
]

for title, estimator, features, labels, normalize in confusion_plots:
    disp = ConfusionMatrixDisplay.from_estimator(estimator, features, labels, normalize=normalize)
    disp.ax_.set_title(title)

plt.show()


In [None]:
# performance of current implementation

from sklearn.metrics import confusion_matrix
actual = y_test
# Stand-in for the current implementation: class 1 is predicted for every test row.
predicted = y_test.copy()
predicted.loc[:] = 1
# Normalize over all cells so the values match the '.2f' format below and are
# directly comparable with the normalized test matrices of the model and the
# random baseline (raw counts were rendered as e.g. "1234.00").
cm = confusion_matrix(actual, predicted, normalize="all")
# Create a ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])

print("Test Acc (Current):", accuracy_score(actual, predicted))

# Plot the confusion matrix
disp.plot(cmap=plt.cm.Blues, values_format='.2f')

In [None]:
# save pipeline

from sklearn.pipeline import Pipeline
import joblib

# Bundle the already fitted column transformer and classifier so raw feature
# rows can be passed to a single object for prediction.
pipeline = Pipeline([('transformer', col_trans),('classifier', clf)])

# A forward slash is used so the relative path resolves on Windows and POSIX
# alike (a backslash is only a path separator on Windows).
joblib.dump(pipeline, 'in/pipeline.pkl')