In [3]:
# Read the processed PSP modelling workbook into a DataFrame for a first inspection
import pandas as pd

# Path to the Excel file, given relative to the notebook's working directory
file_path = '../data/processed/PSP_model_data.xlsx'
data_df = pd.read_excel(io=file_path)

# Preview the top five rows to see which columns and value types are present
data_df.head(5)


Unnamed: 0,index,tmsp,country,amount,success,PSP,3D_secured,card,attempts,fee,...,time_of_day,month,quarter,fee_to_amount_ratio,log_amount,country_PSP,PSP_3D_secured,previous_successes,previous_failures,transaction_duration
0,0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa,1,1.0,...,Night,1,1,0.011236,4.49981,Germany_UK_Card,UK_Card_0,0,0,0
1,1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa,2,3.0,...,Night,1,1,0.033708,4.49981,Germany_UK_Card,UK_Card_0,1,0,0
2,2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners,1,1.0,...,Night,1,1,0.004202,5.476464,Germany_UK_Card,UK_Card_1,0,0,0
3,3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners,2,3.0,...,Night,1,1,0.012605,5.476464,Germany_UK_Card,UK_Card_1,1,0,0
4,4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners,1,0.5,...,Night,1,1,0.004032,4.828314,Austria_Simplecard,Simplecard_0,0,0,0


In [4]:
# Columns that are excluded from the modelling feature set
columns_to_drop = ['index', 'tmsp', 'hour', 'attempts']

# data_df is reassigned to its own result, so a second run of this cell would look
# for labels that are already gone. errors='ignore' skips missing labels, which keeps
# the cell re-runnable; on a first run the result is identical to a plain drop.
data_df = data_df.drop(columns=columns_to_drop, errors='ignore')

# Show the first few rows of the dataset after removing the unwanted columns
data_df.head()


Unnamed: 0,country,amount,success,PSP,3D_secured,card,fee,weekday,time_of_day,month,quarter,fee_to_amount_ratio,log_amount,country_PSP,PSP_3D_secured,previous_successes,previous_failures,transaction_duration
0,Germany,89,0,UK_Card,0,Visa,1.0,Tuesday,Night,1,1,0.011236,4.49981,Germany_UK_Card,UK_Card_0,0,0,0
1,Germany,89,1,UK_Card,0,Visa,3.0,Tuesday,Night,1,1,0.033708,4.49981,Germany_UK_Card,UK_Card_0,1,0,0
2,Germany,238,0,UK_Card,1,Diners,1.0,Tuesday,Night,1,1,0.004202,5.476464,Germany_UK_Card,UK_Card_1,0,0,0
3,Germany,238,1,UK_Card,1,Diners,3.0,Tuesday,Night,1,1,0.012605,5.476464,Germany_UK_Card,UK_Card_1,1,0,0
4,Austria,124,0,Simplecard,0,Diners,0.5,Tuesday,Night,1,1,0.004032,4.828314,Austria_Simplecard,Simplecard_0,0,0,0


In [5]:
# Expand each categorical column into one indicator (dummy) column per category;
# all other columns are passed through unchanged
categorical_columns = [
    'country', 'PSP', 'card', 'weekday', 'time_of_day',
    'country_PSP', 'PSP_3D_secured',
]
data_df_encoded = pd.get_dummies(data=data_df, columns=categorical_columns)

# Preview the top five rows of the encoded frame
data_df_encoded.head(5)


Unnamed: 0,amount,success,3D_secured,fee,month,quarter,fee_to_amount_ratio,log_amount,previous_successes,previous_failures,...,country_PSP_Switzerland_Simplecard,country_PSP_Switzerland_UK_Card,PSP_3D_secured_Goldcard_0,PSP_3D_secured_Goldcard_1,PSP_3D_secured_Moneycard_0,PSP_3D_secured_Moneycard_1,PSP_3D_secured_Simplecard_0,PSP_3D_secured_Simplecard_1,PSP_3D_secured_UK_Card_0,PSP_3D_secured_UK_Card_1
0,89,0,0,1.0,1,1,0.011236,4.49981,0,0,...,0,0,0,0,0,0,0,0,1,0
1,89,1,0,3.0,1,1,0.033708,4.49981,1,0,...,0,0,0,0,0,0,0,0,1,0
2,238,0,1,1.0,1,1,0.004202,5.476464,0,0,...,0,0,0,0,0,0,0,0,0,1
3,238,1,1,3.0,1,1,0.012605,5.476464,1,0,...,0,0,0,0,0,0,0,0,0,1
4,124,0,0,0.5,1,1,0.004032,4.828314,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the predictors (X) from the target column 'success' (y).
# NOTE(review): in the previewed rows `fee` differs between success=0 and success=1
# for the same PSP (1.0 vs 3.0 for UK_Card), so `fee` and `fee_to_amount_ratio` may
# encode the outcome being predicted — confirm whether they should stay as features.
X = data_df_encoded.drop(columns='success')
y = data_df_encoded['success']

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# 'success' class the same in both partitions; random_state fixes the shuffle so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only and reuse its statistics for the
# test partition, so the test rows do not influence the preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the shapes of the scaled feature matrices and the target vectors
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape


((40328, 51), (10082, 51), (40328,), (10082,))

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Seed TensorFlow's global random generator so weight initialisation starts from a
# fixed seed, matching the fixed random_state used for the train/test split
tf.random.set_seed(42)

# Take the input width from the scaled training matrix rather than hard-coding it,
# so the network still builds if the encoded feature set changes
n_features = X_train_scaled.shape[1]

# Initialize the neural network model
model = Sequential()

# Explicit input specification: one value per feature
model.add(Input(shape=(n_features,)))

# First hidden layer with as many units as there are features, ReLU activation
model.add(Dense(n_features, activation='relu'))

# Second hidden layer with 32 units and a ReLU activation function
model.add(Dense(32, activation='relu'))

# Output layer with 1 unit and a sigmoid activation (binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss, the Adam optimizer and accuracy tracking
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs in batches of 32, holding out 20% of the training rows for
# per-epoch validation; the returned History object keeps the per-epoch metrics
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
