# PMU Fraud detection - Model creation

The aim is to be able to detect fraudulent online transactions. For this purpose, we have a partitioned file containing information about transactions (location, IP, validity, ...).

Last update : 2023-09-14

In [1]:
!pip install -r requirements.txt -q

In [2]:
import s3fs
import joblib
import json
import warnings
import pandas as pd
from imblearn.over_sampling import ADASYN

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Display configuration: raise pandas' row/column/width limits so that wide
# or long frames are printed without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

First, we start by reading the file containing the information about the transactions.

In [4]:
# Connection settings for the S3-compatible bucket holding the input data.
# The key/secret values are placeholders that must be substituted before running.
s3_storage_options = {
    "key": "REPLACE_BY_BUCKET_KEY",
    "secret": "REPLACE_BY_BUCKET_SECRET",
    "client_kwargs": {
        "endpoint_url": "https://s3.fr-par.scw.cloud",
        "region_name": "fr-par",
    },
}

# Load the transactions file into a DataFrame.
df_fraud = pd.read_parquet(
    "s3a://graal-demo-data-integration/output/fraud.parquet",
    storage_options=s3_storage_options,
)

# Quick overview: dimensions, dtypes / non-null counts, summary of the
# string columns, and a preview of the first rows.
print(f"Dim: {df_fraud.shape}\n\nGeneric information:")
df_fraud.info()
print("\n\nObject columns statistics:")
print(df_fraud.describe(include=["object"]))
print("\n\n5 first lines")
df_fraud.head(5)

Dim: (1215, 11)

Generic information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CN                    1215 non-null   Int64 
 1   date                  1215 non-null   object
 2   amount                1215 non-null   Int64 
 3   ip                    1215 non-null   object
 4   ip_range              1215 non-null   object
 5   is_valid              1215 non-null   bool  
 6   provider              1215 non-null   object
 7   country_name          1215 non-null   object
 8   continent_code        564 non-null    object
 9   continent_name        1215 non-null   object
 10  is_in_european_union  1215 non-null   Int64 
dtypes: Int64(3), bool(1), object(7)
memory usage: 99.8+ KB


Object columns statistics:
              date             ip    ip_range  provider   country_name continent_code continent_name
count         1

Unnamed: 0,CN,date,amount,ip,ip_range,is_valid,provider,country_name,continent_code,continent_name,is_in_european_union
0,30001034549321,2022-02-01,13595,6.200.204.63,6.0.0.0/8,True,ExpressVPN,United States,,North America,0
1,30001034549321,2022-02-01,13595,6.200.204.63,6.0.0.0/8,True,KeepSolid VPN Unlimited,United States,,North America,0
2,30001034549321,2022-02-01,13595,6.200.204.63,6.0.0.0/8,True,PureVPN,United States,,North America,0
3,30001034549321,2022-02-01,13595,6.200.204.63,6.0.0.0/8,True,SaferVPN,United States,,North America,0
4,30001034549321,2022-02-01,13595,6.200.204.63,6.0.0.0/8,True,TunnelBear,United States,,North America,0


This file describes each transaction along several dimensions: internet information (IP, IP range, provider), banking information (card number, amount) and geographic information (country name, continent code, continent name). Most of the columns are strings, so data pre-processing is necessary.

## Data pre-processing

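The continent code does not give more information than the continent name, so we will drop this column. We also drop the card number (`CN`), date, IP and IP range columns, and then remove any rows that still contain missing values.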

In [5]:
# Drop the columns that are not used by the model, then discard rows with
# missing values.
# `errors="ignore"` makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
# Chaining `dropna()` replaces the in-place mutation and yields the same frame.
df_fraud = df_fraud.drop(
    columns=["CN", "date", "ip", "ip_range", "continent_code"],
    errors="ignore",
).dropna()
df_fraud.head()

Unnamed: 0,amount,is_valid,provider,country_name,continent_name,is_in_european_union
0,13595,True,ExpressVPN,United States,North America,0
1,13595,True,KeepSolid VPN Unlimited,United States,North America,0
2,13595,True,PureVPN,United States,North America,0
3,13595,True,SaferVPN,United States,North America,0
4,13595,True,TunnelBear,United States,North America,0


## Model

In [6]:
# Model input columns, grouped by the kind of preprocessing they receive.
categorical_features = ["provider", "country_name", "continent_name", "is_in_european_union"]
numeric_features = ["amount"]

# Feature matrix (categorical columns first, then numeric) and target column.
X = df_fraud[[*categorical_features, *numeric_features]]
y = df_fraud["is_valid"]

# Minority-class oversampling with ADASYN is currently disabled; the snippet
# is kept for reference only.
# adasyn = ADASYN(sampling_strategy='minority', random_state=42)
# X_resampled, y_resampled = adasyn.fit_resample(X, y)
# resampled_fraud = pd.DataFrame(X_resampled, columns=X.columns)
# print(resampled_fraud.info())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Per-column-type preprocessing steps:
#   - categorical columns are one-hot encoded (handle_unknown="ignore" so that
#     categories not seen during fit do not raise at prediction time);
#   - numeric columns are standardised.
categorical_transformer = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full pipeline: preprocessing followed by a random forest classifier.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=42),
        ),
    ]
)

In [8]:
# Fit the pipeline on the training split, then predict the labels of the
# held-out test split (fit returns the fitted pipeline, so the calls chain).
y_pred = model.fit(X_train, y_train).predict(X_test)

In [9]:
# Evaluate the model on the held-out test set.
# accuracy_score is the fraction of correctly classified samples, so the value
# is labelled "accuracy"; precision is a separate per-class metric that appears
# in the classification report below.
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)
print("\nClassification report:\n", classification_report(y_test, y_pred))

Model precision: 0.8888888888888888

Classification report:
               precision    recall  f1-score   support

       False       0.00      0.00      0.00        27
        True       0.89      1.00      0.94       216

    accuracy                           0.89       243
   macro avg       0.44      0.50      0.47       243
weighted avg       0.79      0.89      0.84       243



In [10]:
# Save the fitted pipeline as a joblib file in the output bucket.
folder_name = "fraud-model"

# Client for the S3-compatible object store. The key/secret values are
# placeholders that must be substituted before running.
fs_d69b7172 = s3fs.S3FileSystem(
    key="REPLACE_BY_BUCKET_KEY",
    secret="REPLACE_BY_BUCKET_SECRET",
    client_kwargs={
        "endpoint_url": "https://s3.fr-par.scw.cloud",
        "region_name": "fr-par",
    },
)

# Open the remote file in a context manager so it is closed deterministically:
# buffered data is flushed on close, so the upload is complete before the
# confirmation message is printed instead of depending on garbage collection.
with fs_d69b7172.open(
    f"s3a://graal-demo-data-integration/output/{folder_name}/model.joblib", "wb"
) as model_file:
    joblib.dump(model, model_file)
print("Model uploaded!")

Model uploaded!


In [11]:
# Model settings written next to the model file: a version parameter plus the
# name, datatype and shape of every input and output of the model.
settings = {
    "parameters": {
        "version": "v0.1.0"
    },
    "inputs": [
        {
            "name": "amount",
            "datatype": "FP32",
            "shape": [1],
        },
        {
            "name": "provider",
            "datatype": "BYTES",
            "shape": [1],
            "parameters": {
                "content_type": "str"
            }
        },
        {
            "name": "country_name",
            "datatype": "BYTES",
            "shape": [1],
            "parameters": {
                "content_type": "str"
            }
        },
        {
            "name": "continent_name",
            "datatype": "BYTES",
            "shape": [1],
            "parameters": {
                "content_type": "str"
            }
        },
        {
            "name": "is_in_european_union",
            "datatype": "INT32",
            "shape": [1]
        }
    ],
    "outputs": [
        {
            "name": "is_valid",
            "datatype": "BOOL",
            "shape": [1]
        }
    ]
}

# Write the settings as JSON to the same bucket folder as the model.
# The context manager closes the remote file deterministically so the buffered
# content is flushed before the confirmation message is printed.
with fs_d69b7172.open(
    f"s3a://graal-demo-data-integration/output/{folder_name}/model-settings.json", "w"
) as settings_file:
    json.dump(settings, settings_file)
print("Model settings uploaded!")

Model settings uploaded!
