## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
import shap

# Apply the 'seaborn-v0_8' style sheet to every matplotlib figure drawn below.
# NOTE(review): this style name presumably depends on the installed matplotlib
# version -- confirm against the environment the notebook is run in.
plt.style.use('seaborn-v0_8')

  from .autonotebook import tqdm as notebook_tqdm


## Load Datasets

In [2]:
# Read the three raw CSV files, in order: e-commerce transactions, the
# IP-range -> country lookup table, and the credit-card transactions.
fraud_data, ip_country, creditcard_data = map(
    pd.read_csv,
    (
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
        '../data/creditcard.csv',
    ),
)

## Display basic info for each dataset

In [3]:
def inspect_dataset(df, name):
    """Print a quick structural summary of a DataFrame.

    The report contains, in order: a banner with ``name``, the frame's shape,
    its first three rows, the column dtypes, the per-column null counts and
    the number of fully duplicated rows.

    Args:
        df: The pandas DataFrame to summarise.
        name: Label shown in the banner (e.g. the source file name).

    Returns:
        None. Everything is written to standard output.
    """
    rule = '=' * 40
    print(f"\n{rule}")
    print(f"Inspecting: {name}")
    print(rule)
    print("Shape:", df.shape)

    # Each section is a heading followed by a lazily computed summary, so the
    # summaries are evaluated in the same order in which they are printed.
    sections = (
        ("\nFirst 3 rows:", lambda: df.head(3)),
        ("\nData types:", lambda: df.dtypes),
        ("\nMissing values:", lambda: df.isnull().sum()),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

    print("\nNumber of duplicates:", df.duplicated().sum())

## Run inspection

In [4]:
# Print shape, first rows, dtypes, null counts and duplicate count for the fraud data.
inspect_dataset(fraud_data, "Fraud_Data.csv")


Inspecting: Fraud_Data.csv
Shape: (151112, 11)

First 3 rows:
   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  

Data types:
user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

Missing values:
user_id           0
signup_time       0
purchase_time     0
pu

In [5]:
# Print shape, first rows, dtypes, null counts and duplicate count for the IP-range table.
inspect_dataset(ip_country, "IpAddress_to_Country.csv")


Inspecting: IpAddress_to_Country.csv
Shape: (138846, 3)

First 3 rows:
   lower_bound_ip_address  upper_bound_ip_address    country
0              16777216.0                16777471  Australia
1              16777472.0                16777727      China
2              16777728.0                16778239      China

Data types:
lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object

Missing values:
lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64

Number of duplicates: 0


In [6]:
# Print shape, first rows, dtypes, null counts and duplicate count for the credit-card data.
inspect_dataset(creditcard_data, "creditcard.csv")


Inspecting: creditcard.csv
Shape: (284807, 31)

First 3 rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   

        V26       V27       V28  Amount  Class  
0 -0.189115  0.133558 -0.021053  149.62      0  
1  0.125895 -0.008983  0.014724    2.69      0  
2 -0.139097 -0.055353 -0.059752  378.66      0  

[3 rows x 31 columns]

Data types:
Time      float64
V1        float64
V2        float64
V3   

## Clean and Preprocess Data

### Handle Missing Values

#### Check for missing values

In [7]:
# Per-column count of null entries in the fraud data, printed under a heading.
print("\nMissing Values in Fraud Data:", fraud_data.isna().sum(), sep="\n")


Missing Values in Fraud Data:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


In [8]:
# Per-column count of null entries in the IP-range table, printed under a heading.
print("\nMissing Values in ip_country Data:", ip_country.isna().sum(), sep="\n")


Missing Values in ip_country Data:
lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64


In [9]:

# Per-column count of null entries in the credit-card data, printed under a heading.
print("\nMissing Values in creditcard Data:", creditcard_data.isna().sum(), sep="\n")


Missing Values in creditcard Data:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


#### Note: No missing values were found in any of the three datasets, so no imputation or row dropping is needed.

### Convert Timestamps into Datetime

In [10]:
# Parse both timestamp columns (read from the CSV as strings) into datetimes,
# signup first and purchase second.
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])

### Create Time-Based Features

In [11]:
# Derive three features from the parsed timestamps: hour of purchase,
# day of week of purchase, and the signup -> purchase gap in hours.
purchase_ts = fraud_data['purchase_time']
signup_to_purchase = purchase_ts - fraud_data['signup_time']

fraud_data['purchase_hour'] = purchase_ts.dt.hour
fraud_data['purchase_day_of_week'] = purchase_ts.dt.dayofweek
# total_seconds() is in seconds; dividing by 3600 expresses the gap in hours.
fraud_data['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600


### IP Address GeoLocation Mapping

#### Convert IP addresses into numeric format for matching

In [12]:
def ip_to_numeric(ip):
    """Convert a dotted-quad string such as ``'192.168.1.1'`` to an integer.

    The first four dot-separated fields are read as base-256 digits, most
    significant first. Any fields after the fourth are ignored.

    Args:
        ip: Dotted-quad text; each of the first four fields must parse as an int.

    Returns:
        The integer ``a * 256**3 + b * 256**2 + c * 256 + d``.

    Raises:
        IndexError: If ``ip`` has fewer than four dot-separated fields.
        ValueError: If one of the first four fields is not an integer.
    """
    octets = ip.split('.')
    # Shifting left by 24/16/8/0 bits is the same as multiplying by 256**3..256**0.
    return sum(int(octets[pos]) << (8 * (3 - pos)) for pos in range(4))

#### Apply to fraud data

In [14]:
def int_to_ip(ip_num):
    """Format a number as a dotted-quad IPv4 string.

    ``ip_num`` is first truncated with ``int()``; only its lowest 32 bits are
    used, emitted as four decimal octets from most to least significant.

    Args:
        ip_num: Any value accepted by ``int()`` (e.g. an int or a float).

    Returns:
        A string of the form ``'a.b.c.d'``.
    """
    value = int(ip_num)
    octets = []
    for shift in (24, 16, 8, 0):
        # Isolate one byte at a time, highest byte first.
        octets.append(str((value >> shift) & 0xFF))
    return '.'.join(octets)

# Integer form of each IP address, computed in one vectorised step.
# The previous per-row round trip (number -> dotted-quad string -> number)
# amounts to truncating the value to an integer and keeping its low 32 bits,
# which is exactly what the cast and mask below do -- without building and
# re-parsing a string for every row.
fraud_data['ip_numeric'] = fraud_data['ip_address'].astype('int64') & 0xFFFFFFFF

#### Convert IP range bounds to integers

In [16]:
# Store both range bounds as explicit 64-bit integers.
# A bare `int` dtype follows the platform's default integer, which can be
# 32-bit (Windows with NumPy < 2); IPv4 values above 2**31 - 1 would not fit
# there, so the width is pinned to int64 to keep the bounds correct everywhere.
ip_country['lower_bound'] = ip_country['lower_bound_ip_address'].astype('int64')
ip_country['upper_bound'] = ip_country['upper_bound_ip_address'].astype('int64')

#### Function to find country for IP

In [17]:
def get_country(ip_num):
    """Return the country whose IP range contains ``ip_num``.

    Scans the module-level ``ip_country`` table for rows where
    ``lower_bound <= ip_num <= upper_bound`` and returns the ``country`` of
    the first such row.

    Args:
        ip_num: Numeric IP address to look up.

    Returns:
        The matching country name, or ``'Unknown'`` when no range contains
        ``ip_num``.
    """
    in_range = ip_country['lower_bound'].le(ip_num) & ip_country['upper_bound'].ge(ip_num)
    hits = ip_country.loc[in_range, 'country']
    if hits.empty:
        return 'Unknown'
    return hits.iloc[0]

#### Map IP addresses to countries

In [18]:
# Vectorised IP -> country lookup.
# Calling get_country once per row rescans the entire range table for every
# transaction (rows x ranges comparisons). One binary search over the sorted
# lower bounds yields the same answer in a fraction of the time.
ranges_sorted = ip_country.sort_values('lower_bound', kind='stable')
range_lower = ranges_sorted['lower_bound'].to_numpy()
range_upper = ranges_sorted['upper_bound'].to_numpy()
range_country = ranges_sorted['country'].to_numpy()
ip_values = fraud_data['ip_numeric'].to_numpy()

# The shortcut only agrees with get_country's first-match rule when every
# range starts strictly after the previous one ends, so verify that first.
ranges_disjoint = len(range_lower) > 0 and bool((range_lower[1:] > range_upper[:-1]).all())

if ranges_disjoint:
    # Position of the last range whose lower bound is <= the IP (-1 if none).
    candidate = np.searchsorted(range_lower, ip_values, side='right') - 1
    # Clip so that the -1 "no candidate" slots can still be indexed safely;
    # those slots are masked out by `matched` below.
    safe_candidate = np.clip(candidate, 0, None)
    matched = (candidate >= 0) & (ip_values <= range_upper[safe_candidate])
    fraud_data['country'] = np.where(matched, range_country[safe_candidate], 'Unknown')
else:
    # Overlapping or empty range table: fall back to the row-by-row scan so
    # the result is unchanged in every case.
    fraud_data['country'] = fraud_data['ip_numeric'].apply(get_country)

#### Save Preprocessed Data

In [19]:
# Write each frame to its CSV target without the index column, then confirm.
outputs = (
    (fraud_data, '../data/preprocessed_fraud_data.csv'),
    (creditcard_data, '../data/preprocessed_creditcard_data.csv'),
)
for frame, target in outputs:
    frame.to_csv(target, index=False)

print("Data preprocessing completed and saved to 'data' directory")

Data preprocessing completed and saved to 'data' directory
