In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#  Load datasets

In [2]:
# Load datasets
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_country_data = pd.read_csv('../data/IpAddress_to_Country.csv')
creditcard_data = pd.read_csv('../data/creditcard.csv')

#  Handle Missing Values

In [3]:
# 1. Handle Missing Values
# Impute missing values for numerical features
imputer = SimpleImputer(strategy='median')
fraud_data[['age', 'purchase_value']] = imputer.fit_transform(fraud_data[['age', 'purchase_value']])

In [4]:
# Drop rows with missing values in categorical features
fraud_data.dropna(subset=['device_id', 'source', 'browser', 'sex'], inplace=True)

# Data Cleaning

In [5]:
# 2. Data Cleaning
# Remove duplicates
fraud_data.drop_duplicates(inplace=True)

In [7]:
# Correct data types
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

#  Exploratory Data Analysis (EDA)

In [8]:
# 3. Exploratory Data Analysis (EDA)
# Univariate analysis
fraud_data.describe()


Unnamed: 0,user_id,purchase_value,age,ip_address,class
count,151112.0,151112.0,151112.0,151112.0,151112.0
mean,200171.04097,36.935372,33.140704,2152145000.0,0.093646
std,115369.285024,18.322762,8.617733,1248497000.0,0.291336
min,2.0,9.0,18.0,52093.5,0.0
25%,100642.5,22.0,27.0,1085934000.0,0.0
50%,199958.0,35.0,33.0,2154770000.0,0.0
75%,300054.0,49.0,39.0,3243258000.0,0.0
max,400000.0,154.0,76.0,4294850000.0,1.0


In [9]:
# Bivariate analysis (example: purchase_value vs class)
fraud_data.groupby('class')['purchase_value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,136961.0,36.929418,18.315064,9.0,22.0,35.0,49.0,154.0
1,14151.0,36.993004,18.397654,9.0,22.0,35.0,49.0,111.0


In [13]:
# Function to convert IP address to integer
def ip_to_int(ip):
    try:
        parts = ip.split('.')
        return int(parts[0]) * 16777216 + int(parts[1]) * 65536 + int(parts[2]) * 256 + int(parts[3])
    except (AttributeError, IndexError, ValueError):
        return None

In [14]:
# Convert IP addresses to integer format
fraud_data['ip_address'] = fraud_data['ip_address'].apply(lambda x: ip_to_int(str(x)) if pd.notna(x) else x)
ip_country_data['lower_bound_ip_address'] = ip_country_data['lower_bound_ip_address'].apply(lambda x: ip_to_int(str(x)) if pd.notna(x) else x)
ip_country_data['upper_bound_ip_address'] = ip_country_data['upper_bound_ip_address'].apply(lambda x: ip_to_int(str(x)) if pd.notna(x) else x)

#  Exploratory Data Analysis (EDA)

In [15]:
# Merge fraud data with IP to country data
def map_ip_to_country(ip):
    if pd.isna(ip):
        return None
    matching_rows = ip_country_data[(ip_country_data['lower_bound_ip_address'] <= ip) & (ip_country_data['upper_bound_ip_address'] >= ip)]
    if not matching_rows.empty:
        return matching_rows.iloc[0]['country']
    return None

In [16]:
fraud_data['country'] = fraud_data['ip_address'].apply(map_ip_to_country)

# Feature Engineering

In [17]:
# Feature Engineering
# Transaction frequency and velocity
fraud_data['transaction_count'] = fraud_data.groupby('user_id')['user_id'].transform('count')
fraud_data['transaction_velocity'] = fraud_data.groupby('user_id')['purchase_time'].diff().dt.total_seconds()

In [18]:
# Time-Based features
fraud_data['hour_of_day'] = fraud_data['purchase_time'].dt.hour
fraud_data['day_of_week'] = fraud_data['purchase_time'].dt.dayofweek

In [19]:
# Normalization and Scaling
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['purchase_value', 'age', 'transaction_count', 'transaction_velocity', 'hour_of_day']
fraud_data[numerical_features] = scaler.fit_transform(fraud_data[numerical_features])


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [21]:
# Frequency Encoding for categorical features
categorical_features = ['device_id', 'source', 'browser', 'sex', 'country']
for feature in categorical_features:
    freq_encoding = fraud_data[feature].value_counts().to_dict()
    fraud_data[feature + '_freq'] = fraud_data[feature].map(freq_encoding)

In [22]:

# Drop original categorical columns
fraud_data.drop(columns=categorical_features, inplace=True)

In [23]:
# Normalization and Scaling
# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['purchase_value', 'age', 'transaction_count', 'transaction_velocity', 'hour_of_day', 'device_id_freq', 'source_freq', 'browser_freq', 'sex_freq', 'country_freq']
fraud_data[numerical_features] = scaler.fit_transform(fraud_data[numerical_features])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [25]:
# Save preprocessed data
fraud_data.to_csv('../data/preprocessed_fraud_data.csv', index=False)