## Import Dependencies

In [2]:
import pandas as pd
import numpy as np

## Load Datasets

In [3]:
# Load the three raw CSV inputs (fraud transactions, IP-range-to-country lookup,
# credit-card transactions) from ../data and print a structural summary of each.
print("Loading datasets...")
try:
    fraud_data = pd.read_csv('../data/Fraud_Data.csv')
    ip_to_country = pd.read_csv('../data/IpAddress_to_Country.csv')
    creditcard_data = pd.read_csv('../data/creditcard.csv')
    print("Datasets loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure the CSV files are in the data directory")
    # Re-raise rather than calling exit(): inside a notebook, exit() requests a
    # kernel shutdown (discarding all state) instead of halting at this cell with
    # a traceback. Re-raising stops execution here and keeps the kernel alive.
    raise

# .info() writes its report directly to stdout, so it is called bare rather than
# wrapped in print().
print("\n--- Initial Data Info ---")
print("\nFraud_Data.csv Info:")
fraud_data.info()
print("\nIpAddress_to_Country.csv Info:")
ip_to_country.info()
print("\ncreditcard.csv Info:")
creditcard_data.info()

Loading datasets...
Datasets loaded successfully.

--- Initial Data Info ---

Fraud_Data.csv Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB

IpAddress_to_Country.csv Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138846 entries, 0 to 138845
Data columns (total

## Handle Missing Values

In [None]:
# Report and handle missing values in each loaded dataset.
print("\n--- Handling Missing Values ---")

print("\nProcessing Fraud_Data.csv for missing values...")
print("Missing values before handling:\n", fraud_data.isnull().sum())


# Rows lacking ip_address or device_id are dropped rather than imputed.
initial_rows_fraud = fraud_data.shape[0]
fraud_data.dropna(subset=['ip_address', 'device_id'], inplace=True)
print(f"Dropped {initial_rows_fraud - fraud_data.shape[0]} rows from Fraud_Data due to missing ip_address or device_id.")

# Impute by assigning the filled column back to the frame. Calling
# fillna(..., inplace=True) on a column selection is a chained in-place
# operation: pandas 2.x warns about it, and under Copy-on-Write it does not
# update the parent DataFrame at all.
if fraud_data['sex'].isnull().any():
    mode_sex = fraud_data['sex'].mode()[0]
    fraud_data['sex'] = fraud_data['sex'].fillna(mode_sex)
    print(f"Imputed missing 'sex' values with mode: {mode_sex}")

if fraud_data['age'].isnull().any():
    median_age = fraud_data['age'].median()
    fraud_data['age'] = fraud_data['age'].fillna(median_age)
    print(f"Imputed missing 'age' values with median: {median_age}")

print("Missing values after handling for Fraud_Data.csv:\n", fraud_data.isnull().sum())


# NOTE: no handling step is applied to the two datasets below; the "before" and
# "after" reports print the same counts and serve only as a null-count check.
print("\nProcessing IpAddress_to_Country.csv for missing values...")
print("Missing values before handling:\n", ip_to_country.isnull().sum())
print("Missing values after handling for IpAddress_to_Country.csv:\n", ip_to_country.isnull().sum())


print("\nProcessing creditcard.csv for missing values...")
print("Missing values before handling:\n", creditcard_data.isnull().sum())
print("Missing values after handling for creditcard.csv:\n", creditcard_data.isnull().sum())


--- Handling Missing Values ---

Processing Fraud_Data.csv for missing values...
Missing values before handling:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
Dropped 0 rows from Fraud_Data due to missing ip_address or device_id.
Missing values after handling for Fraud_Data.csv:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

Processing IpAddress_to_Country.csv for missing values...
Missing values before handling:
 lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64
Missing values after handling for IpAddress_to_Country.csv:
 lower_bound_ip_address    0
upper_bound_ip_addr

## Data Cleaning

### Remove Duplicates

In [5]:
print("\n--- Removing Duplicate Rows ---")

# Fraud_Data.csv -- record the row count, drop exact duplicate rows (keeping the
# first occurrence), then report how many rows were removed.
initial_rows_fraud = len(fraud_data)
fraud_data.drop_duplicates(keep="first", inplace=True)
print(f"Removed {initial_rows_fraud - fraud_data.shape[0]} duplicate rows from Fraud_Data.csv.")

# IpAddress_to_Country.csv -- same procedure.
initial_rows_ip = len(ip_to_country)
ip_to_country.drop_duplicates(keep="first", inplace=True)
print(f"Removed {initial_rows_ip - ip_to_country.shape[0]} duplicate rows from IpAddress_to_Country.csv.")

# creditcard.csv -- same procedure.
initial_rows_credit = len(creditcard_data)
creditcard_data.drop_duplicates(keep="first", inplace=True)
print(f"Removed {initial_rows_credit - creditcard_data.shape[0]} duplicate rows from creditcard.csv.")


--- Removing Duplicate Rows ---
Removed 0 duplicate rows from Fraud_Data.csv.
Removed 0 duplicate rows from IpAddress_to_Country.csv.
Removed 1081 duplicate rows from creditcard.csv.


## Correct Datatypes

In [None]:
# Convert timestamp and IP columns to proper dtypes, then print a summary of
# every dataset after preprocessing.
print("\n--- Correcting Data Types ---")

# Fraud_Data.csv
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])
print("Converted 'signup_time' and 'purchase_time' in Fraud_Data.csv to datetime.")

# Cast to an explicit 64-bit integer. Plain `int` resolves to a platform-dependent
# width (32-bit on Windows with NumPy < 2), which cannot hold numeric IPv4 values
# above 2**31 - 1. The original float column is kept; the integer form is stored
# in a new 'ip_address_int' column.
fraud_data['ip_address_int'] = fraud_data['ip_address'].round().astype("int64")
print("Converted 'ip_address' in Fraud_Data.csv to integer format ('ip_address_int').")

# IpAddress_to_Country.csv -- same explicit width so the range bounds are
# comparable with 'ip_address_int' on every platform.
ip_to_country['lower_bound_ip_address'] = ip_to_country['lower_bound_ip_address'].astype("int64")
ip_to_country['upper_bound_ip_address'] = ip_to_country['upper_bound_ip_address'].astype("int64")
print("Converted 'lower_bound_ip_address' and 'upper_bound_ip_address' in IpAddress_to_Country.csv to integer.")

# .info() writes directly to stdout, so it is called bare rather than printed.
print("\n--- Final Data Info After Preprocessing Steps ---")
print("\nFraud_Data.csv Info:")
fraud_data.info()
print("\nIpAddress_to_Country.csv Info:")
ip_to_country.info()
print("\ncreditcard.csv Info:")
creditcard_data.info()

# Preview only the first rows of each frame to keep the cell output small.
print("\nFirst 5 rows of processed Fraud_Data.csv:")
print(fraud_data.head())
print("\nFirst 5 rows of processed IpAddress_to_Country.csv:")
print(ip_to_country.head())
print("\nFirst 5 rows of processed creditcard.csv:")
print(creditcard_data.head())


--- Correcting Data Types ---
Converted 'signup_time' and 'purchase_time' in Fraud_Data.csv to datetime.
Converted 'ip_address' in Fraud_Data.csv to integer format ('ip_address_int').
Converted 'lower_bound_ip_address' and 'upper_bound_ip_address' in IpAddress_to_Country.csv to integer.
Data types for creditcard.csv are already appropriate.

--- Final Data Info After Preprocessing Steps ---

Fraud_Data.csv Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object    

## Save the Datasets

In [11]:
print("\n--- Saving Cleaned Datasets ---")
# Write each processed frame to its destination CSV, in the same order as
# before; index=False omits the DataFrame index from the file.
for cleaned_frame, output_path in (
    (fraud_data, '../data/Fraud_Data_cleaned.csv'),
    (ip_to_country, '../data/IpAddress_to_Country_cleaned.csv'),
    (creditcard_data, '../data/creditcard_cleaned.csv'),
):
    cleaned_frame.to_csv(output_path, index=False)
print("Cleaned datasets saved to '../data/' directory.")


--- Saving Cleaned Datasets ---
Cleaned datasets saved to '../data/' directory.
