In [1]:
import sys
import os
# Make the project's local code importable from this notebook: resolve the
# sibling `src` and `scripts` directories against the current working
# directory and append each one, in that order, to the module search path.
for relative_dir in ('../src', '../scripts'):
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), relative_dir)))

In [2]:
import pandas as pd

# Read the three input CSV files into DataFrames. The paths are relative to
# the notebook's working directory, so the kernel must be started from the
# folder that sits next to `data/`.
creditcard_df, fraud_data_df, ip_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/creditcard.csv',
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
    )
)


In [3]:

# Initialize processor
from eda.eda import FraudDataProcessor


# Build the shared processor object that every later cell works through.
# NOTE(review): the keyword names end in `_path`, but the values passed here
# are the DataFrames already loaded with pd.read_csv, not file paths. Later
# cells read `processor.fraud_data_df` etc., so the class presumably stores
# these frames as-is — confirm against FraudDataProcessor.__init__ in eda.eda.
processor = FraudDataProcessor(
    creditcard_path=creditcard_df, 
    fraud_data_path=fraud_data_df, 
    ip_data_path=ip_data_df
)



In [None]:
# Report the smallest and largest values in the fraud dataset's `age` column.
age_column = processor.fraud_data_df['age']
min_age, max_age = age_column.min(), age_column.max()

print(f"Minimum Age: {min_age}")
print(f"Maximum Age: {max_age}")


In [None]:
# Collect the distinct `user_id` values from the fraud dataset and print them.
user_id_column = processor.fraud_data_df['user_id']
unique = user_id_column.unique()
print("unique: ", unique)

In [None]:
# Select every row of the fraud dataset whose device_id occurs more than once.
# keep=False marks all members of each repeated group, not just the repeats
# after the first occurrence, so `duplicates` holds whole groups. These rows
# share a device_id only; they are not necessarily identical in other columns.
duplicates = processor.fraud_data_df[
    processor.fraud_data_df.duplicated(subset=['device_id'], keep=False)
]

# Show a sample of the affected rows. The label and the frame are printed with
# separate calls: passing both to one print() puts the separator space right
# after the "\n", which shifts the header line out of alignment with the rows.
print("First 20 rows with a duplicated device_id:")
print(duplicates.head(20))

# Number of rows (not number of distinct devices) that share a device_id.
total_duplicates = len(duplicates)
print("Total number of rows with a duplicated device_id:", total_duplicates)


In [None]:
# Tally how many rows of the fraud dataset carry each device_id.
device_id_counts = processor.fraud_data_df['device_id'].value_counts()

# Keep only the device_ids that appear on more than one row.
duplicated_device_ids = device_id_counts[device_id_counts > 1]

# Print the label and the Series with separate calls: a single
# print(label + "\n", series) inserts the separator space after the newline
# and pushes the first output line one column to the right of the rest.
print("Duplicated device_ids and their counts:")
print(duplicated_device_ids)


In [None]:
# Ask the processor to report missing values. What is displayed is decided by
# FraudDataProcessor.show_missing_values (project code in eda.eda); the return
# value, if any, is not used here.
processor.show_missing_values()


In [9]:

# Handle missing values using the 'drop' strategy (this call never imputes).
# NOTE(review): the return value is discarded, so this presumably modifies the
# processor's DataFrames in place — confirm. If so, the exploratory cells above
# ran on the pre-cleaning data and re-running them afterwards may differ.
processor.handle_missing_values(method='drop')


In [10]:

# Run the processor's cleaning step. The exact operations live in
# FraudDataProcessor.clean_data (eda.eda) and are not visible from this notebook.
# NOTE(review): return value is discarded — presumably cleans in place; confirm.
processor.clean_data()


In [None]:
# Print the dtype / non-null summary for each DataFrame held by the processor,
# in this order: fraud data, credit card data, IP data.
for dataset in (
    processor.fraud_data_df,
    processor.creditcard_df,
    processor.ip_data_df,
):
    dataset.info()


In [None]:

# Perform EDA: run the processor's univariate analysis first, then its
# bivariate analysis. Both are project methods (eda.eda); whatever they plot
# or print appears as this cell's output.
processor.univariate_analysis()
processor.bivariate_analysis()


In [None]:

# Merge datasets for geolocation and keep the result as `merged_data`, which
# the following cells inspect and then pass to feature engineering.
# NOTE(review): presumably joins the fraud data with the IP data to attach a
# country to each record — confirm in FraudDataProcessor.
merged_data = processor.merge_datasets_for_geolocation()


In [None]:
# Preview the first rows of the merged result as a quick sanity check.
merged_data.head()

In [None]:
# List the column labels of the merged result before feature engineering.
print(merged_data.columns)


In [None]:

# Feature engineering: pass the merged data through the processor and keep the
# returned object as `processed_data`. The features created are defined in
# FraudDataProcessor.feature_engineering (eda.eda), not in this notebook.
processed_data = processor.feature_engineering(merged_data)


In [None]:

# Normalize and scale the engineered data; the result is kept under a new name
# (`scaled_data`) so `processed_data` stays bound to the input of this step.
# Which columns are scaled, and how, is decided inside normalize_and_scale.
scaled_data = processor.normalize_and_scale(processed_data)


In [None]:

# Encode categorical features of the scaled data and keep the returned object
# as `final_data`, the last stage of this notebook's pipeline. The encoding
# scheme is defined in FraudDataProcessor.encode_categorical_features.
final_data = processor.encode_categorical_features(scaled_data)

