## Import Dependencies

In [1]:
import pandas as pd
import numpy as np

## Load Datasets

In [2]:

try:
    fraud_data = pd.read_csv('../data/Fraud_Data_cleaned.csv')
    ip_to_country = pd.read_csv('../data/IpAddress_to_Country_cleaned.csv')
    creditcard_data = pd.read_csv('../data/creditcard_cleaned.csv')

    # Re-apply the necessary preprocessing steps to ensure consistency for EDA
    fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
    fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])
    
    # Convert ip_address to integer as per previous successful step
    fraud_data['ip_address_int'] = fraud_data['ip_address'].round().astype(int)

    ip_to_country['lower_bound_ip_address'] = ip_to_country['lower_bound_ip_address'].astype(int)
    ip_to_country['upper_bound_ip_address'] = ip_to_country['upper_bound_ip_address'].astype(int)

    creditcard_data.drop_duplicates(inplace=True)
    
    print("Datasets loaded for EDA.")
except FileNotFoundError as e:
    print(f"Error loading file for EDA: {e}. Please ensure the CSV files are in the data directory.")
    exit()

Datasets loaded for EDA.


## Merge the Datasets

In [None]:

print("\n--- Starting Merging Datasets for Geolocation Analysis ---")

# Sort ip_to_country by lower_bound_ip_address for efficient searching
ip_to_country_sorted = ip_to_country.sort_values(by='lower_bound_ip_address').reset_index(drop=True)


def get_country(ip_int, ip_ranges_df):

    idx = ip_ranges_df['lower_bound_ip_address'].searchsorted(ip_int, side='right') - 1

    if idx >= 0 and ip_int <= ip_ranges_df.loc[idx, 'upper_bound_ip_address']:
        return ip_ranges_df.loc[idx, 'country']
    else:
        return np.nan

print("Mapping IP addresses to countries. This may take a moment...")
fraud_data['country'] = fraud_data['ip_address_int'].apply(lambda x: get_country(x, ip_to_country_sorted))
print("IP address to country mapping complete.")

unmapped_ips_count = fraud_data['country'].isnull().sum()
if unmapped_ips_count > 0:
    print(f"\nWarning: {unmapped_ips_count} IP addresses could not be mapped to a country.")
else:
    print("\nAll IP addresses successfully mapped to a country.")

# Lable the unmapped countries Unknown
fraud_data['country'].fillna('Unknown', inplace=True)
print("\n--- Merged Fraud Data Info ---")
fraud_data.info()

print("\nFirst 5 rows of Fraud_Data.csv with new 'country' column:")
print(fraud_data.head())




--- Starting Merging Datasets for Geolocation Analysis ---
Mapping IP addresses to countries. This may take a moment...
IP address to country mapping complete.


--- Merged Fraud Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object        
 7   sex             151112 non-null  object        
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
 11  i

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fraud_data['country'].fillna('Unknown', inplace=True)


## Save the merged dataset

In [5]:
fraud_data.to_csv('../data/Fraud_Data_merged.csv', index=False)
print("Merged fraud_data saved to '../data/Fraud_Data_merged.csv'.")

Merged fraud_data saved to '../data/Fraud_Data_merged.csv'.
