In [1]:
# Libraries
import kaggle
import os
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# Notebook-wide pandas display options: show every column, cap printed rows at 25
display_options = {
    'display.max_columns': None,
    'display.max_rows': 25,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Fetch the dataset through the Kaggle API unless both CSVs are already on disk.
# Requirements: Kaggle API key
required_files = ("data/fraudTrain.csv", "data/fraudTest.csv")
if not all(os.path.isfile(csv_path) for csv_path in required_files):
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files('kartik2112/fraud-detection', path='./data', unzip=True)

In [4]:
# Load the train and test splits; the first CSV column is used as the row index
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/fraudTrain.csv', 'data/fraudTest.csv')
)

In [5]:
# Cast Data
#
# Convert the transaction timestamp and the date-of-birth column of both
# splits to datetime64.
#
# The conversion uses the default errors='raise' on purpose: errors='ignore'
# (deprecated since pandas 2.2) returns the input unchanged when any value
# cannot be parsed, which would silently leave the columns as strings.
datetime_columns = ['trans_date_trans_time', 'dob']  # dob = date of birth

for frame in (df_train, df_test):
    for column in datetime_columns:
        frame[column] = pd.to_datetime(frame[column])

In [6]:
# Reduce dimension: remove columns that are not kept in the cleaned dataset
drop_columns = [
    "first",
    "last",
    "street",
    "city",
    "state",
    "zip",
    "trans_num",
    "unix_time",
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_columns, inplace=True)

In [8]:
# Calculate Age
def _age_in_years(timestamps, birth_dates):
    """Return the age at each timestamp, rounded to the nearest whole year.

    Parameters
    ----------
    timestamps : pandas.Series of datetime64
        Moment at which the age is evaluated.
    birth_dates : pandas.Series of datetime64
        Date of birth, aligned with ``timestamps``.

    Returns
    -------
    pandas.Series of int64
        ``round((timestamps - birth_dates) / one year)``.

    Notes
    -----
    A year is taken as 365.2425 days, which is the length NumPy assigns to
    ``np.timedelta64(1, 'Y')``. The explicit Timedelta is used because the
    ambiguous 'Y' unit is not accepted by pandas 2.x timedelta arithmetic.
    The value is rounded to the nearest year (not floored).
    """
    one_year = pd.Timedelta(days=365.2425)
    return np.round((timestamps - birth_dates) / one_year).astype('int64')


df_train['age'] = _age_in_years(df_train['trans_date_trans_time'], df_train['dob'])
df_test['age'] = _age_in_years(df_test['trans_date_trans_time'], df_test['dob'])

# The birth date is dropped once the age has been derived from it
drop_columns = ["dob"]

df_train.drop(columns=drop_columns, inplace=True)
df_test.drop(columns=drop_columns, inplace=True)

In [12]:
# Rename Columns
# Map the original column names to the names used in the cleaned dataset.
trans_dict = {
    "trans_date_trans_time": "timestamp",
    "cc_num": "credit_card_num",
    "merchant": "shop",
    "amt": "amount",
}
df_train.rename(columns=trans_dict, inplace=True)
df_test.rename(columns=trans_dict, inplace=True)

# Kept as the last expression so the notebook displays the renamed frame;
# placed before the rename, its result was computed and then discarded.
df_train.head()

In [14]:
# Write the cleaned train and test splits to CSV (row index included)
for frame, csv_path in (
    (df_train, "data/clean_fraudTrain.csv"),
    (df_test, "data/clean_fraudTest.csv"),
):
    frame.to_csv(csv_path)