# Fraud Detection Data Preprocessing

## 1. Importing libraries and loading the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Let pandas show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

In [None]:
# Read the CSV and show it as the cell output; the result is not stored in a variable.
pd.read_csv('~/aiffel/data/fraud.csv')

## 2. Understanding dataset

In [None]:
# Load the dataset into fraud_df, then preview its first rows.
fraud_df = pd.read_csv('~/aiffel/data/fraud.csv')
fraud_df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
fraud_df.info()

## 3. Handling duplicated data

In [None]:
# Show the rows flagged by duplicated(); this only inspects them, nothing is dropped.
fraud_df[fraud_df.duplicated()]

## 4. Handling missing data

In [None]:
# Count of missing values in each column.
fraud_df.isna().sum()

In [None]:
# Fraction of missing values in each column.
fraud_df.isna().mean()

## 5. Convert 'trans_date_trans_time' and 'dob' to datetime

In [None]:
# Preview the parsed datetimes; the result is not assigned, so fraud_df is unchanged here.
pd.to_datetime(fraud_df['trans_date_trans_time'])

In [None]:
# Replace the column with its parsed datetime version.
fraud_df['trans_date_trans_time'] = pd.to_datetime(fraud_df['trans_date_trans_time'])

In [None]:
# Replace 'dob' with its parsed datetime version.
fraud_df['dob'] = pd.to_datetime(fraud_df['dob'])

### Extract 'hour' value from the 'trans_date_trans_time'

In [None]:
# Preview the hour component of each transaction timestamp.
fraud_df['trans_date_trans_time'].dt.hour

In [None]:
# Store the hour of each transaction timestamp in a new 'hour' column.
fraud_df['hour'] = fraud_df['trans_date_trans_time'].dt.hour

### Extract 'age' from 'dob'

In [None]:
from datetime import datetime  # import retained in case other cells rely on it

# Age of the card holder in the year of the transaction (calendar-year
# difference between the transaction timestamp and the date of birth).
# Using the transaction year instead of datetime.now() keeps the feature
# reproducible: it no longer changes with the date the notebook is run on.
fraud_df['age'] = fraud_df['trans_date_trans_time'].dt.year - fraud_df['dob'].dt.year

## 6. Binning 'amt' data
Bin edges: [0, 10, 50, 100, 500, 1000, 5000, 10000, 15000, 20000, 25000]

In [None]:
# Summary statistics of the 'amt' column.
fraud_df['amt'].describe()

In [None]:
# Bin edges for the transaction amount; pd.cut builds right-closed intervals
# from consecutive edges, e.g. (10, 50].
bins = [0, 10, 50, 100, 500, 1000, 5000, 10000, 15000, 20000, 25000]
# One ordinal label per interval (1..10), derived from the edges so the two
# lists cannot drift out of sync.
labels = list(range(1, len(bins)))
# include_lowest=True closes the first interval on the left ([0, 10]), so an
# amount of exactly 0 is binned instead of silently becoming NaN.
# Amounts above the last edge are still out of range and map to NaN.
fraud_df['amt_bin'] = pd.cut(fraud_df['amt'], bins=bins, labels=labels, include_lowest=True)

In [None]:
# Scatter of amt_bin against the row index, colored by is_fraud.
sns.scatterplot(data=fraud_df, x = fraud_df.index, y= 'amt_bin', hue='is_fraud')

## 7. Divide regions into west/central/east based on the 'long' column


In [None]:
# Scatter of longitude vs. latitude, colored by is_fraud, drawn with low opacity.
# NOTE(review): `size` is seaborn's semantic size-mapping parameter; if the intent
# was a fixed small marker, the matplotlib keyword `s=1` may be what is wanted — confirm.
sns.scatterplot(data=fraud_df, x = 'long', y= 'lat', hue='is_fraud', alpha = 0.1, size = 1)

In [None]:
def determine_region_ew(longitude):
    """Return the east/west band name for a longitude value.

    Bands (degrees):
        'Eastern': -82  <= longitude <= -66
        'Central': -101 <= longitude <  -82
        'Western': -125 <= longitude <  -101
    Anything outside [-125, -66], including NaN, yields 'Other'.
    """
    # Reject everything outside the overall span first; NaN fails the
    # comparison and therefore lands here as well.
    if not (-125 <= longitude <= -66):
        return 'Other'
    # Inside the span, walk the lower bounds from east to west.
    if longitude >= -82:
        return 'Eastern'
    if longitude >= -101:
        return 'Central'
    return 'Western'
    
# Label every row with its east/central/west band and store it as a categorical column.
fraud_df['east_west'] = fraud_df['long'].map(determine_region_ew).astype('category')


In [None]:
# Scatter of longitude vs. latitude, colored by the east_west label.
sns.scatterplot(data=fraud_df, x = 'long', y= 'lat', hue='east_west')

In [None]:
# Number of rows per east_west label.
fraud_df['east_west'].value_counts()

## 8. Divide regions into north/central/south based on the 'lat' column

In [None]:
def determine_region_ns(latitude):
    """Return the north/south band name for a latitude value.

    Bands (degrees):
        'Northern': 40 <  latitude <  60
        'Central':  35 <  latitude <= 40
        'Southern': 20 <  latitude <= 35
    Anything outside (20, 60), including NaN, yields 'Other'.
    """
    # Reject everything outside the overall open span first; NaN fails the
    # comparison and therefore lands here as well.
    if not (20 < latitude < 60):
        return 'Other'
    # Inside the span, walk the exclusive lower bounds from north to south.
    if latitude > 40:
        return 'Northern'
    if latitude > 35:
        return 'Central'
    return 'Southern'
    
# Label every row with its north/central/south band and store it as a categorical column.
fraud_df['north_south'] = fraud_df['lat'].map(determine_region_ns).astype('category')

In [None]:
# Scatter of longitude vs. latitude, colored by the north_south label.
sns.scatterplot(data=fraud_df, x = 'long', y= 'lat', hue='north_south')

In [None]:
# Number of rows per north_south label.
fraud_df['north_south'].value_counts()

## 9. Eliminate unnecessary columns

In [None]:
# Build fraud_df2 as a copy of fraud_df without the columns listed below:
# the raw timestamp / amount / coordinate columns and the identifier,
# personal, address, and merchant-location columns.
fraud_df2 = fraud_df.drop(
    columns=[
        'trans_date_trans_time', 'cc_num', 'merchant', 'amt',
        'first', 'last', 'city', 'street', 'state', 'zip',
        'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
        'lat', 'long', 'merch_lat', 'merch_long',
    ]
)

In [None]:
# Show the column names of fraud_df2.
fraud_df2.columns

In [None]:
# Print the DataFrame.info() summary of fraud_df2.
fraud_df2.info()

In [None]:
# Preview the first rows of fraud_df2.
fraud_df2.head()

## 10. Change the dtype of the 'category' and 'gender' columns from object to category

In [None]:
# Convert the 'category' column to pandas' categorical dtype.
fraud_df2['category'] = fraud_df2['category'].astype('category')

In [None]:
# Convert the 'gender' column to pandas' categorical dtype.
fraud_df2['gender'] = fraud_df2['gender'].astype('category')

In [None]:
# Print the DataFrame.info() summary again to check the dtypes after conversion.
fraud_df2.info()

## 11. One-Hot encoding & Correlation analysis

In [None]:
# One-hot encode fraud_df2 with pd.get_dummies, then show the pairwise
# correlation matrix of the encoded columns as the cell output.
fraud_df2_encoded = pd.get_dummies(fraud_df2)
fraud_df2_encoded.corr()


In [None]:
# Draw the correlation matrix of the encoded features as an annotated heatmap.
colormap = plt.cm.coolwarm
plt.figure(figsize=(15, 15))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    # Cast to float before computing the correlation matrix.
    fraud_df2_encoded.astype(float).corr(),
    cmap=colormap,
    vmax=1.0,
    square=True,
    linewidths=0.1,
    linecolor='white',
    # Write each coefficient into its cell with one decimal place.
    annot=True,
    fmt='.1f',
    annot_kws={'size': 7},
)

## 12. Model

In [None]:
# Print the DataFrame.info() summary of the encoded frame before modelling.
fraud_df2_encoded.info()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Separate the target from the features (no train/test split happens here):
# target_label holds the 'is_fraud' values, X_train_df holds every other column.
target_label = fraud_df2_encoded['is_fraud'].values
X_train_df = fraud_df2_encoded.drop(columns='is_fraud').values

In [None]:
# Split features and labels into train (70%) and test (30%) sets.
# stratify=target_label keeps the class proportions of is_fraud the same in
# both splits, so a rare positive class cannot end up under-represented in
# the test set by chance. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_df,
    target_label,
    test_size=0.3,
    random_state=2024,
    stratify=target_label,
)

In [None]:
# Train model (Random Forest classifier, 500 trees).
# n_jobs=-1 builds the trees on all available CPU cores; with random_state
# fixed, the fitted forest is the same as with single-core training.
model = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the test-set features with the fitted model.
y_pred = model.predict(X_test)

In [None]:
# Score the test-set predictions against the true labels.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print the results, one metric per line.
for _label, _value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(_label, _value)