## Credit Card Fraud Data Balance

In [1]:
import pandas as pd

### Load Unbalanced Training Data

In [2]:
df_train = pd.read_csv('fraudTrain.csv')

In [3]:
row_count = len(df_train)

In [4]:
row_count

1296675

### Drop Text Data Considered Irrelevant to this Analysis

In [5]:
df_train = df_train.drop(df_train.columns[0], axis='columns') # drop the first unnamed index column

In [6]:
df_train = df_train.drop(['trans_date_trans_time', 'merchant', 'first', 'last', 'street','city','state','job','cc_num','trans_num','unix_time','zip','city_pop'], axis='columns')

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   category    1296675 non-null  object 
 1   amt         1296675 non-null  float64
 2   gender      1296675 non-null  object 
 3   lat         1296675 non-null  float64
 4   long        1296675 non-null  float64
 5   dob         1296675 non-null  object 
 6   merch_lat   1296675 non-null  float64
 7   merch_long  1296675 non-null  float64
 8   is_fraud    1296675 non-null  int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 89.0+ MB


### Data Wrangling

#### Map Gender

In [8]:
def map_gender(row):    
    if row['gender'] == "M":
        return 0
    else:
        return 1

In [9]:
df_train['gender'] = df_train.apply(lambda row: map_gender(row), axis=1)

#### Calculate Age from Date-of-Birth and Map to AgeGroup

In [10]:
from datetime import date
def calculate_age(row):
  today = date.today()
  return today.year - row['dob'].year - ((today.month, today.day) < (row['dob'].month, row['dob'].day))

In [11]:
df_train['dob'] = pd.to_datetime(df_train['dob'])
df_train['age'] = df_train.apply (lambda row: calculate_age(row), axis=1)

In [12]:
def map_age_group(row):    
    if row['age'] <= 19:
        return 0 #teenage
    elif row['age'] > 19 and row['age'] <= 24:
        return 1 #young_adult
    elif row['age'] > 24 and row['age'] <= 39:
        return 2 #adult
    elif row['age'] > 39 and row['age'] <= 54:
        return 3 #middle_aged
    else:
        return 4 #elderly

In [13]:
df_train['age_group'] = df_train.apply(lambda row: map_age_group(row), axis=1)

In [14]:
df_train = df_train.drop(['dob','age'], axis='columns') #drop DOB & Age

#### Categorize Transaction Amount into Groups

In [15]:
def map_price_group(row):    
    if row['amt'] <= 30:
        return 0 #Cheap
    elif row['amt'] > 30 and row['amt'] <= 60:
        return 1 #Affordable
    elif row['amt'] > 60 and row['amt'] <= 499:
        return 2 #Average
    elif row['amt'] > 499 and row['amt'] <= 2999:
        return 3 #Expensive
    else:
        return 4 #Luxury

In [16]:
df_train['price_range'] = df_train.apply (lambda row: map_price_group(row), axis=1)

In [17]:
df_train = df_train.drop(['amt'], axis='columns') #drop Amount

#### Oversampling - Increase Distance for 40% of Transactions and Mark as Fraudulent

In [18]:
fraud_count = round(row_count * 0.4) #update 40% of rows

In [19]:
fraud_count

518670

In [20]:
import random
#make the different about 100 degrees
diff_deg = 100
row_no = 0
for i in range(fraud_count):
    row_no = row_no + 2
    df_train.at[row_no, "merch_lat"] = float(df_train.at[row_no, "merch_lat"]) + diff_deg
    df_train.at[row_no, "merch_long"] = float(df_train.at[row_no, "merch_long"]) + diff_deg

#### Map Latitude and Longitude to Distance

In [21]:
def find_lat_diff(row) :
    return round( abs(abs(row['lat']) - abs(row['merch_lat'])), 2 )

In [22]:
def find_long_diff(row) :
    return round( abs(abs(row['long']) - abs(row['merch_long'])), 2 )

In [23]:
df_train['lat_diff'] = df_train.apply(lambda row: find_lat_diff(row), axis=1)

In [24]:
df_train['long_diff'] = df_train.apply(lambda row: find_long_diff(row), axis=1)

In [25]:
import numpy as np
df_train['distance'] = np.linalg.norm(df_train[['long_diff', 'lat_diff']], axis=1) # find distance

In [26]:
df_train['distance'] = round(df_train['distance'], 2) #round to 2 decimal

In [27]:
df_train.loc[df_train['distance']>10, 'is_fraud'] = 1 #set any distance > 10 as Fraudulent

In [28]:
len(df_train[df_train['is_fraud']==1])

523193

#### Drop the original latitude and longitudes

In [29]:
df_train = df_train.drop(['lat', 'long', 'merch_lat', 'merch_long','lat_diff','long_diff'], axis='columns')

In [30]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1296675 non-null  object 
 1   gender       1296675 non-null  int64  
 2   is_fraud     1296675 non-null  int64  
 3   age_group    1296675 non-null  int64  
 4   price_range  1296675 non-null  int64  
 5   distance     1296675 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 59.4+ MB


#### Map Category Values
['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos',       'food_dining', 'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

In [31]:
def map_category_code(row):    
    if row['category'] == "misc_net":
        return 0
    elif row['category'] == "grocery_pos":
        return 1
    elif row['category'] == "entertainment":
        return 2
    elif row['category'] == "gas_transport":
        return 3
    elif row['category'] == "misc_pos":
        return 4
    elif row['category'] == "grocery_net":
        return 5
    elif row['category'] == "shopping_net":
        return 6
    elif row['category'] == "shopping_pos":
        return 7
    elif row['category'] == "food_dining":
        return 8
    elif row['category'] == "personal_care":
        return 9
    elif row['category'] == "health_fitness":
        return 10
    elif row['category'] == "travel":
        return 11
    elif row['category'] == "kids_pets":
        return 12
    elif row['category'] == "home":
        return 13

In [32]:
df_train['category_code'] = df_train.apply(lambda row: map_category_code(row), axis=1)

In [33]:
df_train = df_train.drop(['category'], axis='columns') #drop category text value

In [34]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   gender         1296675 non-null  int64  
 1   is_fraud       1296675 non-null  int64  
 2   age_group      1296675 non-null  int64  
 3   price_range    1296675 non-null  int64  
 4   distance       1296675 non-null  float64
 5   category_code  1296675 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 59.4 MB


### Save Balanced Data

In [35]:
df_train.to_csv('fraudTrainBalanced.csv', index=False)

### Find Pearson Correlation

In [36]:
correlation_pearson = df_train.corr(method ='pearson')

In [37]:
correlation_pearson

Unnamed: 0,gender,is_fraud,age_group,price_range,distance,category_code
gender,1.0,-0.000895,-0.028813,-0.061793,0.003886,0.003711
is_fraud,-0.000895,1.0,0.003008,0.007011,0.986578,-0.00396
age_group,-0.028813,0.003008,1.0,-0.017746,0.003492,0.012787
price_range,-0.061793,0.007011,-0.017746,1.0,-0.001314,-0.171678
distance,0.003886,0.986578,0.003492,-0.001314,1.0,-0.000413
category_code,0.003711,-0.00396,0.012787,-0.171678,-0.000413,1.0


In [38]:
corr_is_fraud = abs(correlation_pearson["is_fraud"])
high_corr = corr_is_fraud[corr_is_fraud>0.1] #find highly correlated features
print(high_corr.sort_values())

distance    0.986578
is_fraud    1.000000
Name: is_fraud, dtype: float64
