## Credit Card Fraud Data Balance

In [None]:
import pandas as pd

### Load Unbalanced Training Data

In [73]:
df_train = pd.read_csv('fraudTrain.csv')

In [74]:
row_count = len(df_train)

In [75]:
row_count

1296675

### Drop Text Data Considered Irrelevant to this Analysis

In [76]:
df_train = df_train.drop(df_train.columns[0], axis='columns') # drop the first unnamed index column

In [77]:
df_train = df_train.drop(['trans_date_trans_time', 'merchant', 'first', 'last', 'street','city','state','job','dob','trans_num'], axis='columns')

In [78]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 12 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   cc_num      1296675 non-null  int64  
 1   category    1296675 non-null  object 
 2   amt         1296675 non-null  float64
 3   gender      1296675 non-null  object 
 4   zip         1296675 non-null  int64  
 5   lat         1296675 non-null  float64
 6   long        1296675 non-null  float64
 7   city_pop    1296675 non-null  int64  
 8   unix_time   1296675 non-null  int64  
 9   merch_lat   1296675 non-null  float64
 10  merch_long  1296675 non-null  float64
 11  is_fraud    1296675 non-null  int64  
dtypes: float64(5), int64(5), object(2)
memory usage: 118.7+ MB


### Data Wrangling

#### Map Gender

In [79]:
def map_gender(row):    
    if row['gender'] == "M":
        return 0
    else:
        return 1

In [80]:
df_train['gender'] = df_train.apply(lambda row: map_gender(row), axis=1)

#### Increase Fraud Transaction Count

In [81]:
fraud_count = round(row_count * 0.4) #update 40% of rows

In [82]:
fraud_count

518670

In [83]:
import random
#update random rows to Fraud
for i in range(fraud_count):
    r = random.randrange(1, fraud_count)
    df_train.at[r, "is_fraud"] = 1

#### Oversampling - Set Location Difference for Fradulent Transactions

In [84]:
#get all fraudulent transactions
df_train_fraud = df_train[df_train['is_fraud']==1]

In [85]:
fraud_rows_to_update = len(df_train_fraud)
print(fraud_rows_to_update)

333604


In [86]:
import random
#make the different about 100 degrees
diff_deg = 100
for i in range(fraud_rows_to_update):
    r = random.randrange(1, fraud_rows_to_update)
    df_train.at[r, "merch_lat"] = float(df_train.at[r, "merch_lat"]) + diff_deg
    df_train.at[r, "merch_long"] = float(df_train.at[r, "merch_long"]) + diff_deg

#### Map Latitude and Longitude

In [87]:
def find_lat_diff(row) :
    return round( abs(abs(row['lat']) - abs(row['merch_lat'])), 2 )

In [88]:
def find_long_diff(row) :
    return round( abs(abs(row['long']) - abs(row['merch_long'])), 2 )

In [89]:
df_train['lat_diff'] = df_train.apply(lambda row: find_lat_diff(row), axis=1)

In [90]:
df_train['long_diff'] = df_train.apply(lambda row: find_long_diff(row), axis=1)

#### Drop the original latitude and longitudes

In [91]:
df_train = df_train.drop(['lat', 'long', 'merch_lat', 'merch_long'], axis='columns')

In [92]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 10 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   cc_num     1296675 non-null  int64  
 1   category   1296675 non-null  object 
 2   amt        1296675 non-null  float64
 3   gender     1296675 non-null  int64  
 4   zip        1296675 non-null  int64  
 5   city_pop   1296675 non-null  int64  
 6   unix_time  1296675 non-null  int64  
 7   is_fraud   1296675 non-null  int64  
 8   lat_diff   1296675 non-null  float64
 9   long_diff  1296675 non-null  float64
dtypes: float64(3), int64(6), object(1)
memory usage: 98.9+ MB


#### Map Category Values
['misc_net', 'grocery_pos', 'entertainment', 'gas_transport', 'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos',       'food_dining', 'personal_care', 'health_fitness', 'travel', 'kids_pets', 'home']

In [93]:
def map_category_code(row):    
    if row['category'] == "misc_net":
        return 0
    elif row['category'] == "grocery_pos":
        return 1
    elif row['category'] == "entertainment":
        return 2
    elif row['category'] == "gas_transport":
        return 3
    elif row['category'] == "misc_pos":
        return 4
    elif row['category'] == "grocery_net":
        return 5
    elif row['category'] == "shopping_net":
        return 6
    elif row['category'] == "shopping_pos":
        return 7
    elif row['category'] == "food_dining":
        return 8
    elif row['category'] == "personal_care":
        return 9
    elif row['category'] == "health_fitness":
        return 10
    elif row['category'] == "travel":
        return 11
    elif row['category'] == "kids_pets":
        return 12
    elif row['category'] == "home":
        return 13

In [94]:
df_train['category_code'] = df_train.apply(lambda row: map_category_code(row), axis=1)

In [95]:
df_train = df_train.drop(['category'], axis='columns') #drop category text value

In [96]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 10 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   cc_num         1296675 non-null  int64  
 1   amt            1296675 non-null  float64
 2   gender         1296675 non-null  int64  
 3   zip            1296675 non-null  int64  
 4   city_pop       1296675 non-null  int64  
 5   unix_time      1296675 non-null  int64  
 6   is_fraud       1296675 non-null  int64  
 7   lat_diff       1296675 non-null  float64
 8   long_diff      1296675 non-null  float64
 9   category_code  1296675 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 98.9 MB


### Save Balanced Data

In [97]:
df_train.to_csv('fraudTrainBalanced.csv', index=False)

### Find Pearson Correlation

In [100]:
correlation_pearson = df_train.corr(method ='pearson')

In [101]:
correlation_pearson

Unnamed: 0,cc_num,amt,gender,zip,city_pop,unix_time,is_fraud,lat_diff,long_diff,category_code
cc_num,1.0,0.001769,-0.001112,0.041459,-0.008991,0.000354,-0.000731,-0.000434,0.001202,-0.00032
amt,0.001769,1.0,-0.001034,0.001843,0.005818,-0.000293,0.029814,0.000944,0.00143,-0.059366
gender,-0.001112,-0.001034,1.0,0.065951,0.028649,0.00096,-0.001769,-0.000412,0.001268,0.003711
zip,0.041459,0.001843,0.065951,1.0,0.078467,0.00067,-0.001313,-0.000868,0.02001,-0.000612
city_pop,-0.008991,0.005818,0.028649,0.078467,1.0,-0.001714,0.000896,0.001108,0.002765,0.003815
unix_time,0.000354,-0.000293,0.00096,0.00067,-0.001714,1.0,-0.591081,-0.490829,-0.460836,-0.000226
is_fraud,-0.000731,0.029814,-0.001769,-0.001313,0.000896,-0.591081,1.0,0.332083,0.311757,-0.005669
lat_diff,-0.000434,0.000944,-0.000412,-0.000868,0.001108,-0.490829,0.332083,1.0,0.823826,-0.000635
long_diff,0.001202,0.00143,0.001268,0.02001,0.002765,-0.460836,0.311757,0.823826,1.0,-0.000189
category_code,-0.00032,-0.059366,0.003711,-0.000612,0.003815,-0.000226,-0.005669,-0.000635,-0.000189,1.0


In [102]:
corr_is_fraud = abs(correlation_pearson["is_fraud"])
high_corr = corr_is_fraud[corr_is_fraud>0.1] #find highly correlated features
print(high_corr.sort_values())

long_diff    0.311757
lat_diff     0.332083
unix_time    0.591081
is_fraud     1.000000
Name: is_fraud, dtype: float64
