In [10]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datetime import datetime
import time
import json

# 1. DIVIDING DA DATASET

In [83]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("dataquest_fraud_train.csv")
df.sample(4)

Unnamed: 0,transDate,creditCardNum,business,category,amount,firstName,lastName,gender,street,city,...,latitude,longitude,cityPop,job,dateOfBirth,transNum,unixTime,merchLatitude,merchLongitude,isFraud
41919,2019-07-24 9:20,377550200000000.0,"fraud_Willms, Kris and Bergnaum",shopping_pos,3.79,Kevin,Walters,M,87227 Tapia Burgs,Auburn,...,44.0948,-70.239,23045.0,Tourist information centre manager,1958-09-02,e72e48517e0ff5ee5ede5d3be20778bc,1343122000.0,44.374025,-69.39423,0.0
21450,2019-07-16 21:45,4681699000000.0,fraud_Kuhn LLC,shopping_pos,3.0,Joseph,Gonzalez,M,319 Wendy Fort Suite 179,Murfreesboro,...,35.8596,-86.421,158701.0,"Journalist, newspaper",1978-03-06,7c0677a2019114622bc056c738662f72,1342475000.0,35.265799,-87.041881,0.0
17588,2019-07-15 12:21,501803000000.0,fraud_Larkin Ltd,kids_pets,37.23,Robert,Flores,M,3277 Fields Meadows Apt. 790,Greenview,...,41.5403,-122.9366,308.0,Call centre manager,1958-09-20,90a809caf98330098b366bf1dfcc040e,1342355000.0,41.095368,-123.736368,0.0
15701,2019-07-14 22:18,4.469777e+18,fraud_Bins-Howell,personal_care,2.62,Gregory,Graham,M,4005 Dana Glens,Methuen,...,42.728,-71.181,47249.0,Market researcher,1980-11-22,264629ffe623ae98ab5b2782107ce57d,1342304000.0,43.35958,-71.385489,0.0


In [84]:
# Replace empty strings with NaN
df.replace('', np.nan, inplace=True)

# Remove rows where all elements are NaN
df.dropna(how='all', inplace=True)

# Removing firstName, lastName, transNum, creditCardNum columns
df.drop(['firstName', 'lastName', 'transNum', 'creditCardNum', 'street'], axis=1, inplace=True)

# Convert transDate column to datetime format
df['transDate'] = pd.to_datetime(df['transDate'], format='%Y-%m-%d %H:%M')

# Convert transDate to Unix time and create a new unixTime column
df['unixTime'] = df['transDate'].astype('int64') // 10**9
df = df.drop(['transDate'], axis=1)

df.sample(4)

Unnamed: 0,business,category,amount,gender,city,state,zip,latitude,longitude,cityPop,job,dateOfBirth,unixTime,merchLatitude,merchLongitude,isFraud
47351,fraud_Lockman Ltd,grocery_pos,184.27,F,De Witt,AR,72042.0,34.2853,-91.3336,5161.0,Electrical engineer,1993-04-08,1565584080,33.907251,-91.538778,0.0
38104,fraud_Kulas Group,health_fitness,16.13,M,Halma,MN,56729.0,48.6669,-96.5969,140.0,"Embryologist, clinical",1942-01-06,1563815040,48.457431,-96.164139,0.0
17666,fraud_Kub PLC,personal_care,70.72,M,Athena,OR,97813.0,45.8289,-118.4971,1302.0,Dealer,1976-10-18,1563194880,46.760186,-118.326292,0.0
56894,fraud_Gulgowski LLC,home,83.06,M,Acworth,NH,3601.0,43.196,-72.3001,477.0,Naval architect,1988-04-15,1569190080,42.81874,-72.628616,0.0


# 2. Catgeorical Data to Numerical Data

## Encoding Catg. Data

In [85]:
# Encoding Gender
df['gender'] = df['gender'].apply(lambda x: 0 if x == 'F' else 1)

# Encoding Age
def calculate_age(birth_date_str):
    birth_date = datetime.strptime(birth_date_str, "%Y-%m-%d")
    today = datetime.now()
    age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
    return age
    
df['age'] = df['dateOfBirth'].apply(calculate_age)
df = df.drop(['dateOfBirth'], axis=1)
df.sample(5)

Unnamed: 0,business,category,amount,gender,city,state,zip,latitude,longitude,cityPop,job,unixTime,merchLatitude,merchLongitude,isFraud,age
7783,fraud_Stiedemann Ltd,food_dining,63.48,1,Hazel,KY,42049.0,36.5422,-88.3319,1480.0,"Designer, interior/spatial",1562943660,35.698417,-88.64812,0.0,49
5425,"fraud_Gottlieb, Considine and Schultz",shopping_net,9.01,0,Jermyn,PA,18433.0,41.5744,-75.5881,6508.0,Training and development officer,1562849100,40.580618,-75.200099,0.0,42
32504,"fraud_Stroman, Hudson and Erdman",gas_transport,43.99,0,Center Point,WV,26339.0,39.4125,-80.6352,255.0,Chief Executive Officer,1563691920,38.938677,-81.184408,0.0,52
15165,fraud_Macejkovic-Lesch,shopping_pos,2.18,1,Elizabeth,NJ,7208.0,40.6747,-74.2239,124967.0,Operational researcher,1563132420,41.173655,-73.712755,0.0,43
58284,"fraud_Lind, Huel and McClure",gas_transport,48.06,1,Westport,KY,40077.0,38.4921,-85.4524,564.0,Pensions consultant,1569234420,38.546129,-86.067218,0.0,27


# One Hot Encoding Data

In [86]:
"""
print(df.groupby('job').job.count())
print(df.groupby('state').state.count())
print(df.groupby('city').city.count())
print(df.groupby('category').category.count())
print(df.groupby('business').business.count())
"""

df = pd.get_dummies(df, columns=['job'])
df = pd.get_dummies(df, columns=['state'])
df = pd.get_dummies(df, columns=['city'])
df = pd.get_dummies(df, columns=['category'])
df = pd.get_dummies(df, columns=['business'])
df.sample(5)

Unnamed: 0,amount,gender,zip,latitude,longitude,cityPop,unixTime,merchLatitude,merchLongitude,isFraud,...,"business_fraud_Yost, Schamberger and Windler",business_fraud_Yost-Rogahn,business_fraud_Zboncak LLC,business_fraud_Zboncak Ltd,"business_fraud_Zboncak, Rowe and Murazik",business_fraud_Zemlak Group,"business_fraud_Zemlak, Tillman and Cremin",business_fraud_Ziemann-Waters,"business_fraud_Zieme, Bode and Dooley",business_fraud_Zulauf LLC
34053,55.37,1,44702.0,40.8027,-81.3739,192805.0,1563725580,41.356645,-81.41908,0.0,...,0,0,0,0,0,0,0,0,0,0
31733,6.82,1,92210.0,33.7163,-116.3381,4677.0,1563670500,34.36426,-117.150552,0.0,...,0,0,0,0,0,0,0,0,0,0
19842,98.54,1,12410.0,42.074,-74.453,397.0,1563239640,42.66908,-75.182892,0.0,...,0,0,0,0,0,0,0,0,0,0
4493,2.83,0,71232.0,32.3929,-91.4714,6581.0,1562806860,32.440491,-90.820581,0.0,...,0,0,0,0,0,0,0,0,0,0
11490,1.01,0,20687.0,38.0828,-76.3477,313.0,1563047340,38.458254,-76.666267,0.0,...,0,0,0,0,0,0,0,0,0,0


# 3. Adding Features

## Adding distance between transactions

In [87]:
import math
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers

    # Convert latitude and longitude from degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

# Coordinates of the two points
lat1, lon1 = 37.2692, -82.9161
lat2, lon2 = 37.622362, -82.676007

distanceBetween = []
for index in df.index:
    lat1 = df.iloc[index]["latitude"]
    long1 =  df.iloc[index]["longitude"]
    lat2 = df.iloc[index]["merchLatitude"]
    long2 = df.iloc[index]["merchLongitude"]
    distanceBetween.append(round(haversine(lat1, long1, lat2, long2), 4))

print(distanceBetween[0])
df["distance_between"] = distanceBetween
df.sample(5)
    

85.1941


Unnamed: 0,amount,gender,zip,latitude,longitude,cityPop,unixTime,merchLatitude,merchLongitude,isFraud,...,business_fraud_Yost-Rogahn,business_fraud_Zboncak LLC,business_fraud_Zboncak Ltd,"business_fraud_Zboncak, Rowe and Murazik",business_fraud_Zemlak Group,"business_fraud_Zemlak, Tillman and Cremin",business_fraud_Ziemann-Waters,"business_fraud_Zieme, Bode and Dooley",business_fraud_Zulauf LLC,distance_between
15378,34.05,1,94569.0,38.046,-122.1866,198.0,1563136500,38.630504,-121.567015,0.0,...,0,0,0,0,0,0,0,0,0,84.5238
9785,73.28,0,7439.0,41.0767,-74.5982,2456.0,1563006420,40.703525,-74.455181,0.0,...,0,0,0,0,0,0,0,0,0,43.2016
18039,4.18,0,15484.0,39.8936,-79.7856,328.0,1563201720,38.987284,-78.843284,0.0,...,0,0,0,0,0,0,0,0,0,129.2433
31582,61.66,0,20687.0,38.0828,-76.3477,313.0,1563666540,37.565616,-76.306225,0.0,...,0,0,0,0,0,0,0,0,0,57.6235
22274,43.94,0,55128.0,44.9913,-92.9487,753116.0,1563354540,44.888093,-93.458243,0.0,...,0,0,0,0,0,0,0,0,0,41.7154


# 4. Normalizing All Data

In [90]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Columns to be expressed as a percentage of their total
PERCENT_OF_TOTAL = ["amount", "cityPop"]

# Columns to be scaled to the range [0, 1]
PERCENT_OF_HIGHEST = ["zip", "distance_between", "unixTime", "latitude", "longitude", "merchLatitude", "merchLongitude"]

# Calculate the percentage of the total for specified columns
for item in PERCENT_OF_TOTAL:
    total = df[item].sum()
    df[item] = (df[item] / total) * 100  # Convert to percentage

# Normalize specified columns to the range [0, 1]
for item in PERCENT_OF_HIGHEST:
    df[item] = scaler.fit_transform(df[[item]])

df

Unnamed: 0,amount,gender,zip,latitude,longitude,cityPop,unixTime,merchLatitude,merchLongitude,isFraud,...,business_fraud_Yost-Rogahn,business_fraud_Zboncak LLC,business_fraud_Zboncak Ltd,"business_fraud_Zboncak, Rowe and Murazik",business_fraud_Zemlak Group,"business_fraud_Zemlak, Tillman and Cremin",business_fraud_Ziemann-Waters,"business_fraud_Zieme, Bode and Dooley",business_fraud_Zulauf LLC,distance_between
0,0.000095,0,0.611747,0.453062,0.781827,0.000129,0.000000,0.440364,0.772457,0.0,...,0,0,0,0,0,0,0,0,0,0.571898
1,0.000062,1,0.411861,0.377596,0.846853,0.000015,0.000000,0.389982,0.842888,0.0,...,0,0,0,0,0,0,0,0,0,0.298443
2,0.000532,0,0.951160,0.398916,0.437545,0.000006,0.000003,0.420193,0.448512,0.0,...,0,0,0,0,0,0,0,0,0,0.823080
3,0.000223,1,0.498995,0.499838,0.737319,0.000058,0.000003,0.508881,0.725817,0.0,...,0,0,0,0,0,0,0,0,0,0.506875
4,0.003002,0,0.245468,0.423638,0.898973,0.000037,0.000010,0.423377,0.885715,0.0,...,0,0,0,0,0,0,0,0,0,0.358175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58914,0.001329,0,0.672797,0.396853,0.685941,0.000004,0.999995,0.410008,0.685347,0.0,...,0,0,0,0,0,0,0,0,0,0.354353
58915,0.001645,1,0.937519,0.360733,0.466412,0.000002,1.000000,0.377551,0.460417,0.0,...,0,0,0,0,0,0,0,0,0,0.572298
58916,0.001495,0,0.278119,0.354218,0.865145,0.000406,1.000000,0.354455,0.854927,0.0,...,0,0,0,0,0,0,0,0,0,0.289904
58917,0.001549,0,0.416296,0.376186,0.806503,0.000009,1.000000,0.375578,0.808744,0.0,...,0,0,0,0,0,0,0,0,0,0.506180


# 5. Balancing Dataset

In [98]:
# Checking Balance of Fraud to No Fraud
def checkBalance(newDf):
    distribution = newDf.groupby("isFraud").isFraud.count()
    print(distribution)
    print(distribution[0] / (distribution[0] + distribution[1]))
    print(distribution[1] / (distribution[0] + distribution[1]))

checkBalance(df)

isFraud
0.0    58305
1.0      614
Name: isFraud, dtype: int64
0.9895789134235137
0.010421086576486363


In [95]:
# Oversampling and Undersampling Data

# Separate majority and minority classes
majority_class = df[df.isFraud == 0]
minority_class = df[df.isFraud == 1]

# Upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,     # sample with replacement
                              n_samples=len(majority_class),    # to match majority class size
                              random_state=123) # reproducible results

# Downsample Majority class
majority_downsampled = resample(majority_class,
                              replace=True,     # sample with replacement
                              n_samples=len(minority_class),    # to match majority class size
                              random_state=123) # reproducible results

# Combine Dataframes
oversampledDf = pd.concat([majority_class, minority_upsampled])
undersampledDf = pd.concat([minority_class, majority_downsampled])

#oversampledDf.isFraud.sample(5), undersampledDf.isFraud.sample(5)
oversampledDf.shape, undersampledDf.shape

((116610, 2082), (1228, 2082))

In [97]:
# Creating X and Y Training Sets
X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(
                                                            oversampledDf.drop("isFraud", axis=1),
                                                            oversampledDf.isFraud,
                                                            test_size=0.2, 
                                                            random_state=0
                                                        )
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(
                                                            undersampledDf.drop("isFraud", axis=1),
                                                            undersampledDf.isFraud,
                                                            test_size=0.2, 
                                                            random_state=0
                                                        )

# 6. FEATURE SELECTION WITH RANDOM FOREST

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Find columns with high correlation
high_corr_columns = [column for column in correlation_matrix.columns if any(correlation_matrix[column] > 0.75)]
print(f"Columns with high correlation: {high_corr_columns}")
pca = PCA(n_components=0.95)  # for example, to retain 95% of variance

# Fit and transform the data with PCA
# Normally you would fit PCA on the normalized or standardized data
pca_result = pca.fit_transform(df)

# Create a new DataFrame with the PCA results
pca_df = pd.DataFrame(pca_result)

print(f"Original shape: {df.shape}, Reduced shape: {pca_df.shape}")

Columns with high correlation: ['amount', 'gender', 'zip', 'latitude', 'longitude', 'cityPop', 'unixTime', 'merchLatitude', 'merchLongitude', 'isFraud', 'age', 'job_Academic librarian', 'job_Accountant, chartered certified', 'job_Accountant, chartered public finance', 'job_Accounting technician', 'job_Acupuncturist', 'job_Administrator', 'job_Administrator, arts', 'job_Administrator, charities/voluntary organisations', 'job_Administrator, education', 'job_Administrator, local government', 'job_Advertising account executive', 'job_Advertising account planner', 'job_Advertising copywriter', 'job_Advice worker', 'job_Aeronautical engineer', 'job_Agricultural consultant', 'job_Aid worker', 'job_Air broker', 'job_Air cabin crew', 'job_Airline pilot', 'job_Ambulance person', 'job_Amenity horticulturist', 'job_Analytical chemist', 'job_Animal nutritionist', 'job_Animal technologist', 'job_Animator', 'job_Applications developer', 'job_Arboriculturist', 'job_Archaeologist', 'job_Architect', 'jo

In [103]:
df.sample(5)

Unnamed: 0,amount,gender,zip,latitude,longitude,cityPop,unixTime,merchLatitude,merchLongitude,isFraud,...,business_fraud_Yost-Rogahn,business_fraud_Zboncak LLC,business_fraud_Zboncak Ltd,"business_fraud_Zboncak, Rowe and Murazik",business_fraud_Zemlak Group,"business_fraud_Zemlak, Tillman and Cremin",business_fraud_Ziemann-Waters,"business_fraud_Zieme, Bode and Dooley",business_fraud_Zulauf LLC,distance_between
29451,0.002016,0,0.686875,0.443897,0.684956,9e-06,0.754455,0.430269,0.676782,0.0,...,0,0,0,0,0,0,0,0,0,0.630863
1362,0.003473,1,0.426933,0.448286,0.847691,5e-06,0.713501,0.462156,0.841421,0.0,...,0,0,0,0,0,0,0,0,0,0.41748
44777,0.001047,0,0.914114,0.314313,0.482484,0.00067,0.837865,0.328119,0.484584,0.0,...,0,0,0,0,0,0,0,0,0,0.239404
12105,0.001359,1,0.168412,0.458557,0.908852,1.2e-05,0.730104,0.457829,0.897486,0.0,...,0,0,0,0,0,0,0,0,0,0.228778
27947,0.000927,0,0.144398,0.43507,0.878888,6e-06,0.752279,0.444162,0.863808,0.0,...,0,0,0,0,0,0,0,0,0,0.514287


In [106]:
pca_df.sample(5)

Unnamed: 0,0
48278,46.866257
17589,38.867066
17083,11.866853
30083,41.867174
43996,-4.133161


In [None]:
# Using Random Forest to Generate
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_over_train, y_over_train)
sel.get_support()
selected_feat= X_train.columns[(sel.get_support())]

# Features That Are Most Important
len(selected_feat)
print(selected_feat)

# 7. Model Creation