In [16]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datetime import datetime
import time
import json

# 1. DIVIDING DA DATASET

In [38]:
df = pd.read_csv("dataquest_fraud_train.csv")

# Replace empty strings with NaN
df.replace('', np.nan, inplace=True)

# Remove rows where all elements are NaN
df.dropna(how='all', inplace=True)

# Save the cleaned dataset to a new CSV file (optional)
df.to_csv('dataquest_fraud_train.csv', index=False)

# Removing firstName, lastName, transNum, transDate
df.drop(['firstName', 'lastName', 'transNum', 'unixTime'], axis=1, inplace=True)

# Converting transDate into unixTime
df.x

"''uniTimex"

In [39]:
df

Unnamed: 0,transDate,creditCardNum,business,category,amount,gender,street,city,state,zip,latitude,longitude,cityPop,job,dateOfBirth,merchLatitude,merchLongitude,isFraud
0,2019-01-01 4:28,4.119763e+15,fraud_Welch Inc,misc_net,4.10,F,1497 West Gateway,Eureka,IL,61530.0,40.7152,-89.2706,6713.0,Archivist,1945-08-19,40.020043,-89.693412,0.0
1,2019-01-01 4:28,3.607811e+13,fraud_Auer-West,shopping_net,2.67,M,956 Sanchez Highway,Mallie,KY,41836.0,37.2692,-82.9161,798.0,Facilities manager,1926-06-26,37.622362,-82.676007,0.0
2,2019-01-01 4:29,4.449531e+15,fraud_Balistreri-Nader,misc_pos,22.88,F,84079 Thomas Burgs,Tomales,CA,94971.0,38.2427,-122.9145,337.0,Occupational psychologist,1954-07-05,39.060101,-121.969417,0.0
3,2019-01-01 4:29,4.302475e+15,"fraud_Boehm, Predovic and Reinger",misc_pos,9.59,M,384 Newman Forks Apt. 370,Belmond,IA,50421.0,42.8511,-93.6200,3032.0,Community pharmacist,1964-08-08,43.280848,-94.340312,0.0
4,2019-01-01 4:32,3.772340e+14,fraud_Hudson-Ratke,grocery_pos,129.06,F,43576 Kristina Islands,Shenandoah Junction,WV,25442.0,39.3716,-77.8229,1925.0,Systems developer,1966-02-14,39.211630,-78.409044,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58914,2019-09-23 14:58,4.259996e+12,fraud_Weimann-Lockman,kids_pets,57.13,F,9331 Robert Passage Suite 327,Hudson,KS,67545.0,38.1485,-98.6408,215.0,Probation officer,1968-11-22,38.575422,-98.372574,0.0
58915,2019-09-23 15:00,6.304849e+11,fraud_Collier LLC,home,70.72,M,8088 Sherman Radial Suite 689,Helm,CA,93627.0,36.4992,-120.0936,123.0,Early years teacher,1973-02-07,37.030768,-120.783300,0.0
58916,2019-09-23 15:00,3.708775e+14,fraud_Bernhard-Lesch,food_dining,64.29,F,25961 Beverly Union Apt. 042,North Wilkesboro,NC,28659.0,36.2017,-81.1286,21134.0,Dispensing optician,1984-03-06,35.931601,-81.476513,0.0
58917,2019-09-23 15:00,3.551513e+15,"fraud_Roberts, Ryan and Smith",personal_care,66.59,F,3786 Hale Corners,Rochester,KY,42273.0,37.2048,-86.8592,443.0,"Development worker, international aid",1947-07-15,36.936864,-86.077960,0.0


## Encoding Gender

In [25]:
df['gender'] = df['gender'].apply(lambda x: 0 if x == 'F' else 1)
df

Unnamed: 0,transDate,creditCardNum,business,category,amount,firstName,lastName,gender,street,city,...,latitude,longitude,cityPop,job,dateOfBirth,transNum,unixTime,merchLatitude,merchLongitude,isFraud
0,2019-01-01 4:28,4.119763e+15,fraud_Welch Inc,misc_net,4.10,Melinda,Gutierrez,0,1497 West Gateway,Eureka,...,40.7152,-89.2706,6713.0,Archivist,1945-08-19,3fc1791ff1db94ce9540262d7eb973e4,1.325392e+09,40.020043,-89.693412,0.0
1,2019-01-01 4:28,3.607811e+13,fraud_Auer-West,shopping_net,2.67,Christopher,Horn,1,956 Sanchez Highway,Mallie,...,37.2692,-82.9161,798.0,Facilities manager,1926-06-26,edbf65b8606b53fc48168c2daa5933e5,1.325392e+09,37.622362,-82.676007,0.0
2,2019-01-01 4:29,4.449531e+15,fraud_Balistreri-Nader,misc_pos,22.88,Felicia,Mckee,0,84079 Thomas Burgs,Tomales,...,38.2427,-122.9145,337.0,Occupational psychologist,1954-07-05,2fcbde131cff833751dde53da54da6da,1.325392e+09,39.060101,-121.969417,0.0
3,2019-01-01 4:29,4.302475e+15,"fraud_Boehm, Predovic and Reinger",misc_pos,9.59,Daniel,Cain,1,384 Newman Forks Apt. 370,Belmond,...,42.8511,-93.6200,3032.0,Community pharmacist,1964-08-08,bcae564f9a1a299dfbd1efd917499968,1.325392e+09,43.280848,-94.340312,0.0
4,2019-01-01 4:32,3.772340e+14,fraud_Hudson-Ratke,grocery_pos,129.06,Theresa,Blackwell,0,43576 Kristina Islands,Shenandoah Junction,...,39.3716,-77.8229,1925.0,Systems developer,1966-02-14,5e42e35c255edb55a192b3f1f0600936,1.325392e+09,39.211630,-78.409044,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181817,2019-12-15 13:48,4.958590e+18,fraud_Goyette-Gerhold,kids_pets,72.97,Aaron,Pena,1,793 Hooper Tunnel Suite 154,Burke,...,38.7894,-77.2818,43102.0,Health visitor,1950-11-27,16cfc30968a4ffc36f44a8fa4d98224f,1.355579e+09,38.676963,-77.647315,0.0
181818,2019-12-15 13:48,2.576710e+15,"fraud_Reichel, Bradtke and Blanda",travel,7.50,Joseph,Morgan,1,126 Underwood Drive,San Diego,...,33.0067,-117.0690,1241364.0,Chartered public finance accountant,1959-08-05,9c68a7b61b6d9a1afd3db343f6943e0e,1.355579e+09,33.567015,-117.134451,0.0
181819,2019-12-15 13:49,6.011680e+15,"fraud_Bradtke, Torp and Bahringer",personal_care,46.18,Jennifer,Gonzalez,0,5517 Stacy Land,Jelm,...,41.0539,-106.0763,100.0,Public librarian,1974-04-16,e715a05b1aab3b771d6c22a0ef0e6fcd,1.355579e+09,40.456310,-106.856904,0.0
181820,2019-12-15 13:49,4.102004e+18,fraud_Connelly-Carter,home,11.86,William,Fitzgerald,1,715 Courtney Pike Suite 932,Keller,...,32.9276,-97.2489,95035.0,Probation officer,1987-06-13,df05203f1050a2621bd8c4e41a3e4d8b,1.355579e+09,32.266679,-97.380913,0.0


## Adding distance between transactions

In [36]:
import math
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers

    # Convert latitude and longitude from degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c

    return distance

# Coordinates of the two points
lat1, lon1 = 37.2692, -82.9161
lat2, lon2 = 37.622362, -82.676007

distanceBetween = []
for index in df.index:
    lat1 = df.iloc[index]["latitude"]
    long1 =  df.iloc[index]["longitude"]
    lat2 = df.iloc[index]["merchLatitude"]
    long2 = df.iloc[index]["merchLongitude"]
    distanceBetween.append(round(haversine(lat1, long1, lat2, long2), 4))

print(distanceBetween[0])
df["distance_between"] = distanceBetween
df
    

85.1941


Unnamed: 0,transDate,creditCardNum,business,category,amount,firstName,lastName,gender,street,city,...,longitude,cityPop,job,dateOfBirth,transNum,unixTime,merchLatitude,merchLongitude,isFraud,distance_between
0,2019-01-01 4:28,4.119763e+15,fraud_Welch Inc,misc_net,4.10,Melinda,Gutierrez,F,1497 West Gateway,Eureka,...,-89.2706,6713.0,Archivist,1945-08-19,3fc1791ff1db94ce9540262d7eb973e4,1.325392e+09,40.020043,-89.693412,0.0,85.1941
1,2019-01-01 4:28,3.607811e+13,fraud_Auer-West,shopping_net,2.67,Christopher,Horn,M,956 Sanchez Highway,Mallie,...,-82.9161,798.0,Facilities manager,1926-06-26,edbf65b8606b53fc48168c2daa5933e5,1.325392e+09,37.622362,-82.676007,0.0,44.6248
2,2019-01-01 4:29,4.449531e+15,fraud_Balistreri-Nader,misc_pos,22.88,Felicia,Mckee,F,84079 Thomas Burgs,Tomales,...,-122.9145,337.0,Occupational psychologist,1954-07-05,2fcbde131cff833751dde53da54da6da,1.325392e+09,39.060101,-121.969417,0.0,122.4591
3,2019-01-01 4:29,4.302475e+15,"fraud_Boehm, Predovic and Reinger",misc_pos,9.59,Daniel,Cain,M,384 Newman Forks Apt. 370,Belmond,...,-93.6200,3032.0,Community pharmacist,1964-08-08,bcae564f9a1a299dfbd1efd917499968,1.325392e+09,43.280848,-94.340312,0.0,75.5473
4,2019-01-01 4:32,3.772340e+14,fraud_Hudson-Ratke,grocery_pos,129.06,Theresa,Blackwell,F,43576 Kristina Islands,Shenandoah Junction,...,-77.8229,1925.0,Systems developer,1966-02-14,5e42e35c255edb55a192b3f1f0600936,1.325392e+09,39.211630,-78.409044,0.0,53.4864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181817,2019-12-15 13:48,4.958590e+18,fraud_Goyette-Gerhold,kids_pets,72.97,Aaron,Pena,M,793 Hooper Tunnel Suite 154,Burke,...,-77.2818,43102.0,Health visitor,1950-11-27,16cfc30968a4ffc36f44a8fa4d98224f,1.355579e+09,38.676963,-77.647315,0.0,34.0807
181818,2019-12-15 13:48,2.576710e+15,"fraud_Reichel, Bradtke and Blanda",travel,7.50,Joseph,Morgan,M,126 Underwood Drive,San Diego,...,-117.0690,1241364.0,Chartered public finance accountant,1959-08-05,9c68a7b61b6d9a1afd3db343f6943e0e,1.355579e+09,33.567015,-117.134451,0.0,62.6005
181819,2019-12-15 13:49,6.011680e+15,"fraud_Bradtke, Torp and Bahringer",personal_care,46.18,Jennifer,Gonzalez,F,5517 Stacy Land,Jelm,...,-106.0763,100.0,Public librarian,1974-04-16,e715a05b1aab3b771d6c22a0ef0e6fcd,1.355579e+09,40.456310,-106.856904,0.0,93.4800
181820,2019-12-15 13:49,4.102004e+18,fraud_Connelly-Carter,home,11.86,William,Fitzgerald,M,715 Courtney Pike Suite 932,Keller,...,-97.2489,95035.0,Probation officer,1987-06-13,df05203f1050a2621bd8c4e41a3e4d8b,1.355579e+09,32.266679,-97.380913,0.0,74.5243


In [17]:
# Checking Balance of Fraud to No Fraud
def checkBalance(newDf):
    distribution = newDf.groupby("fraud").fraud.count()
    print(distribution)
    print(distribution[0] / (distribution[0] + distribution[1]))
    print(distribution[1] / (distribution[0] + distribution[1]))

checkBalance(df)

fraud
0.0    912597
1.0     87403
Name: fraud, dtype: int64
0.912597
0.087403


In [18]:
# Oversampling and Undersampling Data

# Separate majority and minority classes
majority_class = df[df.fraud == 0]
minority_class = df[df.fraud == 1]

# Upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,     # sample with replacement
                              n_samples=len(majority_class),    # to match majority class size
                              random_state=123) # reproducible results

# Downsample Majority class
majority_downsampled = resample(majority_class,
                              replace=True,     # sample with replacement
                              n_samples=len(minority_class),    # to match majority class size
                              random_state=123) # reproducible results

# Combine Dataframes
oversampledDf = pd.concat([majority_class, minority_upsampled])
undersampledDf = pd.concat([minority_class, majority_downsampled])

#oversampledDf.fraud.sample(5), undersampledDf.fraud.sample(5)
oversampledDf.shape, undersampledDf.shape

((1825194, 8), (174806, 8))

In [19]:
# Creating X and Y Training Sets
X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(
                                                            oversampledDf.drop("fraud", axis=1),
                                                            oversampledDf.fraud,
                                                            test_size=0.2, 
                                                            random_state=0
                                                        )
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(
                                                            undersampledDf.drop("fraud", axis=1),
                                                            undersampledDf.fraud,
                                                            test_size=0.2, 
                                                            random_state=0
                                                        )

# 2. FEATURE SELECTION WITH RANDOM FOREST

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Checking for correlation and eliminating columns with above 0.75 correlation value through PCA


In [None]:
# Using Random Forest to Generate
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_over_train, y_over_train)
sel.get_support()
selected_feat= X_train.columns[(sel.get_support())]

# Features That Are Most Important
len(selected_feat)
print(selected_feat)