### 1. Load the CSV Data File into a Pandas DataFrame

**a. import Pandas**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
import joblib

**b. Read in the .csv file with pandas read_csv**

In [2]:
data = pd.read_csv('../dataset/credit_card.csv')
data.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:18:07,343464013864032,fraud_Kunde-Sanford,misc_net,2.55,Jodi,Foster,F,551 Zachary Freeway,Bailey,...,35.8072,-78.0892,6629,Call centre manager,1962-08-13,abe0676c18f34fa305b113fa762e29b5,1325377087,36.74987,-78.67787,0
2,2019-01-01 00:37:18,4265776278887457,fraud_Pacocha-O'Reilly,grocery_pos,103.0,Christine,Best,F,68248 Deanna Land,Enola,...,35.2087,-92.2123,969,"Physicist, medical",1954-01-05,289789e617eece42a9cafe36680b1b69,1325378238,35.034285,-92.34301,0
3,2019-01-01 00:56:59,571465035400,"fraud_Reichert, Huels and Hoppe",shopping_net,113.4,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,...,43.0048,-108.8964,1645,Freight forwarder,1976-02-26,9d660a18154a3ba5fe869f373c6f819c,1325379419,42.868965,-108.50335,0
4,2019-01-01 01:16:52,6593250708747804,fraud_Brekke and Sons,gas_transport,55.18,Melissa,Meza,F,244 Abbott Parkway,Loxahatchee,...,26.7383,-80.276,26551,Paramedic,1977-01-04,3c6158c556727d527f8b51cc03b30236,1325380612,27.346033,-80.475563,0


In [3]:
data.shape

(59073, 22)

### 2. Find Statistical Properties of the DataFrame and Check NaN Values

**a. Create the statistical summaries:**



In [4]:
data.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,59073.0,59073.0,59073.0,59073.0,59073.0,59073.0,59073.0,59073.0,59073.0,59073.0
mean,4.155063e+17,125.809682,48936.650805,38.535668,-90.300925,89993.55,1349139000.0,38.53065,-90.301737,0.127063
std,1.305091e+18,245.649462,26897.427002,5.082398,13.872055,305684.4,12969020.0,5.118062,13.884615,0.333046
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.038876,-166.620488,0.0
25%,180046200000000.0,11.51,26292.0,34.7031,-96.798,741.0,1338510000.0,34.795189,-96.904179,0.0
50%,3523898000000000.0,52.52,48088.0,39.3199,-87.4569,2457.0,1349187000.0,39.333993,-87.37885,0.0
75%,4642255000000000.0,101.05,72042.0,41.8467,-80.1752,20328.0,1359542000.0,41.918596,-80.285566,0.0
max,4.992346e+18,14238.11,99783.0,66.6933,-67.9503,2906700.0,1371816000.0,67.510267,-66.961923,1.0


**b. check all columns for null values**

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59073 entries, 0 to 59072
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trans_date_trans_time  59073 non-null  object 
 1   cc_num                 59073 non-null  int64  
 2   merchant               59073 non-null  object 
 3   category               59073 non-null  object 
 4   amt                    59073 non-null  float64
 5   first                  59073 non-null  object 
 6   last                   59073 non-null  object 
 7   gender                 59073 non-null  object 
 8   street                 59073 non-null  object 
 9   city                   59073 non-null  object 
 10  state                  59073 non-null  object 
 11  zip                    59073 non-null  int64  
 12  lat                    59073 non-null  float64
 13  long                   59073 non-null  float64
 14  city_pop               59073 non-null  int64  
 15  jo

### 3. Check Average Values Spend Per Fraud

a. We can use groupby() to separate based on the **is_fraud** column categories and then calculate the mean and median values:

In [6]:
print("Median Spend per Fraud Category:")
print(data.groupby("is_fraud")["amt"].median())
print('\n')
print("Mean Spend per Fraud Category:")
print(data.groupby("is_fraud")["amt"].mean())

Median Spend per Fraud Category:
is_fraud
0     47.240
1    396.505
Name: amt, dtype: float64


Mean Spend per Fraud Category:
is_fraud
0     66.784314
1    531.320092
Name: amt, dtype: float64


### 4. Calculate Total Fraud and Non-fraud Spend on a Specific Credit Card

**1. Filter and caculate sum on legitimate purchases:**

In [7]:
data[(data["cc_num"] == 344709867813900) & (data["is_fraud"] == 0)]["amt"].sum()

11667.49

**2. Filter and calculate on fraud purchases:**

In [15]:
data[(data["cc_num"] == 344709867813900) & (data["is_fraud"] == 1)]["amt"].sum()

6450.280000000002

In [16]:
## Alternatively, we can use groupby to do both operations with a single line:
data[(data["cc_num"] == 344709867813900)].groupby('is_fraud').sum()["amt"]

  data[(data["cc_num"] == 344709867813900)].groupby('is_fraud').sum()["amt"]


is_fraud
0    11667.49
1     6450.28
Name: amt, dtype: float64

**5. Clean Data Columns for further anaysis:**
- Drop some columns whcih do not actually hold information relevant to the transaction being fraudulent.

In [8]:
## Drop the columns date portion of the timestamp
data = data.drop(["unix_time", "trans_num"], axis=1)


Create some new features to indicate a specific category for time of day (to tell if the transaction occurred within a specific block of time in a day)
To do this, clean the transaction time column (trans_date_trans_time) by binning the trans_date_trans_time column into 4 categories:

- Category 1: 00:00:00 to 05:59:59

- Category 2: 06:00:00 to 11:59:59

- Category 3: 12:00:00 to 17:59:59

- Category 4: 8:00:00 to 23:59:59

In [9]:
## Create a function to bin timestamps into categories
data["trans_date_trans_time"] = data["trans_date_trans_time"].apply(lambda x: x.split(" ")[1])



In [10]:
## Create a function to bin timestamps into categories
def quantitize(string):
    time_hour = int(string[:2])
    if time_hour < 6:
        return 0
    elif 6 <= time_hour < 12:
        return 1
    elif 12 <= time_hour <18:
        return 2
    else:
        return 3


In [11]:
## 
data["trans_date_trans_time"] = data["trans_date_trans_time"].apply(quantitize)


In [12]:
## Confirm with a value count and Check Final results should look like this:
data["trans_date_trans_time"].value_counts()


trans_date_trans_time
3    19819
2    16133
0    12714
1    10407
Name: count, dtype: int64

In [13]:
data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,merch_lat,merch_long,is_fraud
0,0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,36.011293,-82.048315,0
1,0,343464013864032,fraud_Kunde-Sanford,misc_net,2.55,Jodi,Foster,F,551 Zachary Freeway,Bailey,NC,27807,35.8072,-78.0892,6629,Call centre manager,1962-08-13,36.749870,-78.677870,0
2,0,4265776278887457,fraud_Pacocha-O'Reilly,grocery_pos,103.00,Christine,Best,F,68248 Deanna Land,Enola,AR,72047,35.2087,-92.2123,969,"Physicist, medical",1954-01-05,35.034285,-92.343010,0
3,0,571465035400,"fraud_Reichert, Huels and Hoppe",shopping_net,113.40,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Freight forwarder,1976-02-26,42.868965,-108.503350,0
4,0,6593250708747804,fraud_Brekke and Sons,gas_transport,55.18,Melissa,Meza,F,244 Abbott Parkway,Loxahatchee,FL,33470,26.7383,-80.2760,26551,Paramedic,1977-01-04,27.346033,-80.475563,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59068,0,3524574586339330,fraud_Kassulke PLC,shopping_net,977.01,Ashley,Cabrera,F,94225 Smith Springs Apt. 617,Vero Beach,FL,32960,27.6330,-80.4031,105638,"Librarian, public",1986-05-07,26.888686,-80.834389,1
59069,0,3524574586339330,fraud_Schumm PLC,shopping_net,1210.91,Ashley,Cabrera,F,94225 Smith Springs Apt. 617,Vero Beach,FL,32960,27.6330,-80.4031,105638,"Librarian, public",1986-05-07,28.216707,-79.855648,1
59070,0,4005676619255478,"fraud_Tillman, Dickinson and Labadie",gas_transport,10.24,William,Perry,M,458 Phillips Island Apt. 768,Denham Springs,LA,70726,30.4590,-90.9027,71335,Herbalist,1994-05-31,29.700456,-91.361632,1
59071,0,3560725013359375,fraud_Corwin-Collins,gas_transport,21.69,Brooke,Smith,F,63542 Luna Brook Apt. 012,Notrees,TX,79759,31.8599,-102.7413,23,Cytogeneticist,1969-09-15,32.675272,-103.484949,1


#### 6. Create a Visual Correlation Matrix of Data Features

- Use seaborn to create a visualization heatmap matrix of the correlation between all the features and the label column. 

In [14]:
plt.figure(figsize=(15,10),dpi=150)
sns.heatmap(data.corr(numeric_only=True),vmin=0,vmax=1,cmap="viridis")

ValueError: could not convert string to float: 'fraud_Rippin, Kub and Mann'

<Figure size 2250x1500 with 0 Axes>

**Note:** We can notice that how the amount spent, and long and lat location information are by far the most correlated features to fraud.

#### 7. Encode Categorical Data Features

- Using the Pandas DataFrame and Scikit-Learn, we use Label Encoding to encode the categorical features in the DataFrame.

In [28]:
## create the encoder
encoder = LabelEncoder()

In [29]:
### Get the categorical features with Pandas

categorical_features = data.select_dtypes(include=['object']).columns

In [30]:
### Apply fit_transform to create the encoded category data columns
data_encoded = data.copy()
data_encoded[categorical_features] = data_encoded[categorical_features].apply(encoder.fit_transform) 


In [31]:
# Display dataframe
data_encoded

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,merch_lat,merch_long,is_fraud
0,0,2703186189652095,514,8,4.97,162,18,0,568,526,27,28654,36.0788,-81.1781,3495,370,779,36.011293,-82.048315,0
1,0,343464013864032,351,8,2.55,171,138,0,552,45,27,27807,35.8072,-78.0892,6629,59,317,36.749870,-78.677870,0
2,0,4265776278887457,468,4,103.00,73,30,0,681,250,2,72047,35.2087,-92.2123,969,337,182,35.034285,-92.343010,0
3,0,571465035400,502,11,113.40,222,127,1,464,272,50,82514,43.0048,-108.8964,1645,202,572,42.868965,-108.503350,0
4,0,6593250708747804,77,2,55.18,239,285,0,236,463,9,33470,26.7383,-80.2760,26551,327,589,27.346033,-80.475563,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59068,0,3524574586339330,295,11,977.01,27,54,0,928,829,9,32960,27.6330,-80.4031,105638,271,741,26.888686,-80.834389,1
59069,0,3524574586339330,571,11,1210.91,27,54,0,928,829,9,32960,27.6330,-80.4031,105638,271,741,28.216707,-79.855648,1
59070,0,4005676619255478,622,2,10.24,348,339,1,465,209,18,70726,30.4590,-90.9027,71335,222,880,29.700456,-91.361632,1
59071,0,3560725013359375,107,2,21.69,51,404,0,640,592,43,79759,31.8599,-102.7413,23,115,446,32.675272,-103.484949,1


#### 8. Perform a Train-Test Split on the Data and make it ready for training

- Use a test size of 10% of the total data set.

In [None]:
from sklearn.model_selection import train_test_split

In [32]:
### Separate Features and Label
data_encoded, labels = data_encoded.drop("is_fraud", axis=1), data_encoded["is_fraud"]


In [33]:

### Perform the split

X_train, X_test, y_train, y_test = train_test_split(data_encoded, labels, test_size = 0.1, random_state = 42)


In [39]:
print("X_train:", X_train.shape) 
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (53165, 19)
X_test: (5908, 19)
y_train: (53165,)
y_test: (5908,)


#### 9. Train a Random Forest Classifier Model

- Using Scikit-Learn create and train a random forest classifier on the training data set

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
## Create the instance of the model (you can also edit hyperparameters further)

classifier = RandomForestClassifier(class_weight='balanced')

In [42]:
## Fit the classifier to the training data set

classifier.fit(X_train, y_train)

In [49]:
joblib.dump(classifier, "../models")

['../models']

#### 10. Evaluate the Model on the Test Set
- Using the trained Random Forest classifier, evaluate performance of the model on the test set using accuracy as the performance metric. 
- Also, create a confusion matrix on the results of the test data set.

In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [44]:
## Create the predictions

preds = classifier.predict(X_test)



In [45]:
## Calculate the accuracy

accuracy_score(preds, y_test)


0.9808733920108328

In [46]:
## Create the confusion matrix

confusion_matrix(y_test, preds)


array([[5113,   18],
       [  95,  682]], dtype=int64)