# <CENTER><U> 1. IMPORTING BASIC NECESSARY PACKAGES</U></CENTER>

In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix
from __future__ import division
from sklearn.cluster import KMeans
# Implement learning algorithms

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

## <CENTER><U> 2. READING INPUT DATA </U></CENTER>

### <B><U>Reading from File 1:</U></B>

In [3]:
# Load the training split. The 'Unnamed: 0' column (presumably a row index
# saved into the CSV — verify against the file) is discarded on load.
fraudtrain = pd.read_csv('fraudTrain.csv').drop(columns='Unnamed: 0')
# Non-null count per column, displayed as the cell output.
fraudtrain.count()

trans_date_trans_time    1296675
cc_num                   1296675
merchant                 1296675
category                 1296675
amt                      1296675
first                    1296675
last                     1296675
gender                   1296675
street                   1296675
city                     1296675
state                    1296675
zip                      1296675
lat                      1296675
long                     1296675
city_pop                 1296675
job                      1296675
dob                      1296675
trans_num                1296675
unix_time                1296675
merch_lat                1296675
merch_long               1296675
is_fraud                 1296675
dtype: int64

### <B><U>Reading from File 2:</U></B>

In [4]:
# Load the test split. The 'Unnamed: 0' column (presumably a row index
# saved into the CSV — verify against the file) is discarded on load.
fraudtest = pd.read_csv('fraudTest.csv').drop(columns='Unnamed: 0')
# Non-null count per column, displayed as the cell output.
fraudtest.count()

trans_date_trans_time    555719
cc_num                   555719
merchant                 555719
category                 555719
amt                      555719
first                    555719
last                     555719
gender                   555719
street                   555719
city                     555719
state                    555719
zip                      555719
lat                      555719
long                     555719
city_pop                 555719
job                      555719
dob                      555719
trans_num                555719
unix_time                555719
merch_lat                555719
merch_long               555719
is_fraud                 555719
dtype: int64

In [5]:
# Display (rows, columns) of the test split.
fraudtest.shape

(555719, 22)

In [6]:
# Display (rows, columns) of the training split.
fraudtrain.shape

(1296675, 22)

### <u> Combine both the dataframes into single dataframe</u>

In [7]:
# Stack the training and test splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# split keeps its own row labels, so labels repeat across the two halves and
# any later label-based filtering or lookup would treat distinct rows as the
# same row.
frames = [fraudtrain, fraudtest]

combined_data = pd.concat(frames, ignore_index=True)

In [8]:
# Display (rows, columns) of the combined frame.
combined_data.shape

(1852394, 22)

In [None]:
# Leftover scratch cell: emits a single blank line. Its execution count is
# empty (In [None]), i.e. it was not run in the saved session — TODO remove.
print()

### <B><u>Shuffling the dataset and random sampling</u></B>

In [9]:
# Shuffle every row (frac=1 returns all rows in random order).
# A fixed random_state makes the ordering reproducible after a kernel restart.
combined_shuffled = combined_data.sample(frac=1, random_state=42)

In [10]:
# Keep only the first row encountered for each index label.
# NOTE(review): this filters on index labels, not on row contents — if a
# label occurs more than once in the frame, the later rows carrying that
# label are discarded even when they hold different data.
is_first_label = ~combined_shuffled.index.duplicated(keep='first')
combined_shuffled = combined_shuffled[is_first_label]

In [11]:
# Draw a 300,000-row random sample to work with.
# Seeded so the same rows are selected on every run; sample() raises
# ValueError if fewer than n rows are available.
fraud_data = combined_shuffled.sample(n=300000, random_state=42)

In [12]:
# Display (rows, columns) of the sampled frame.
fraud_data.shape

(300000, 22)

In [13]:
# Display the dtype pandas inferred for each column.
fraud_data.dtypes

trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object


## <CENTER><U> 3. DATA TRANSFORMATIONS</U></CENTER>

### <B><u>Handling nulls:</u></B>

In [14]:
# Count missing (NaN/None) values in each column of the sample.
fraud_data.isna().sum()

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

Since none of the fields have any null values, we shall proceed to the next step, which is the duplicate check.

### <b><u>Handling Duplicates:</u></b>

In [15]:
# Count rows that exactly repeat an earlier row across all columns,
# then report the total.
duplicate_row_count = fraud_data.duplicated().sum()
print(f"There are {duplicate_row_count} duplicated rows")

There are 0 duplicated rows


Since the dataset doesn't have any duplicated rows, no rows will be dropped.

In [16]:
# Preview the first five rows of the sample.
fraud_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
518672,12/23/2020 8:25,4470000000000000.0,fraud_Kilback LLC,grocery_pos,167.42,Matthew,Russell,M,168 Michael Coves Suite 343,June Lake,...,37.7773,-119.0825,633,Health service manager,9/9/1927,8c1e3a9480e34876c75af5b1b533e53e,1387787153,38.492626,-118.677235,0
1103482,2020-04-04 11:57:59,2233883000000000.0,fraud_Huels-Hahn,gas_transport,46.91,Jamie,Robinson,F,67089 Caitlin Meadow Apt. 905,Sturgis,...,33.357,-89.0473,1923,Medical physicist,1960-01-16,42f7a414a3fea93f52df911719625a00,1365076679,33.193352,-90.017058,0
777216,2019-11-29 10:12:42,675909900000.0,"fraud_Watsica, Haag and Considine",shopping_pos,8.46,Christopher,Henry,M,1198 Robert Stravenue Apt. 479,Armonk,...,41.136,-73.7009,7987,Television/film/video producer,1964-03-16,3070934ee3fb99b3bcd65be2f7dda261,1354183962,41.49308,-74.290518,0
698728,2019-10-25 18:35:11,377026700000000.0,fraud_Gerhold LLC,home,112.71,Jackie,Davis,F,1898 Parker Fork Apt. 057,Redford,...,37.3272,-91.0243,241,Investment analyst,1974-10-27,430986c6f1a78c45b42cef37c35b5dfe,1351190111,36.342555,-91.407343,0
945306,2020-01-12 22:52:03,4585133000000.0,"fraud_Durgan, Gislason and Spencer",home,56.41,Karen,Gordon,F,543 Ware Path Apt. 593,Utica,...,39.7417,-93.6289,271,Land/geomatics surveyor,1972-04-18,f3c652aa1f6bf58e2109e29631569d66,1358031123,39.435902,-93.064931,0


In [17]:
# Non-null count per column of the sample.
fraud_data.count()

trans_date_trans_time    300000
cc_num                   300000
merchant                 300000
category                 300000
amt                      300000
first                    300000
last                     300000
gender                   300000
street                   300000
city                     300000
state                    300000
zip                      300000
lat                      300000
long                     300000
city_pop                 300000
job                      300000
dob                      300000
trans_num                300000
unix_time                300000
merch_lat                300000
merch_long               300000
is_fraud                 300000
dtype: int64

#### <B><U>EXPORT TO CSV:</U></B>

In [18]:
# Persist the sample for later use.
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
sample_path = Path('Data_Processing/FraudData_RandomSample.csv')
sample_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is still written as the first column (to_csv default), so the
# file layout is unchanged for anything that reads it back.
fraud_data.to_csv(sample_path)