#### 1. Loading The Train Dataset For Training

In [2]:
import pandas as pd

# Read the data
df = pd.read_csv('/content/drive/MyDrive/fraudTrain.csv')

# print shape of dataset with rows and columns
print(df.shape)

# print all the columns of the dataset
print(df.columns)

(1296675, 23)
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


#### 2. Checking Any Null Values

In [3]:
# checking any missing values present in the dataset or not

print(df.isnull().sum())

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


#### 3. Removing Unwanted Columns

In [4]:
# removing the unnecessary columns

df.drop(['Unnamed: 0', 'trans_date_trans_time', 'merchant', 'first', 'last', 'street', 'city', 'state', 'dob', 'trans_num', 'job'], axis=1, inplace=True)

#### 4. Applying Label Encdoing on Gender and Category Columns

In [5]:
# Apply label encoder to convert categorical data into numeric form

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['category'] = le.fit_transform(df['category'])

df['gender'] = le.fit_transform(df['gender'])

#### 5. Seeing First 5 rows after Data Cleaning

In [6]:
df.head()

Unnamed: 0,cc_num,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,2703186189652095,8,4.97,0,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,630423337322,4,107.23,0,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,38859492057661,0,220.11,1,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,3534093764340240,2,45.0,1,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,375534208663984,9,41.96,1,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


#### 6. Applying Feature Scaling using Standard Scaler

In [7]:
# applying standard scaling to all columns except gender and target column

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

df[['cc_num', 'category', 'amt','zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']] = sc.fit_transform(df[['cc_num', 'category', 'amt','zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']])

#### 7. Final Cleaned and Scaled Data

In [8]:
df.head()

Unnamed: 0,cc_num,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,-0.316692,0.452853,-0.407826,0,-0.749136,-0.48442,0.65762,-0.282589,-1.858664,-0.494354,0.593864,0
1,-0.318757,-0.569266,0.230039,0,1.872567,2.03912,-2.03387,-0.29367,-1.858662,2.078699,-2.030341,0
2,-0.318728,-1.591384,0.934149,1,1.281042,0.717754,-1.601537,-0.280406,-1.858662,0.902849,-1.592323,0
3,-0.316058,-1.080325,-0.158132,1,0.402753,1.515617,-1.590766,-0.287742,-1.85866,1.662886,-1.621848,0
4,-0.318471,0.708382,-0.177094,1,-0.90609,-0.023035,0.782279,-0.293835,-1.858651,0.026941,0.841909,0


#### 8. Oversampling The Data

In [9]:
# Over sampling the data using SMOTE

normal = df[df['is_fraud']==0]
fraud = df[df['is_fraud']==1]

print(normal.shape)
print(fraud.shape)

normal_sample = normal.sample(n=fraud.shape[0], ignore_index=True)

print(normal_sample.shape)

new_data = pd.concat([normal_sample, fraud])

(1289169, 12)
(7506, 12)
(7506, 12)


In [10]:
new_data['is_fraud'].value_counts()

0    7506
1    7506
Name: is_fraud, dtype: int64

In [11]:
new_data.head()

Unnamed: 0,cc_num,category,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,-0.314165,-1.080325,0.112771,1,0.853796,-1.024039,-0.182997,-0.285881,-0.073753,-1.194569,-0.116049,0
1,-0.318735,-0.569266,0.26522,1,-0.360227,-1.110429,-0.058308,-0.292193,-1.783526,-1.183981,-0.038147,0
2,-0.31862,-1.591384,-0.431841,1,-0.152405,0.44625,0.643389,0.344356,-1.80459,0.577531,0.687386,0
3,-0.316009,-0.569266,0.784257,0,-1.379183,0.420678,1.213006,-0.179922,-0.228155,0.231973,1.250028,0
4,-0.314165,1.474971,0.36995,0,-0.169845,0.532778,0.616396,-0.268842,0.037544,0.458976,0.599322,0


In [12]:
print(normal.shape)
print(fraud.shape)

(1289169, 12)
(7506, 12)


#### 9. Spliting the Training Data Into Training Part and Testing Part

In [13]:
from sklearn.model_selection import train_test_split

X = new_data.drop('is_fraud', axis=1)

y = new_data['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape)

(10508, 11) (4504, 11)


#### 10. Model Training

1. Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

lr = LogisticRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# print accuracy score
print(accuracy_score(y_test, y_pred))

# print precision score
print(precision_score(y_test, y_pred))

0.8534635879218473
0.9398373983739837


2. Decision Tree Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

# print accuracy score
print(accuracy_score(y_test, y_pred))

# print precision score
print(precision_score(y_test, y_pred))

0.9535968028419183
0.9540280210157618


3. Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# print accuracy score
print(accuracy_score(y_test, y_pred))

# print precision score
print(precision_score(y_test, y_pred))

0.9560390763765542
0.9689608636977058


#### Testing Data Loading

In [17]:
df_test = pd.read_csv('/content/drive/MyDrive/fraudTest.csv')


# shape of dataset with rows and columns
print(df_test.shape)

(555719, 23)


#### 1. Data Cleaning and Preprocessing Function

In [18]:
# creating  function for data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def preprocess_inputs(df):

    # removing unnecessary columns

    df.drop(['Unnamed: 0', 'trans_date_trans_time', 'merchant', 'first', 'last', 'street', 'city', 'state', 'dob', 'trans_num', 'job'], axis=1, inplace=True)

    # applying labael encoding to categorical columns

    le = LabelEncoder()

    df['category'] = le.fit_transform(df['category'])

    df['gender'] = le.fit_transform(df['gender'])


    # applying standard scaling to all columns except gender and target column

    sc = StandardScaler()

    df[['cc_num', 'category', 'amt','zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']] = sc.fit_transform(df[['cc_num', 'category', 'amt','zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']])

#### 2. Applying Data Preproceeing To Test Data

In [19]:
# calling the function

preprocess_inputs(df_test)

#### 3. Separating Features and Target Columns

In [20]:
# separating targets and features

y_test = df_test['is_fraud']

X_test = df_test.drop('is_fraud', axis=1)

#### 4. Predicting the Using Best Classifier Model and Printing Accuracy Score and Precision Score

In [21]:
# predicting the values
y_pred = rf.predict(X_test)

# print accuracy score
print(accuracy_score(y_test, y_pred))

0.9674583737464438
