# Predicting Caegories with K-Nearest Neighbors

### Preparing a dataset for machine learning with scikit-learn

### - The goal of this dataset is to predict whether a mobile transaction is fraulent 

In [1]:
# Importing the dataset using pandas

import pandas as pd

df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

# Viewing the first 5 rows

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### - Dropping features that are redundant

In [2]:
# Dropping the redundant features

df = df.drop(["nameOrig","nameDest","isFlaggedFraud"],axis = 1)

### - Reducing the size of the data

In [4]:
# reducing the size of the dataset to 20,000 rows


#Storing the fradulent data into a dataframe
df_fraud = df[df["isFraud"] == 1 ]


#Storing the non-fraudulent data into a dataframe
df_nofraud = df[df["isFraud"] == 0]


#Storing 12,000 rows of non-fraudulent data
df_nofraud = df_nofraud.head(12000)


#Joining both datasets together
df = pd.concat([df_fraud,df_nofraud],axis = 0)

### - Encoding the categorical variables

In [5]:
#Converting the type column into a number CASH-IN = 0, CASH-OUT = 1 

#importing the correct packages

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


#Converting the type column to categorical

df["type"] = df["type"].astype("category")


#Integer Encoding the type column

type_encode = LabelEncoder()

df["type"] = type_encode.fit_transform(df.type)

In [6]:
# One hot encoding the type column

type_one_hot = OneHotEncoder()

type_one_hot_encode = type_one_hot.fit_transform(df.type.values.reshape(-1,1)).toarray()

#Adding the one hot encoded variables to the dataset


ohe_variable = pd.DataFrame(type_one_hot_encode,columns= ["type_"+str(int(i)) for i in range(type_one_hot_encode.shape[1])])

df = pd.concat([df,ohe_variable],axis=1)

#Dropping the original type variable

df = df.drop("type",axis=1)

In [7]:
#Viewing the new dataframe after one-hot-encoding
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_0,type_1,type_2,type_3,type_4
0,1.0,9839.64,170136.0,160296.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1864.28,21249.0,19384.72,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,181.0,181.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,181.0,181.0,0.0,21182.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,11668.14,41554.0,29885.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### - Missing Values

In [9]:
#Checking every column for missing values

df.isnull().any()

step              True
amount            True
oldbalanceOrg     True
newbalanceOrig    True
oldbalanceDest    True
newbalanceDest    True
isFraud           True
type_0            True
type_1            True
type_2            True
type_3            True
type_4            True
dtype: bool

In [10]:
# Imputing the missing values with a 0

df = df.fillna(0)

In [11]:
# Saving a exporting our newly created dataset as a csv file

df.to_csv("fraud_prediction.csv")

### - Splitting the data into training and test sets

In [12]:
#Storing the target and features as separate variables

#Creating the features as a numpy array

features = df.drop("isFraud",axis = 1).values

#Creating the target as a numpy array

target = df["isFraud"].values

In [13]:
#Splitting the features and target into training and test sets

from sklearn.model_selection import train_test_split

X_train , X_test, y_train, y_test = train_test_split(features, target,test_size= 0.3,random_state =42,stratify=target)

### - Implementation and evaluation of the model

In [14]:
#Using the K-NN algo on the training sets and evaluate its score on the test sets

from sklearn.neighbors import KNeighborsClassifier

# Initializing the kNN classifier with 3 neighbors

knn_classifier = KNeighborsClassifier(n_neighbors=3)

#Fitting the classifier on the training data

knn_classifier.fit(X_train,y_train)

#Extracting the accuracy score from the test sets

knn_classifier.score(X_test,y_test)

0.9830667920978363

### - Fine-tuning the parameters of the k-NN algorithm

In [17]:
# Using GridSearchCV algo to find the optimal number of neighbors 

import numpy as np 

from sklearn.model_selection import GridSearchCV

#Initializing a grid with possible number of neighbors from 1 to 24


grid = {"n_neighbors": np.arange(1,25)}

#Initializing a k-NN classifier

knn_classifier = KNeighborsClassifier()


#Using cross validation to find optimal number of neighbors

knn = GridSearchCV(knn_classifier, grid,cv=10)

knn.fit(X_train,y_train)

#Extracting the optimal number of neighbors


print(knn.best_params_)


#Extracting the accuracy score for optimal number of neighbors


knn.best_score_

{'n_neighbors': 1}


0.9850814323149428

### - Scaling for optimized performance

In [19]:
# We want all the features to have the same range of values so that the distance metric is on level terms across all features
#One way is to subtract each value of each feature by the mean of that feature and divide by the variance of that feature.
# This process is called standardization

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


#Setting up the scaling pipeline

pipeline_order = [("scaler",StandardScaler()), ("knn",KNeighborsClassifier(n_neighbors=1) )]

pipeline = Pipeline(pipeline_order)

#Fitting the classfier to the scaled dataset

knn_classifier_scaled = pipeline.fit(X_train,y_train)

#Extracting the score

knn_classifier_scaled.score(X_test,y_test)

0.9960018814675446