# credit card fraud detection analysis

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# load data

In [None]:
# Read the raw credit-card transactions CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="../input/creditcard.csv")

# DATA EXPLORATION

In [None]:
# Peek at the first five rows to see the column layout.
df.head(n=5)

In [None]:
# Print the per-column dtype / non-null summary of the frame.
df.info()

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# First five rows of just the label and the amount columns.
df.loc[:, ['Class', 'Amount']].head(n=5)

In [None]:
# Largest value found in the Amount column.
df.Amount.max()

In [None]:
# Show every column of the row whose index label is 2.
df.loc[2]

In [None]:
# Single-cell lookup: the Amount of the row labelled 5665.
df.at[5665, 'Amount']

In [None]:
# Label-based selection of two rows (123 and 45356) and two columns.
df.loc[[123,45356],['Amount','Class']]

In [None]:
# Class label(s) of the transaction(s) whose Amount equals 25691.16.
df.loc[df['Amount'] == 25691.16, 'Class']

In [None]:
# Time values of the rows labelled as fraud (Class == 1).
df.loc[df['Class'] == 1, 'Time']

In [None]:
# Summary statistics of Time, reported separately for each class.
print("Fraud")
print(df.loc[df['Class'] == 1, 'Time'].describe())
print()
print("Normal")
print(df.loc[df['Class'] == 0, 'Time'].describe())

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# How many rows fall into each class.
df.Class.value_counts()

In [None]:
# Total of the Amount column over all rows.
df.Amount.sum()

In [None]:
# Amount values of the rows labelled as fraud (Class == 1).
df.loc[df['Class'] == 1, 'Amount']

In [None]:
# Summary statistics of Amount, reported separately for each class.
print("Fraud")
print(df.loc[df['Class'] == 1, 'Amount'].describe())
print()
print("Normal")
print(df.loc[df['Class'] == 0, 'Amount'].describe())

In [None]:
# Two stacked histograms of Amount (fraud on top, normal below) sharing the
# x axis so the two distributions can be compared directly.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(14, 4))

bins = 10

ax1.hist(df.Amount[df.Class == 1], bins=bins)
ax1.set_title('Fraud')

ax2.hist(df.Amount[df.Class == 0], bins=bins)
ax2.set_title('Normal')

# Label and log-scale BOTH panels explicitly.  The pyplot helpers
# (plt.ylabel / plt.yscale) only act on the current axes, i.e. the last one
# created, which left the 'Fraud' panel unlabelled and on a linear scale.
for ax in (ax1, ax2):
    ax.set_ylabel('Number of Transactions')
    ax.set_yscale('log')

# The x axis is shared, so labelling the bottom panel is enough.
ax2.set_xlabel('Amount ($)')
plt.show()

In [None]:
# Class 0 marks a normal transaction, class 1 a fraudulent one.
Count_Normal_transacation = int((df["Class"] == 0).sum())
Count_Fraud_transacation = int((df["Class"] == 1).sum())
Count_Normal_transacation

In [None]:
# Display the number of fraud rows counted in the previous cell.
Count_Fraud_transacation

In [None]:
# Share of each class among all counted transactions (fractions in [0, 1];
# multiplied by 100 only for printing).
total_transactions = Count_Normal_transacation + Count_Fraud_transacation

Percentage_of_Normal_transacation = Count_Normal_transacation / total_transactions
print("percentage of normal transacation is", Percentage_of_Normal_transacation * 100)

Percentage_of_Fraud_transacation = Count_Fraud_transacation / total_transactions
print("percentage of fraud transacation", Percentage_of_Fraud_transacation * 100)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise Amount (zero mean, unit variance) into a new Amount_n column.
# StandardScaler wants a 2-D input, hence the single-column reshape.
amount_column = df['Amount'].values.reshape(-1, 1)
amount_scaler = StandardScaler()
df['Amount_n'] = amount_scaler.fit_transform(amount_column)

In [None]:
# Check that the Amount_n column was added.
df.head(n=5)

In [None]:
# Time divided by 3600 (seconds -> hours, assuming Time is in seconds).
df['Time_H'] = df['Time'].div(3600)

In [None]:
# Joint distribution of transaction time (hours) against the class label.
# The data vectors must be passed by keyword: current seaborn releases no
# longer accept them positionally.
sns.jointplot(x=df['Time_H'], y=df['Class'])

In [None]:
# Time_H was a helper column for plotting; remove it and preview the frame.
df.drop(columns=["Time_H"], inplace=True)
df.head(n=5)

In [None]:
# Drop the raw Time and Amount columns; the standardised Amount_n stays.
df.drop(columns=["Time", "Amount"], inplace=True)

In [None]:
# Preview the frame after the column drops.
df.head(n=5)

In [None]:
# Separate the feature columns (everything except Class) from the label.
# Both X and y are kept as DataFrames.
is_label_column = df.columns == 'Class'
X = df.loc[:, ~is_label_column]
y = df.loc[:, is_label_column]

In [None]:
# First five rows of the feature frame.
X.head(n=5)

In [None]:
# First five rows of the label frame.
y.head(n=5)

Random Undersampling aims to balance class distribution by randomly eliminating majority class examples.  This is done until the majority and minority class instances are balanced out.

Total Observations = 1000

Fraudulent   Observations =20

Non Fraudulent Observations = 980

Event Rate= 2 %

In this case we take a 10 % sample, without replacement, from the non-fraud instances and combine it with all of the fraud instances.

Non Fraudulent Observations after random under sampling = 10 % of 980 =98

Total Observations after combining them with Fraudulent observations = 20+98=118

Event Rate for the new dataset after under sampling = 20/118 = 17%

**Advantages**

- It can help improve run time and storage problems by reducing the number of training data samples when the training data set is huge.

**Disadvantages**

- It can discard potentially useful information which could be important for building rule classifiers.
- The sample chosen by random under sampling may be biased and may not accurately represent the population, which can lead to inaccurate results on the actual test data set.

# undersampling

In [None]:
# Random undersampling: keep every fraud row and draw an equally sized random
# sample (without replacement) of the normal rows.
fraud_indices = df[df.Class == 1].index
normal_indices = df[df.Class == 0].index
fraud_count = len(fraud_indices)

# Use a seeded generator so the sample -- and every metric computed from it --
# is the same after a kernel restart (the splits below also use seed 0).
undersample_rng = np.random.RandomState(0)
r_normal_indices = undersample_rng.choice(normal_indices, fraud_count, replace=False)

undersample_indices = np.concatenate([fraud_indices, r_normal_indices])
# These are index *labels*, so select with .loc.  .iloc is positional and only
# returned the same rows while the index was the default 0..n-1 range.
undersample_data = df.loc[undersample_indices, :]

# Same feature / label separation as for the full data set.
X_undersample = undersample_data.loc[:, undersample_data.columns != 'Class']
y_undersample = undersample_data.loc[:, undersample_data.columns == 'Class']


# train / test split (hold-out validation)

In [None]:
from sklearn.model_selection import train_test_split
# 70 / 30 hold-out split of the full data set ...
X_tr, X_test, y_tr, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# ... and of the balanced, undersampled data set (same ratio, same seed).
X_tr_u, X_test_u, y_tr_u, y_test_u = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)
                

# logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression trained on the undersampled training split.
logmodel = LogisticRegression()
# y_tr_u is a one-column DataFrame; scikit-learn expects 1-D labels and emits
# a DataConversionWarning for column vectors, so flatten it first.
logmodel.fit(X_tr_u, y_tr_u.values.ravel())

In [None]:
# Predict labels for the undersampled test split.
predictions = logmodel.predict(X_test_u)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_test_u,predictions))

In [None]:
# Confusion matrix: true labels first, predicted labels second.
print(confusion_matrix(y_test_u,predictions))

# decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so repeated runs build the same model, in line with the
# random_state=0 already used for the train/test splits in this notebook.
dtree = DecisionTreeClassifier(random_state=0)

In [None]:
# Train the decision tree on the undersampled training split.
dtree.fit(X_tr_u,y_tr_u)

In [None]:
# Predict labels for the undersampled test split with the decision tree.
predictions_dt = dtree.predict(X_test_u)

In [None]:
# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_test_u,predictions_dt))

In [None]:
# Confusion matrix for the decision tree (true labels first).
print(confusion_matrix(y_test_u,predictions_dt))

# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees on the undersampled training split.  The seed
# makes the bootstrap / feature sampling repeatable across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
# Flatten the one-column label frame; the forest warns when handed a
# column vector instead of 1-D labels.
rfc.fit(X_tr_u, y_tr_u.values.ravel())

In [None]:
# Predict labels for the undersampled test split with the random forest.
rfc_pred = rfc.predict(X_test_u)

In [None]:
# Confusion matrix for the random forest (true labels first).
print(confusion_matrix(y_test_u,rfc_pred))

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
print(classification_report(y_test_u,rfc_pred))

# SUPPORT VECTOR MACHINE

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with scikit-learn's default settings.
model_svm = SVC()

In [None]:
# Train on the undersampled split.  Labels are flattened to 1-D because
# scikit-learn warns (DataConversionWarning) on a one-column DataFrame.
model_svm.fit(X_tr_u, y_tr_u.values.ravel())

In [None]:
# Predict labels for the undersampled test split with the SVM.
predictions_svm = model_svm.predict(X_test_u)

In [None]:
# Confusion matrix for the SVM (true labels first).
print(confusion_matrix(y_test_u,predictions_svm))

In [None]:
# Per-class precision / recall / F1 for the SVM predictions.
print(classification_report(y_test_u,predictions_svm))

# OVERSAMPLING

Over-Sampling increases the number of instances in the minority class by randomly replicating them in order to present a higher representation of the minority class in the sample.

Total Observations = 1000

Fraudulent   Observations =20

Non Fraudulent Observations = 980

Event Rate= 2 %

In this case we replicate each of the 20 fraud observations 20 times.

Non Fraudulent Observations =980

Fraudulent Observations after replicating the minority class observations= 400

Total Observations in the new data set after oversampling=1380

Event Rate for the new data set after over sampling = 400/1380 = 29 %

**Advantages**

- Unlike under sampling, this method leads to no information loss.
- It often outperforms under sampling.

**Disadvantages**

- It increases the likelihood of overfitting since it replicates the minority class events.

In [None]:
# Row counts of the whole frame and of each class.
n_normal_rows = int((df["Class"] == 0).sum())
n_fraud_rows = int((df["Class"] == 1).sum())
print("length of training data", len(df))
print("length of normal data", n_normal_rows)
print("length of fraud  data", n_fraud_rows)

In [None]:
# ok Now we have a traing data
X_tr["Class"]= y_tr["Class"] # combining class with original data
data_train = X_tr.copy() # for naming conevntion
print("length of training data",len(data_train))
# Now make data set of normal transction from train data
normal_data = data_train[data_train["Class"]==0]
print("length of normal data",len(normal_data))
fraud_data = data_train[data_train["Class"]==1]
print("length of fraud data",len(fraud_data))

In [None]:
# Now start oversamoling of training data 
# means we will duplicate many times the value of fraud data
# Naive oversampling of the training data: the normal rows followed by 355
# copies of the fraud rows (the factor was hand-picked by the author from the
# number of fraud transactions).
#
# A single pd.concat replaces the original append-in-a-loop: DataFrame.append
# no longer exists in pandas 2.x, the loop copied the growing frame on every
# pass, and it overwrote normal_data so re-running the cell kept adding rows.
n_fraud_copies = 355
ovs_data = pd.concat([normal_data] + [fraud_data] * n_fraud_copies)

n_ovs_total = len(ovs_data)
n_ovs_normal = len(ovs_data[ovs_data["Class"]==0])
n_ovs_fraud = len(ovs_data[ovs_data["Class"]==1])
print("length of oversampled data is ",n_ovs_total)
print("Number of normal transcation in oversampled data",n_ovs_normal)
print("No.of fraud transcation",n_ovs_fraud)
print("Proportion of Normal data in oversampled data is ",n_ovs_normal/n_ovs_total)
print("Proportion of fraud data in oversampled data is ",n_ovs_fraud/n_ovs_total)

In [None]:
# First five rows of the oversampled training frame.
ovs_data.head(n=5)

In [None]:
# Separate features from the label in the oversampled frame.
ovs_is_label = ovs_data.columns == 'Class'
X_oversample = ovs_data.loc[:, ~ovs_is_label]
y_oversample = ovs_data.loc[:, ovs_is_label]


In [None]:
# Train on ALL oversampled rows and evaluate on the untouched hold-out split.
# ovs_data was built only from the training part of the original split and
# holds hundreds of exact copies of every fraud row, so re-splitting it puts
# identical rows on both sides and the test metrics no longer measure
# generalisation.  X_test / y_test never entered the oversampling.
X_tr_o, y_tr_o = X_oversample, y_oversample
X_test_o, y_test_o = X_test, y_test

In [None]:
# Number of rows in the oversampled feature frame.
print(len(X_oversample))

In [None]:
# Number of rows used to train the oversampled models.
print(len(X_tr_o))

In [None]:
# Logistic regression on the oversampled training data.
logmodel_ovs = LogisticRegression()
# Flatten the one-column label frame to 1-D; scikit-learn otherwise emits a
# DataConversionWarning.
logmodel_ovs.fit(X_tr_o, y_tr_o.values.ravel())
predictions_log_ovs = logmodel_ovs.predict(X_test_o)
print(classification_report(y_test_o,predictions_log_ovs))
print(confusion_matrix(y_test_o,predictions_log_ovs))

In [None]:
# Decision tree on the oversampled training data.  Seeded so repeated runs
# build the same tree, matching the random_state=0 used for the splits.
dtree_ovs = DecisionTreeClassifier(random_state=0)
dtree_ovs.fit(X_tr_o,y_tr_o)
predictions_dt_ovs = dtree_ovs.predict(X_test_o)
print(classification_report(y_test_o,predictions_dt_ovs))
print(confusion_matrix(y_test_o,predictions_dt_ovs))

In [None]:
# Random forest (100 trees) on the oversampled training data; seeded so the
# reported metrics are repeatable.
rfc_ovs = RandomForestClassifier(n_estimators=100, random_state=0)
# 1-D labels avoid the column-vector warning raised by the forest.
rfc_ovs.fit(X_tr_o, y_tr_o.values.ravel())
predictions_rfc_ovs = rfc_ovs.predict(X_test_o)
print(classification_report(y_test_o,predictions_rfc_ovs))
print(confusion_matrix(y_test_o,predictions_rfc_ovs))

In [None]:
from imblearn.over_sampling import SMOTE 
# Seeded SMOTE oversampler.
# NOTE(review): `oss` is not referenced by any later cell visible here; the
# resampling step uses the separately created `os` instance -- confirm whether
# this one is still needed.
oss = SMOTE(random_state=0)

In [None]:
# Keep a handle on the frame's column labels (this includes 'Class').
# NOTE(review): `columns` is reassigned to data_train_X.columns in a later
# cell before its only visible use.
columns = df.columns
#columns1 =y_tr.columns

In [None]:
# Display the stored column labels.
columns

In [None]:
# Display the frame's column labels.
df.columns

In [None]:
# Seeded SMOTE oversampler used by the resampling cell further down.
# NOTE(review): the name `os` would shadow the standard-library `os` module
# if that module were ever imported in this notebook.
os = SMOTE(random_state=0)

In [None]:
# Display the frame's column labels (same output as the earlier cell).
df.columns

In [None]:
# Fresh 70 / 30 split of the full data for the SMOTE experiment (same
# test_size and seed as the first split of X, y).
data_train_X, data_test_X, data_train_y, data_test_y = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Feature names, used to label the resampled frame.
columns = data_train_X.columns

In [None]:
# now use SMOTE to oversample our train data which have features data_train_X and labels in data_train_y
os_data_X,os_data_y=os.fit_sample(data_train_X,data_train_y)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=["Class"])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of normal transcation in oversampled data",len(os_data_y[os_data_y["Class"]==0]))
print("No.of fraud transcation",len(os_data_y[os_data_y["Class"]==1]))
print("Proportion of Normal data in oversampled data is ",len(os_data_y[os_data_y["Class"]==0])/len(os_data_X))
print("Proportion of fraud data in oversampled data is ",len(os_data_y[os_data_y["Class"]==1])/len(os_data_X))

In [None]:
# Random forest (100 trees) trained on the SMOTE-resampled training data;
# seeded so the reported metrics are repeatable.
rfc_smote = RandomForestClassifier(n_estimators=100, random_state=0)
# 1-D labels avoid the column-vector warning raised by the forest.
rfc_smote.fit(os_data_X, os_data_y.values.ravel())
# Evaluate on the hold-out split of the original data.  X_test / y_test come
# from the first split of X, y, which used the same test_size and seed as the
# split that fed SMOTE.
# NOTE(review): this reuses the name predictions_rfc_ovs from the replication
# experiment and overwrites those predictions.
predictions_rfc_ovs = rfc_smote.predict(X_test)
print(classification_report(y_test,predictions_rfc_ovs))
print(confusion_matrix(y_test,predictions_rfc_ovs))