# Importing the Required Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used in later
# cells -- confirm that blanket suppression is really intended.
warnings.filterwarnings('ignore')

# Importing the Dataset

In [None]:
# Read the transactions CSV into a DataFrame and preview its first rows.
csv_path = "Documents/DS Projects/creditcard.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print a structural summary of the frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

# Data Preprocessing

In [None]:
# Count the missing entries in each column.
data.isna().sum()

### There are no null values

In [None]:
# Count the rows that are exact repeats of an earlier row.
data.duplicated().sum()

In [None]:
# Drop repeated rows in place (the first occurrence of each is kept), then
# re-count the duplicates to confirm that none remain.
data.drop_duplicates(inplace = True)
data.duplicated().sum()

In [None]:
# Analyse the distribution of fraud and valid transactions: number of rows per
# value of the Class column.
data['Class'].value_counts()

In [None]:
# Pie chart of the share of each Class value, labelled with the class value
# and its percentage to two decimals.
class_counts = data['Class'].value_counts()
plt.pie(class_counts, labels=class_counts.index, autopct='%1.2f%%')
plt.title("Distribution of Transcations")
plt.show()

In [None]:
# Separate the rows by label: Class == 1 is treated as fraud, Class == 0 as valid.
fraud = data.loc[data['Class'] == 1]
valid = data.loc[data['Class'] == 0]

In [None]:
# Number of fraud rows.
print(fraud.shape[0])

In [None]:
# Descriptive statistics of the Amount column for the fraud rows.
fraud['Amount'].describe()

In [None]:
# Descriptive statistics of the Amount column for the valid rows.
valid['Amount'].describe()

In [None]:
# Report how many rows fall into each class.
n_fraud, n_valid = len(fraud), len(valid)
print("Fraud Cases: {}".format(n_fraud))
print("Valid Cases: {}".format(n_valid))

In [None]:
# Ratio of fraud rows to valid rows.
# NOTE(review): despite the name this is fraud/valid, not fraud/all rows --
# confirm which of the two any later consumer of this value expects.
outlier_fraction = fraud.shape[0] / valid.shape[0]
print(outlier_fraction)

In [None]:
# Heatmap of the pairwise column correlations on a 5x4 figure, with square
# cells, no per-cell annotations and the colour scale capped at 0.8.
fig = plt.figure(figsize=(5, 4))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, square=True, annot=False, vmax=0.8)
plt.show()

### The mean of the fraud transactions is comparatively high, and it can lead to overfitting of the model. Let's transform the data.

# Splitting the data

In [None]:
# Features: every column except the label. Target: the Class column.
X = data.drop(columns=['Class'])
Y = data['Class']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=Y keeps the fraud/valid proportions the same in both partitions;
# with a rare positive class a purely random split can leave the test set
# with an unrepresentative number of fraud rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42, stratify=Y
)

# Class balance of the training partition before any resampling.
Y_train.value_counts()



In [None]:
# Oversample the minority class with SMOTE -- on the training partition only.
# The test partition is deliberately left untouched: resampling it would add
# synthetic rows and an artificial class balance to the data the model is
# scored on, so the reported metrics would no longer reflect real transactions.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 0)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

# Class balance of the training partition after oversampling.
Y_train.value_counts()

# Model Training and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training data and predict the
# labels of the test rows.
# max_iter is raised above the library default of 100: the features are passed
# in without scaling and warnings are silenced at the top of the notebook, so
# a solver that stops before converging would otherwise go unnoticed.
LR_Model = LogisticRegression(max_iter=1000)
LR_Model.fit(X_train, Y_train)
Y_pred_LR = LR_Model.predict(X_test)


In [None]:
from sklearn.metrics import (
    confusion_matrix,
    precision_recall_curve,
    classification_report,
    precision_score,
    recall_score,
    accuracy_score,
)

# Score the logistic-regression predictions against the test labels.
accuracy_LR = accuracy_score(Y_test, Y_pred_LR)
cnf_matrix = confusion_matrix(Y_test, Y_pred_LR)
print("Accuracy of the model is", accuracy_LR*100)

# Draw the confusion matrix as an annotated heatmap with labelled axes.
cm_frame = pd.DataFrame(cnf_matrix)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# Per-class precision, recall and F1 report using readable class names.
labels = ['Non-fraud', 'Fraud']
print(classification_report(Y_test, Y_pred_LR, target_names=labels))
