In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; this import fails on newer versions and should be dropped
# once no cell calls it any more.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [None]:
df = pd.read_csv('../Resources/card_transdata.csv')
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
fraud_cases=len(df[df['fraud']==1])

In [None]:
print(' Number of Fraud Cases:',fraud_cases)

In [None]:
non_fraud_cases=len(df[df['fraud']==0])

In [None]:
print('Number of Non Fraud Cases:',non_fraud_cases)

In [None]:
fraud=df[df['fraud']==1]

In [None]:
Nonfraud=df[df['fraud']==0]

# Continuous Variables

In [None]:
conti_variable = df[['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price']]
conti_variable.head()

In [None]:
conti_variable.hist(bins=60)

In [None]:
conti_variable.boxplot()

In [None]:
def removeOutliers(data, col):
    """Drop the rows of ``data`` whose ``col`` value is an IQR outlier.

    A value is kept when it lies strictly between ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``data[col]``. The IQR is printed for reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that are inside the fences.

    Side effects
    ------------
    For backward compatibility the module-level names ``filtered_data``
    (the returned frame) and ``outlier_free_list`` (the kept ``col`` values
    as a list) are still assigned.
    """
    global outlier_free_list
    global filtered_data

    Q1, Q3 = np.quantile(data[col], [0.25, 0.75])
    IQR = Q3 - Q1

    print("IQR value for column %s is: %s" % (col, IQR))

    lower_range = Q1 - 1.5 * IQR
    upper_range = Q3 + 1.5 * IQR

    # One vectorised boolean mask replaces the Python-level list comprehension
    # followed by an `isin` lookup; it selects exactly the same rows.
    in_range = (data[col] > lower_range) & (data[col] < upper_range)
    filtered_data = data.loc[in_range]
    outlier_free_list = filtered_data[col].tolist()
    return filtered_data

In [None]:
# Flag IQR outliers: for each feature, `<feature>_outlier` is 1 when the value
# falls below Q1 - 1.5*IQR or above Q3 + 1.5*IQR, else 0.
# Only columns that are not already flag columns are scanned, so re-running
# the cell does not produce `*_outlier_outlier` columns.
feature_cols = [c for c in conti_variable.columns if not c.endswith('_outlier')]
outlier_flags = {}
for x in feature_cols:
    q25, q75 = conti_variable[x].quantile(0.25), conti_variable[x].quantile(0.75)
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off

    # The previous test was `(value < lower) & (value < upper)`, which can
    # never flag a value above the upper fence.
    outlier_flags[x + '_outlier'] = np.where(
        (conti_variable[x] < lower) | (conti_variable[x] > upper), 1, 0)

# assign() builds a new frame instead of writing into what may be a slice of
# `df`, and overwrites existing flag columns when the cell is re-run.
conti_variable = conti_variable.assign(**outlier_flags)

conti_variable.head()

# Unbalanced Data

In [None]:
sns.countplot(x='fraud',data=df)

In [None]:
sm = SMOTE(sampling_strategy='minority', random_state=7)
resampled_X, resampled_Y = sm.fit_resample(df.drop('fraud', axis=1), df['fraud'])
oversampled_df = pd.concat([pd.DataFrame(resampled_X), pd.DataFrame(resampled_Y)], axis=1)
oversampled_df.columns = df.columns
oversampled_df['fraud'].value_counts()

In [None]:
oversampled_df['fraud'].value_counts()

In [None]:
df['fraud'].value_counts()

# Balanced Data

In [None]:
sns.countplot(x='fraud', data=oversampled_df)

In [None]:
oversampled_df

In [None]:
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(24,20))

# Entire DataFrame
corr = df.corr()
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax1)
ax1.set_title("Imbalanced Correlation Matrix \n (don't use for reference)", fontsize=14)


sub_sample_corr = oversampled_df.corr()
sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax2)
ax2.set_title('SubSample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
x = oversampled_df.drop(['fraud'],axis=1)

In [None]:
y = oversampled_df['fraud']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
logistic = LogisticRegression(max_iter=1000)

In [None]:
model = logistic.fit(x_train, y_train)

In [None]:
prediction = model.predict(x_test)

In [None]:
accuracy_score(y_test,prediction)

In [None]:
print("Accuracy - " + str(accuracy_score(y_test,prediction)))
print("Recall - " + str(recall_score(y_test,prediction)))
print("precision - " + str(precision_score(y_test,prediction)))

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which the bare open() call inside pickle.dump did not.
with open('../Models/OS_LogisticReg_08132022.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Confusion matrix on the held-out fold. plot_confusion_matrix was deprecated
# in scikit-learn 1.0 and removed in 1.2; from_estimator is its replacement.
ConfusionMatrixDisplay.from_estimator(logistic, x_test, y_test)

# Random Forest Classifier Model

In [None]:
x1 = oversampled_df.drop(['fraud'],axis=1)

In [None]:
y1 = oversampled_df['fraud']

In [None]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, random_state=42)

In [None]:
randomforest = RandomForestClassifier()

In [None]:
model1 = randomforest.fit(x1_train,y1_train)

In [None]:
prediction1 = model1.predict(x1_test)

In [None]:
accuracy_score(y1_test,prediction1)

In [None]:
print("Accuracy - " + str(accuracy_score(y1_test,prediction1)))
print("Recall - " + str(recall_score(y1_test,prediction1)))
print("precision - " + str(precision_score(y1_test,prediction1)))

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which the bare open() call inside pickle.dump did not.
with open('../Models/OS_RandomForest_08132022.pkl', 'wb') as model_file:
    pickle.dump(model1, model_file)

In [None]:
# Confusion matrix on the held-out fold. plot_confusion_matrix was deprecated
# in scikit-learn 1.0 and removed in 1.2; from_estimator is its replacement.
ConfusionMatrixDisplay.from_estimator(randomforest, x1_test, y1_test)

# Decision Tree

In [None]:
x2 = oversampled_df.drop(['fraud'],axis=1)

In [None]:
y2 = oversampled_df['fraud']

In [None]:
dt = DecisionTreeRegressor()

In [None]:
x2_train,x2_test,y2_train,y2_test=train_test_split(x2,y2,test_size=0.3,random_state=123)

In [None]:
model2 = dt.fit(x2_train,y2_train)

In [None]:
prediction2 = model2.predict(x2_test)

In [None]:
accuracy_score(y2_test,prediction2)

In [None]:
print("Accuracy - " + str(accuracy_score(y2_test,prediction2)))
print("Recall - " + str(recall_score(y2_test,prediction2)))
print("precision - " + str(precision_score(y2_test,prediction2)))

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed, which the bare open() call inside pickle.dump did not.
with open('../Models/OS_DecisionTree_08132022.pkl', 'wb') as model_file:
    pickle.dump(model2, model_file)