# 1.) Import the Credit Card Fraud Data From CCLE

In [47]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [48]:
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [49]:
df = pd.read_csv("/content/gdrive/MyDrive/441winter/fraudTest.csv")

In [50]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [51]:
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

# 2.) Select four columns to use as features (one just be trans_date_trans)

In [52]:
df_select = df[["trans_date_trans_time","category","amt","city_pop","is_fraud"]]

In [53]:
df_select.head()

Unnamed: 0,trans_date_trans_time,category,amt,city_pop,is_fraud
0,2020-06-21 12:14:25,personal_care,2.86,333497,0
1,2020-06-21 12:14:33,personal_care,29.84,302,0
2,2020-06-21 12:14:53,health_fitness,41.28,34496,0
3,2020-06-21 12:15:15,misc_pos,60.05,54767,0
4,2020-06-21 12:15:17,travel,3.19,1126,0


# 3.) Create a unique variable out of trans_date.

In [54]:
type(df["trans_date_trans_time"][0])

str

In [55]:
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])

In [56]:
df_select['time_var'] = [i.second for i in df["trans_date_trans_time"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select['time_var'] = [i.second for i in df["trans_date_trans_time"]]


In [57]:
dummies = pd.get_dummies(df_select["category"])
x = pd.concat([dummies,df_select[["amt","city_pop","time_var"]]],axis=1)
y = df_select["is_fraud"]

In [58]:
x.head()

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2.86,333497,25
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,29.84,302,33
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,41.28,34496,53
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,60.05,54767,15
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3.19,1126,17


# 4.) Oversample the data (this will be your training data).

In [59]:
#resample x = x
#resamole y = y

# 5.) Train a Logistic regression.

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
log_reg = LogisticRegression().fit(x,y)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [82]:
from sklearn.metrics import confusion_matrix
y_prob = log_reg.predict_proba(x)

target_fn_percentage = 5 
threshold = np.percentile(y_prob[:,1], 100-target_fn_percentage)

y_pred = (y_prob[:,1]> threshold).astype(int)

tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

confusion_matrix(y, y_pred)

array([[527318,  26256],
       [   615,   1530]])

In [83]:
tn, fp, fn, tp

(527318, 26256, 615, 1530)

# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [84]:
cost_true = 0.02  # cost for true transactions
cost_false = -1  # cost for false transactions

profit = (tn * cost_true) + (fp * cost_false) + (fn * cost_false) + (tp * cost_true)
profit

-16294.039999999999

# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [77]:
LogisticRegression("11")

LogisticRegression(penalty='11')

In [79]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear')
clf.fit(x, y)
coef = clf.coef_
important_features = [i for i in range(len(coef[0])) if coef[0][i]!=0]

In [81]:
important_features

[14, 15, 16]

In [80]:
coef

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  4.63648747e-04,
        -7.72652639e-06, -1.62257904e-01]])