Importing Libraries

In [46]:
import numpy as np
import pandas as pd 
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error, mean_absolute_error, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [47]:
train_df = pd.read_csv("fraudTrain.csv")
test_df = pd.read_csv("fraudTest.csv")

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [48]:
def data_pre(X):
    # These columns are presumably considered irrelevant for the analysis
    del_col=['merchant','first','last','street','zip','unix_time','Unnamed: 0','trans_num','cc_num']
    X.drop(columns=del_col,inplace=True)
   
    # Data Conversion 
    # The trans_date_trans_time column is then used to create a new column 
    # trans_date that contains only the date part in the 'YYYY-MM-DD' format.
    X['trans_date_trans_time']=pd.to_datetime(X['trans_date_trans_time'])
    X['trans_date']=X['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
    X['trans_date']=pd.to_datetime(X['trans_date'])
    X['dob']=pd.to_datetime(X['dob'])
    
    #Calculate Age of each transaction
    X["age"] = (X["trans_date"] - X["dob"]).dt.days //365
    X['trans_month']=X['trans_date'].dt.month
    X['trans_year']=X['trans_date'].dt.year
    
    # The gender column is transformed to binary values where 
    # 'M' (presumably for males) is converted to 1, and other values are converted to 0.
    X['gender']=X['gender'].apply(lambda x : 1 if x=='M' else 0)
    X['gender']=X['gender'].astype(int)
    
    # The lat_dis and long_dis columns are created to calculate the absolute 
    # differences between the latitude (lat) and merchant latitude (merch_lat)
    # as well as the longitude (long) and merchant longitude (merch_long).
    X['lat_dis']=abs(X['lat']-X['merch_lat'])
    X['long_dis']=abs(X['long']-X['merch_long'])
    
    # The function applies one-hot encoding to the category column to convert categorical variables
    # into binary indicators. This is achieved using pd.get_dummies.
    X=pd.get_dummies(X,columns=['category'])
    X=X.drop(columns=['city','trans_date_trans_time','state','job','merch_lat','merch_long','lat','long','dob','trans_date'])
    return X
    

In [49]:
train_df_pre=data_pre(train_df.copy())
train_df_pre.head()
test_df_pre=data_pre(test_df.copy())
test_df_pre.head()

test_df_pre.columns

Index(['amt', 'gender', 'city_pop', 'is_fraud', 'age', 'trans_month',
       'trans_year', 'lat_dis', 'long_dis', 'category_entertainment',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel'],
      dtype='object')

DataSet Split

In [50]:
x_train=train_df_pre.drop('is_fraud',axis=1)
y_train=train_df_pre['is_fraud']
x_test=test_df_pre.drop('is_fraud',axis=1)
y_test=test_df_pre['is_fraud']

Feature Scaling

In [51]:
# Step 1: Fit the StandardScaler on the training data
# The StandardScaler is a transformer from the scikit-learn library 
# that scales features to have zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)


Logistic Regression Model

In [56]:
logistic_regression=LinearRegression()
logistic_regression.fit(x_train,y_train)
y_pred_logistic = logistic_regression.predict(x_test)

print("Mean squared error: %.4f" % mean_squared_error(y_test, y_pred_logistic))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_pred_logistic))
print("R2 score: %.4f" % r2_score(y_test, y_pred_logistic))
print("Accuracy score: %.4f" % accuracy_score(y_test, y_pred_logistic.round()))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred_logistic.round()))




Mean squared error: 0.0037
Mean absolute error: 0.0103
R2 score: 0.0284
Accuracy score: 0.9960
Confusion matrix: 
 [[553515     54      5]
 [  2145      0      0]
 [     0      0      0]]
