In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [61]:
df=pd.read_csv("credit_card_fraud_dataset.csv")
df

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.00,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.40,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0
...,...,...,...,...,...,...,...
99995,99996,2024-06-07 00:57:36.027591,1057.29,289,refund,San Antonio,0
99996,99997,2023-10-22 23:12:36.027594,297.25,745,refund,San Antonio,0
99997,99998,2024-05-31 19:27:36.027597,3448.56,690,purchase,San Antonio,0
99998,99999,2024-10-18 09:43:36.027601,3750.79,644,purchase,Philadelphia,0


In [63]:
print(df.columns)


Index(['TransactionID', 'TransactionDate', 'Amount', 'MerchantID',
       'TransactionType', 'Location', 'IsFraud'],
      dtype='object')


In [66]:
if 'TransactionID' in df.columns:
    df.drop(columns=['TransactionID'], inplace=True)

if 'TransactionDate' in df.columns:
    df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], errors='coerce')
    df['TransactionDate'] = df['TransactionDate'].astype('int64') // 10**9  

In [68]:
df.fillna(df.mode().iloc[0], inplace=True)
df

Unnamed: 0,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1712153735,4189.27,688,refund,San Antonio,0
1,1710854435,2659.71,109,refund,Dallas,0
2,1704708515,784.00,394,purchase,New York,0
3,1713052235,3514.40,944,purchase,Philadelphia,0
4,1720810295,369.07,475,purchase,Phoenix,0
...,...,...,...,...,...,...
99995,1717721856,1057.29,289,refund,San Antonio,0
99996,1698016356,297.25,745,refund,San Antonio,0
99997,1717183656,3448.56,690,purchase,San Antonio,0
99998,1729244616,3750.79,644,purchase,Philadelphia,0


In [70]:
categorical_cols = ['MerchantID', 'TransactionType', 'Location']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=['IsFraud']) 
y = df['IsFraud'] 

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = []
for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    results.append({
        "Model": model_name,
        "Mean Accuracy": scores.mean(),
        "Std Dev": scores.std()
    })

results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df)


Model Performance Comparison:
                 Model  Mean Accuracy   Std Dev
0  Logistic Regression       0.990162  0.000031
1        Random Forest       0.990162  0.000031
