<a href="https://colab.research.google.com/github/codsofft/codsoft-intern-sridhar-s/blob/main/Codsoft_task_4_creditcard_fraud_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load the dataset
data = pd.read_csv('/content/sample_data/creditcard.csv')  # Replace this with your dataset's file path

# Check for missing values
print(data.isnull().sum())

# Feature scaling (normalize the Amount and Time columns)
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = scaler.fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features (X) and target variable (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Handle class imbalance with SMOTE (oversampling the minority class)
print(f"Original dataset class distribution: {Counter(y)}")
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
print(f"Resampled dataset class distribution: {Counter(y_res)}")

# Split the resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Model training - Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Model training - Random Forest (as an alternative)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions using Logistic Regression
lr_pred = lr_model.predict(X_test)

# Predictions using Random Forest
rf_pred = rf_model.predict(X_test)

# Evaluation using classification metrics
print("Logistic Regression Model Evaluation")
print(classification_report(y_test, lr_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, lr_pred))

print("Random Forest Model Evaluation")
print(classification_report(y_test, rf_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, rf_pred))



Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Original dataset class distribution: Counter({0: 284315, 1: 492})
Resampled dataset class distribution: Counter({0: 284315, 1: 284315})
Logistic Regression Model Evaluation
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     56750
           1       0.97      0.92      0.95     56976

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726

Confusion Matrix
[[55365  1385]
 [ 4426 52550]]
Random Forest Model Evaluation
              precision    recall  f