<a href="https://colab.research.google.com/github/glennamaria/Projects/blob/main/fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [135]:
import pandas as pd

In [136]:
from google.colab import drive

# Mount Google Drive at the path the dataset is later read from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the project CSV from the mounted Drive into the working frame `df`.
path = "/content/drive/MyDrive/Project.csv"
df = pd.read_csv(filepath_or_buffer=path)
df  # bare last expression: rendered as the cell output

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preview the first five rows of the frame.
df.head(n=5)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno


In [None]:
# Summary statistics for the columns describe() selects by default.
summary_stats = df.describe()
print("\nDataset Summary:\n", summary_stats)

In [None]:
# Number of null entries in each column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

In [None]:
# Class balance: how many transactions carry each isFraud label.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='isFraud', palette=['green', 'red'], ax=ax)
ax.set_title("Fraud vs Non-Fraud Transactions")
ax.set_xlabel("Transaction Type (0: Non-Fraud, 1: Fraud)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Transaction counts per `type`, bars ordered from most to least frequent.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
type_order = df['type'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='type', order=type_order, palette="coolwarm", ax=ax)
ax.set_title("Transaction Type Distribution")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Replace the `type` labels with their integer category codes.
# This overwrites the column, so every later cell sees numbers rather than names.
df['type'] = pd.Categorical(df['type']).codes

In [None]:
# Compare transaction amounts between the two isFraud classes.
# The x axis is the fraud label, so its caption spells out the 0/1 meaning
# (same wording as the class-count plot) instead of a bare "Transaction Type",
# which reads as the `type` column.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='isFraud', y='amount', data=df, palette=['green', 'red'], ax=ax)
ax.set_title("Transaction Amount Distribution (Fraud vs Non-Fraud)")
ax.set_xlabel("Transaction Type (0: Non-Fraud, 1: Fraud)")
ax.set_ylabel("Transaction Amount")
ax.set_yscale("log")  # Log scale for better visualization
plt.show()


In [None]:
# Histogram (with KDE overlay) of all transaction amounts.
fig, ax = plt.subplots(figsize=(8, 5))
amounts = df['amount']
sns.histplot(amounts, bins=50, kde=True, color='purple', ax=ax)
ax.set_title("Transaction Amount Distribution")
ax.set_xlabel("Amount")
ax.set_ylabel("Frequency")
ax.set_xscale("log")  # Log scale to handle large values
plt.show()



In [None]:
# Overlay the `step` histograms of fraudulent (red) and non-fraudulent (blue)
# transactions on one set of axes.
fraud_steps = df.loc[df['isFraud'] == 1, 'step']
legit_steps = df.loc[df['isFraud'] == 0, 'step']

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(fraud_steps, bins=50, kde=True, color='red', label="Fraudulent", ax=ax)
sns.histplot(legit_steps, bins=50, kde=True, color='blue', label="Non-Fraudulent", ax=ax)
ax.set_title("Fraudulent Transactions Over Time (Step)")
ax.set_xlabel("Time Step")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()


In [None]:
# Origin-account balance before vs. after each transaction, coloured by isFraud.
# NOTE(review): both axes are log-scaled, so rows with a zero balance cannot be
# drawn — confirm that dropping them from the picture is acceptable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='oldbalanceOrg',
    y='newbalanceOrig',
    hue='isFraud',
    palette=['blue', 'red'],
    alpha=0.5,
    ax=ax,
)
ax.set_title("Old Balance vs. New Balance (Fraud vs Non-Fraud)")
ax.set_xlabel("Old Balance")
ax.set_ylabel("New Balance")
ax.set_xscale("log")
ax.set_yscale("log")
ax.legend(title="Fraud")
plt.show()


In [None]:
# Amount distribution of fraudulent rows only, one box per `type` value.
# NOTE(review): if the earlier cat.codes cell has run, `type` holds integer
# codes here, so the x ticks are numbers rather than type names.
fraud_rows = df.loc[df['isFraud'] == 1]
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=fraud_rows, x='type', y='amount', palette='coolwarm', ax=ax)
ax.set_title("Fraud Amount Distribution by Transaction Type")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Amount")
ax.set_yscale("log")
plt.show()


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif


In [None]:
# Drop the account-name columns before modelling.
# errors='ignore' plus reassignment keeps this cell idempotent: re-running it
# after the columns are already gone no longer raises KeyError.
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

In [None]:
# Label-encode the `type` column.
# NOTE(review): an earlier cell already replaced `type` with category codes, so
# the encoder presumably sees integers here and learns no type names — confirm
# whether this step is still needed.
encoder = LabelEncoder()
encoder.fit(df['type'])
df['type'] = encoder.transform(df['type'])


In [None]:
# Tally the target labels: 0 = Non-Fraud, 1 = Fraud.
class_counts = Counter(df['isFraud'])
print("Class Distribution Before Balancing:", class_counts)

In [None]:
# Step 5: Split dataset into features (X) and target variable (y)
# The target is isFraud (0 = Non-Fraud, 1 = Fraud). Both isFraud and
# isFlaggedFraud are kept out of the feature matrix.
NON_FEATURE_COLUMNS = ['isFraud', 'isFlaggedFraud']
y = df['isFraud']
X = df.drop(NON_FEATURE_COLUMNS, axis=1)

In [None]:
# Step 6: Feature Selection (SelectKBest to keep top features)
# NOTE(review): the selector is fitted on all of X/y before the train/test split
# in the following step, so test rows influence which features are kept —
# confirm whether it should be fitted on the training split only.
selector = SelectKBest(score_func=f_classif, k=5)  # keep the 5 highest-scoring features
selector.fit(X, y)
X_selected = selector.transform(X)

support_mask = selector.get_support()  # boolean mask over X's columns
selected_features = X.columns[support_mask]
print("\nSelected Features:", list(selected_features))

In [None]:
# Step 7: Split data into training (80%) and testing (20%)
# stratify=y keeps the fraud/non-fraud proportion the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Step 8: Balance the dataset using SMOTE (Oversampling) + Undersampling
# Three resamplers run in sequence on the training split only; each one takes
# the previous one's output.
# NOTE(review): the under-sampler targets the same 0.5 ratio SMOTE was just asked
# to produce — confirm it is meant to remove anything.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
smote_enn = SMOTEENN(random_state=42)

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
X_train_us, y_train_us = undersample.fit_resample(X_train_sm, y_train_sm)
X_train_balanced, y_train_balanced = smote_enn.fit_resample(X_train_us, y_train_us)

balanced_counts = Counter(y_train_balanced)
print("Class Distribution After Balancing:", balanced_counts)


In [None]:
# Step 9: Feature Scaling
# The scaler is fitted on the balanced training data only; the test split is
# transformed with those same fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Step 10: Hyperparameter Tuning using GridSearchCV
def tune_hyperparameters(model, param_grid, model_name, X=None, y=None, cv=5, scoring='accuracy'):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Maps each parameter name to the list of values to try.
    model_name : str
        Label used only in the printed summary line.
    X, y : array-like, optional
        Training features and labels. When omitted, the notebook-level
        ``X_train_scaled`` / ``y_train_balanced`` are used, which is what this
        function always did before these parameters existed.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default 'accuracy'
        Scorer name passed to ``GridSearchCV``.

    Returns
    -------
    estimator
        ``grid_search.best_estimator_`` for the winning parameter combination.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if X is None:
        X = X_train_scaled
    if y is None:
        y = y_train_balanced

    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(X, y)
    print(f"\nBest Parameters for {model_name}: {grid_search.best_params_}")
    return grid_search.best_estimator_


# Define models and hyperparameter grids
# Each entry maps a display name to (unfitted estimator, grid of values to search).
# The stochastic estimators are seeded with the same random_state=42 used by the
# split and resampling steps, so repeated runs select the same parameters.
param_grids = {
    "Logistic Regression": (LogisticRegression(), {'C': [0.1, 1, 10], 'max_iter': [100, 200]}),
    "Random Forest": (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [50, 100]}),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), {'max_depth': [5, 10, 20], 'criterion': ['gini', 'entropy']})
}

# Tune every model and keep the winning estimator under its display name.
best_models = {}
for model_name, (model, param_grid) in param_grids.items():
    best_model = tune_hyperparameters(model, param_grid, model_name)
    best_models[model_name] = best_model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Step 11: Train and evaluate best models with precision, recall, and F1-score
def train_and_evaluate_model(model, model_name):
    """Fit ``model`` on the balanced training data and report test-set metrics.

    The model is fitted on the notebook-level ``X_train_scaled`` /
    ``y_train_balanced`` and scored against ``X_test_scaled`` / ``y_test``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    dict
        ``{"accuracy", "precision", "recall", "f1"}`` as floats, so callers can
        compare models in code rather than by reading the printed output.
    """
    model.fit(X_train_scaled, y_train_balanced)
    y_pred = model.predict(X_test_scaled)

    # Calculate evaluation metrics.
    # zero_division=0 reports 0 without a warning when a model predicts no
    # positive samples, where precision and F1 are otherwise undefined.
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Print results
    print(f"\n{model_name} Model Performance:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Evaluate every tuned estimator on the held-out test split.
for model_name in best_models:
    model = best_models[model_name]
    train_and_evaluate_model(model, model_name)


In [None]:
# Step 12: Clustering using K-Means to detect fraud patterns
# Two clusters are requested on the assumption that they separate
# Fraud from Non-Fraud.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit(X_train_scaled).labels_

# Attach the cluster id to the (unscaled) balanced training rows.
df_clusters = pd.DataFrame(X_train_balanced, columns=selected_features).assign(Cluster=clusters)

cluster_sizes = Counter(df_clusters['Cluster'])
print("\nClustering Distribution:", cluster_sizes)

In [None]:
# Step 13: Isolation Forest for Anomaly Detection
# contamination=0.02 encodes the assumption that 2% of rows are anomalous.
# NOTE(review): the forest is fitted on the resampled training data, not on the
# original class mix — confirm that the 2% assumption still applies there.
iso_forest = IsolationForest(contamination=0.02, random_state=42)
anomaly_scores = iso_forest.fit_predict(X_train_scaled)  # holds -1 / 1 labels despite the name

# Convert -1 (anomaly) to 1 (fraud) and 1 (normal) to 0 (non-fraud)
anomalies = (anomaly_scores == -1).astype(int)
df_clusters['Anomaly'] = anomalies

anomaly_counts = Counter(df_clusters['Anomaly'])
print("\nAnomaly Detection Results (Isolation Forest):")
print("\nFraud Detected by Isolation Forest:", anomaly_counts)

In [None]:
# Step 14: Save the best model (e.g., XGBoost)
# NOTE(review): XGBoost is hard-coded as the "best" model here and is not chosen
# from the evaluation metrics — confirm that this is intended.
import joblib

XGB_MODEL_FILE = "fraud_detection_model_xgb.pkl"
ISO_FOREST_FILE = "isolation_forest_model.pkl"

xgb_model = best_models["XGBoost"]
joblib.dump(xgb_model, XGB_MODEL_FILE)
joblib.dump(iso_forest, ISO_FOREST_FILE)

print("\nBest Model (XGBoost) saved as 'fraud_detection_model_xgb.pkl'")
print("\nAnomaly Detection Model (Isolation Forest) saved as 'isolation_forest_model.pkl'")