In [None]:
import pandas as pd
import numpy as np

# Build a synthetic, labelled transactions table for the fraud-detection demo.
# NumPy's global RNG is seeded so every draw below is reproducible; the order
# of the random calls therefore determines the data and should not be shuffled.
np.random.seed(42)

# Parameters
num_records = 430693
fraud_rate = 0.01
num_frauds = int(num_records * fraud_rate)  # int() truncates -> 4306 rows

# Generate synthetic data: one independent random draw per feature column.
data = {
    'transaction_id': range(1, num_records + 1),
    'amount': np.random.uniform(1, 10000, num_records),
    'transaction_type': np.random.choice(['purchase', 'withdrawal', 'transfer'], num_records),
    'customer_id': np.random.randint(1000, 5000, num_records),
    'location': np.random.choice(['location_A', 'location_B', 'location_C', 'location_D'], num_records),
}

# Create a DataFrame (default RangeIndex, so row labels equal row positions
# and the positional fraud_indices below can be used with .loc).
df = pd.DataFrame(data)

# Choose which rows are fraud, without repeats.
fraud_indices = np.random.choice(num_records, num_frauds, replace=False)

# The planted fraud signal: fraudulent amounts are inflated tenfold.
df.loc[fraud_indices, 'amount'] *= 10

# transaction_type is deliberately left as drawn. Overwriting it with a
# dedicated 'fraudulent' category on exactly the fraud rows would copy the
# label into a feature (target leakage): any classifier would score perfectly
# on this data while learning nothing usable on real transactions.

# Label column: 0 everywhere, 1 on the selected fraud rows.
df['is_fraud'] = 0
df.loc[fraud_indices, 'is_fraud'] = 1

# Invariants a fresh Restart & Run All must satisfy.
assert len(df) == num_records
assert int(df['is_fraud'].sum()) == num_frauds
assert 'fraudulent' not in set(df['transaction_type']), "label leaked into features"

# Display dataset info
print(df.head())
print(df['is_fraud'].value_counts())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define features and target. transaction_id is only a row identifier and
# is_fraud is the label, so neither belongs in the feature matrix.
X = df.drop(columns=['transaction_id', 'is_fraud'])
y = df['is_fraud']

# Split the dataset into training and testing sets (80/20). stratify=y keeps
# the fraud proportion the same in both splits, which matters for a rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['amount']),
        # handle_unknown='ignore' encodes a category that was not present
        # during fit as an all-zero block instead of raising, so the fitted
        # preprocessor stays usable on data with new categories.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['transaction_type', 'location']),
    ],
    # Spelled out (this is also the default): columns not listed above are
    # dropped. X still contains customer_id, so it is NOT used as a feature.
    remainder='drop',
)

# Transform the data: fit on the training split only, then apply the same
# fitted transform to the test split so no test statistics leak into training.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train a Random Forest classifier (100 trees, fixed seed).
# n_jobs=-1 builds the trees on all available CPU cores; the per-tree seeds
# are still derived from random_state, so the same trees are grown as in a
# single-threaded fit - only the wall-clock time changes.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_transformed, y_train)

# Make predictions on the held-out test split
y_pred = model.predict(X_test_transformed)

# Evaluate the model: raw confusion counts first, then per-class
# precision / recall / F1 (accuracy alone is uninformative for a rare class).
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the test-set confusion matrix as an annotated heatmap.
# Rows are the actual classes, columns the predicted classes.
class_names = ['Not Fraud', 'Fraud']
cm = confusion_matrix(y_test, y_pred)

# sns.heatmap returns the Axes it drew on; label that Axes directly instead
# of going through pyplot's implicit "current axes".
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
import joblib

# Persist both fitted objects to the working directory. The classifier was
# trained on the preprocessor's output, so the two files belong together.
model_path = 'fraud_detection_model.pkl'
preprocessor_path = 'preprocessor.pkl'

joblib.dump(model, model_path)
joblib.dump(preprocessor, preprocessor_path)
