In [1]:
import pandas as pd

# Locations of the fraud train/test CSV files (replace with your actual paths)
train_path = 'data/fraudTrain.csv'
test_path = 'data/fraudTest.csv'

# Read both splits into DataFrames
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)



In [2]:
# Step 1: Drop the 'Unnamed: 0' column for both train and test datasets
# (presumably a row index that was saved into the CSV -- confirm against the data).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')



In [3]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Step 1: Parse the transaction timestamp and expand it into numeric parts.
# The frames are modified in place, so train_df and test_df both gain the
# year/month/day/hour/minute/second columns.
for df in [train_df, test_df]:
    df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
    df["year"] = df["trans_date_trans_time"].dt.year
    df["month"] = df["trans_date_trans_time"].dt.month
    df["day"] = df["trans_date_trans_time"].dt.day
    df["hour"] = df["trans_date_trans_time"].dt.hour
    df["minute"] = df["trans_date_trans_time"].dt.minute
    df["second"] = df["trans_date_trans_time"].dt.second

# Step 2: Drop columns that won't help modeling directly (like long text or IDs).
# The raw timestamp is dropped too, since its parts were extracted above.
cols_to_drop = ["trans_date_trans_time", "first", "last", "street", "dob", "trans_num"]
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

# Step 3: Encode categorical variables.
# The set of categorical columns is decided from the training frame only.
categorical_cols = train_df.select_dtypes(include="object").columns

# Per-column {label: integer code} lookups learned from the training data.
# They are kept (instead of being overwritten on every iteration) so the exact
# same encoding can be re-applied to new raw data or saved next to the model.
category_mappings = {}

for col in categorical_cols:
    # Fit the encoder on the training values only and encode them
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

    # Position in le.classes_ is the integer code assigned to each label
    mapping_dict = {label: idx for idx, label in enumerate(le.classes_)}
    category_mappings[col] = mapping_dict

    # Labels that never appeared in training map to NaN; encode those as -1
    test_df[col] = test_df[col].map(mapping_dict).fillna(-1).astype(int)

print("✅ All categorical columns encoded successfully.")


✅ All categorical columns encoded successfully.


In [5]:
from sklearn.preprocessing import StandardScaler

# 1. Split each frame into its feature columns and the target column
target_col = 'is_fraud'
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
# The test target is only available if the test data contains the column
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

# 2. Create the scaler and fit it on the training features only
scaler = StandardScaler().fit(X_train)

# 3. Apply the fitted scaler to both feature sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optional: confirm the resulting array shapes
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


X_train_scaled shape: (1296675, 21)
X_test_scaled shape: (555719, 21)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit a logistic regression with class_weight='balanced' on the scaled training data
model_balanced = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
).fit(X_train_scaled, y_train)

# Predicted labels for the scaled test features
y_pred_balanced = model_balanced.predict(X_test_scaled)

# Compute the evaluation artefacts, then print them
report_text = classification_report(y_test, y_pred_balanced)
conf_matrix = confusion_matrix(y_test, y_pred_balanced)

print("Classification Report (with class_weight='balanced'):")
print(report_text)
print("\nConfusion Matrix:")
print(conf_matrix)


Classification Report (with class_weight='balanced'):
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    553574
           1       0.09      0.74      0.16      2145

    accuracy                           0.97    555719
   macro avg       0.54      0.85      0.57    555719
weighted avg       1.00      0.97      0.98    555719


Confusion Matrix:
[[537283  16291]
 [   568   1577]]


In [8]:
import joblib

# Save the trained classifier.
# The estimator fitted in this notebook is `model_balanced`. No visible cell
# assigns the name `model`, so dumping `model` raises NameError on a fresh
# kernel, or pickles a stale object left over from a deleted cell.
joblib.dump(model_balanced, 'logistic_model.pkl')

# Save the fitted scaler so the same scaling can be applied when the model is reused
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']