## Split dataset

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed data from the CSV file
data = pd.read_csv(r"D:\major project\undersampled_processed_data.csv")

# Split the data into features (X) and target (y); "is_fraud" is the label column
X = data.drop(["is_fraud"], axis=1)
y = data["is_fraud"]

# Perform the train-test split: 20% of the rows are held out for testing and the
# fixed random_state makes the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the resulting datasets to CSV files.
# The paths are raw strings on purpose: in a normal string literal "\m", "\X"
# and "\y" are invalid escape sequences (a SyntaxWarning on recent Python
# versions), and a file name starting with e.g. "t" or "n" would silently turn
# into a tab or newline character.
X_train.to_csv(r'D:\major project\X_train.csv', index=False)
y_train.to_csv(r'D:\major project\y_train.csv', index=False)
X_test.to_csv(r'D:\major project\X_test.csv', index=False)
y_test.to_csv(r'D:\major project\y_test.csv', index=False)

# Display the shapes of the resulting datasets
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (12009, 21)
y_train shape: (12009,)
X_test shape: (3003, 21)
y_test shape: (3003,)


## Decision tree classifier

In [18]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# X_train, X_test, y_train and y_test are taken from the running kernel (they are
# not defined in this cell), not reloaded from the saved CSV files.

# One-hot encode the categorical columns, then force X_test onto the training
# column layout: dummy columns that only appear in X_train are added to X_test
# filled with 0, and X_test columns unknown to X_train are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Build and train the Decision Tree Classifier (fit() hands back the estimator)
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred = dtc.predict(X_test)

# Evaluate the model's performance; the three results below are read again by
# the cell that saves the model and its evaluation report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Each heading is printed on its own line, followed by its value
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")
print("\nAccuracy Score:", accuracy, sep="\n")

Confusion Matrix:
[[1440   67]
 [  54 1442]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1507
           1       0.96      0.96      0.96      1496

    accuracy                           0.96      3003
   macro avg       0.96      0.96      0.96      3003
weighted avg       0.96      0.96      0.96      3003


Accuracy Score:
0.9597069597069597


In [19]:
import joblib
from pathlib import Path

# Save the trained decision tree model.
# The target folder is created first because joblib.dump() fails when the
# directory it should write into does not exist yet.
model_path = r'D:\major project\models\dtc_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(dtc, model_path)

# Save the evaluation results.
# NOTE: accuracy, conf_matrix and class_report are whatever the most recently
# executed evaluation cell left in the kernel, so run this cell directly after
# the decision tree cell.
evaluation_path = r'D:\major project\results\dtc_results.txt'
Path(evaluation_path).parent.mkdir(parents=True, exist_ok=True)
with open(evaluation_path, 'w') as f:
    # "Accuracy Score: <value>" with a space after the colon, the same layout
    # the random forest and gradient boosting result files use
    f.write("Accuracy Score: ")
    f.write(str(accuracy) + "\n")
    f.write("Confusion Matrix:\n")
    f.write(str(conf_matrix) + "\n\n")
    f.write("Classification Report:\n")
    f.write(class_report + "\n")


print(f"Model saved to {model_path}")
print(f"Evaluation results saved to {evaluation_path}")

Model saved to D:\major project\models\dtc_model.pkl
Evaluation results saved to D:\major project\results\dtc_results.txt


## Random forest classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Build and train the Random Forest Classifier on the encoded training data
# (fit() hands back the estimator, so construction and training can be chained)
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred = rfc.predict(X_test)

# Evaluate the model's performance.
# NOTE: the name "conf_matix" (sic) is kept as is because the cell that saves
# the random forest results reads the matrix under exactly this name.
conf_matix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Each heading is printed on its own line, followed by its value
print("Confusion Matrix:", conf_matix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")
print("\nAccuracy Score:", accuracy, sep="\n")

Confusion Matrix:
[[1459   48]
 [  62 1434]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1507
           1       0.97      0.96      0.96      1496

    accuracy                           0.96      3003
   macro avg       0.96      0.96      0.96      3003
weighted avg       0.96      0.96      0.96      3003


Accuracy Score:
0.9633699633699634


In [21]:
import joblib
from pathlib import Path

# Save the trained random forest model.
# The target folder is created first because joblib.dump() fails when the
# directory it should write into does not exist yet.
model_path = r'D:\major project\models\rfc_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rfc, model_path)

# Save the evaluation results.
# NOTE: accuracy, conf_matix and class_report are whatever the most recently
# executed evaluation cell left in the kernel, so run this cell directly after
# the random forest cell.
evaluation_path = r'D:\major project\results\rfc_results.txt'
Path(evaluation_path).parent.mkdir(parents=True, exist_ok=True)
with open(evaluation_path, 'w') as f:
    f.write("Accuracy Score: ")
    f.write(str(accuracy) + "\n")
    f.write("Confusion Matrix:\n")
    f.write(str(conf_matix) + "\n\n")
    f.write("Classification Report:\n")
    f.write(class_report + "\n")


print(f"Model saved to {model_path}")
print(f"Evaluation results saved to {evaluation_path}")

Model saved to D:\major project\models\rfc_model.pkl
Evaluation results saved to D:\major project\results\rfc_results.txt


## Gradient boosting classifier

In [22]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [23]:
# Build and train the Gradient Boosting Classifier on the encoded training data
# (fit() hands back the estimator, so construction and training can be chained)
gbc = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test set
y_pred = gbc.predict(X_test)

# Evaluate the model's performance.
# NOTE: the name "conf_matix" (sic) is kept as is because the cell that saves
# the gradient boosting results reads the matrix under exactly this name.
conf_matix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Each heading is printed on its own line, followed by its value
print("Confusion Matrix:", conf_matix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")
print("\nAccuracy Score:", accuracy, sep="\n")

Confusion Matrix:
[[1420   87]
 [  76 1420]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1507
           1       0.94      0.95      0.95      1496

    accuracy                           0.95      3003
   macro avg       0.95      0.95      0.95      3003
weighted avg       0.95      0.95      0.95      3003


Accuracy Score:
0.9457209457209457


In [24]:
import joblib
from pathlib import Path

# Save the trained gradient boosting model.
# The target folder is created first because joblib.dump() fails when the
# directory it should write into does not exist yet.
model_path = r'D:\major project\models\gbc_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(gbc, model_path)

# Save the evaluation results.
# NOTE: accuracy, conf_matix and class_report are whatever the most recently
# executed evaluation cell left in the kernel, so run this cell directly after
# the gradient boosting cell.
evaluation_path = r'D:\major project\results\gbc_results.txt'
Path(evaluation_path).parent.mkdir(parents=True, exist_ok=True)
with open(evaluation_path, 'w') as f:
    f.write("Accuracy Score: ")
    f.write(str(accuracy) + "\n")
    f.write("Confusion Matrix:\n")
    f.write(str(conf_matix) + "\n\n")
    f.write("Classification Report:\n")
    f.write(class_report + "\n")


print(f"Model saved to {model_path}")
print(f"Evaluation results saved to {evaluation_path}")

Model saved to D:\major project\models\gbc_model.pkl
Evaluation results saved to D:\major project\results\gbc_results.txt


## Select best model

In [25]:
import shutil
import os

def model_selection(results_dir=r'D:\major project\results',
                    models_dir=r'D:\major project\models',
                    model_names=('dtc', 'rfc', 'gbc')):
    """Copy the saved model with the highest recorded accuracy to ``final_model.pkl``.

    For every name in ``model_names`` the file ``<name>_results.txt`` in
    ``results_dir`` is scanned for a line starting with ``"Accuracy Score:"``.
    The model whose results file reports the highest accuracy wins; on a tie
    the model listed first is kept. Its ``<name>_model.pkl`` in ``models_dir``
    is then copied to ``final_model.pkl`` in the same directory.

    Args:
        results_dir: Directory holding the ``<name>_results.txt`` files.
        models_dir: Directory holding the ``<name>_model.pkl`` files; the
            final model is written here as well.
        model_names: Short model names to compare, in priority order for ties.

    Returns:
        The path of the ``final_model.pkl`` copy.

    Raises:
        FileNotFoundError: If a results file or the winning model file is missing.
        ValueError: If no results file contains an accuracy line, or an
            accuracy value cannot be parsed as a float.
    """
    # Pair every results file with the model it describes in one place, so the
    # two paths of a model can never get out of sync.
    candidates = {
        os.path.join(results_dir, f'{name}_results.txt'):
            os.path.join(models_dir, f'{name}_model.pkl')
        for name in model_names
    }

    max_accuracy = -1.0  # Lower than any real accuracy, so the first score wins
    best_file = None

    for file_path in candidates:
        # Find the accuracy in the file; if several lines match, the last one counts
        accuracy = None
        with open(file_path, 'r') as file:
            for line in file:
                if line.startswith("Accuracy Score:"):
                    # Split only on the first colon and tolerate a missing
                    # space after it ("Accuracy Score:0.95")
                    accuracy = float(line.split(":", 1)[1].strip())

        # Update if a strictly higher accuracy is found
        if accuracy is not None and accuracy > max_accuracy:
            max_accuracy = accuracy
            best_file = file_path

    # Fail before printing anything, so no "best file is None" message appears
    if best_file is None:
        raise ValueError("No valid model found")

    print(f"The file with the highest accuracy is {best_file} with an accuracy of {max_accuracy}")

    best_model = candidates[best_file]
    final_model = os.path.join(models_dir, 'final_model.pkl')
    shutil.copy(best_model, final_model)
    return final_model

# Pick the best-scoring saved model and copy it to final_model.pkl
model_selection()

The file with the highest accuracy is D:\major project\results\rfc_results.txt with an accuracy of 0.9633699633699634
