In [1]:
# Import pandas
import pandas as pd
import os
import sys
# Make the project's local code importable from this notebook: resolve ../src
# and ../scripts against the current working directory and add them to
# sys.path. The membership check keeps the cell idempotent, so re-running it
# does not pile up duplicate sys.path entries.
for _rel_dir in ('../src', '../scripts'):
    _abs_dir = os.path.abspath(os.path.join(os.getcwd(), _rel_dir))
    if _abs_dir not in sys.path:
        sys.path.append(_abs_dir)


In [2]:
# Read the CSV at ../data/cleaned_data.csv (path is relative to the notebook's
# working directory) into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../data/cleaned_data.csv',
)

In [3]:
from modelling.model_pipeline import ModelPipeline


# Column of `data` handed to the pipeline as the prediction target.
target_column = 'FraudResult'
# Build the project-local ModelPipeline from the loaded frame and the target
# column name. Its internals are not visible in this notebook.
pipeline = ModelPipeline(data, target_column)


In [4]:
# Preprocess the data
# The preprocessing logic lives in ModelPipeline.preprocess_data (not shown here).
# NOTE(review): the recorded output reports a processed shape of
# (95662, 296081), i.e. about three times as many columns as rows. Presumably
# high-cardinality columns are being one-hot encoded; confirm in
# ModelPipeline.preprocess_data before trusting downstream results.
pipeline.preprocess_data()

Data preprocessing completed: Shape of processed data = (95662, 296081)


In [5]:
# Split the data
# The split itself is performed inside ModelPipeline.split_data (not shown
# here). The recorded output shows 76529 training rows and 19133 test rows,
# roughly an 80/20 split of the 95662 processed rows.
pipeline.split_data()


Data split completed: Training data size = (76529, 296081), Test data size = (19133, 296081)


In [6]:
# Add models to the pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Register both candidate classifiers with the pipeline under the display
# names used by the later train/evaluate calls. Both are seeded with
# random_state=42.
for model_name, estimator in (
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
):
    pipeline.add_model(model_name, estimator)


In [7]:
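# NOTE: disabled scratch code, kept for reference only. It fits fresh,
# default-configured estimators directly on pipeline.X_train / pipeline.y_train
# instead of going through pipeline.train_model().
# # Train Logistic Regression
# logistic_regression = LogisticRegression()
# logistic_regression.fit(pipeline.X_train, pipeline.y_train)

# # Train Random Forest
# random_forest = RandomForestClassifier()
# random_forest.fit(pipeline.X_train, pipeline.y_train)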

In [8]:

# Train the models
# Train the estimator that was added to the pipeline under the name
# 'Logistic Regression'. The recorded output reports success and notes that the
# model has no 'estimators_' attribute (non-ensemble model).
pipeline.train_model('Logistic Regression')


Model Logistic Regression trained successfully. No 'estimators_' attribute (non-ensemble model).


In [9]:
# Train the estimator that was added to the pipeline under the name 'Random Forest'.
# NOTE(review): the recorded run of this cell failed with
#   AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'
# scikit-learn only sets `estimators_` once fit() has completed, so presumably
# ModelPipeline.train_model reads it before (or without) fitting the model.
# TODO confirm and fix in train_model; cells after this one rely on a trained
# Random Forest.
pipeline.train_model('Random Forest')


AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

In [11]:
# Search space handed to the Random Forest tuning call below.
# 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Tune the 'Random Forest' model over `param_grid` with search_type='grid'.
# NOTE(review): this cell has no recorded execution (In [None]), and the
# preceding Random Forest training cell failed in the recorded run, so this
# path is presumably untested. TODO confirm.
pipeline.hyperparameter_tuning(model_name='Random Forest', param_grid=param_grid, search_type='grid')

In [None]:




# Hyperparameter tuning
# Search space for the Random Forest: 3 x 4 x 3 = 36 candidate combinations.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
# Tune the Random Forest over param_grid_rf with search_type='grid'.
pipeline.hyperparameter_tuning('Random Forest', param_grid_rf, search_type='grid')

# Evaluate models: Logistic Regression first, then Random Forest.
for model_name in ('Logistic Regression', 'Random Forest'):
    pipeline.evaluate_model(model_name)

# Plot ROC Curve for the Random Forest.
pipeline.plot_roc_curve('Random Forest')

