## Task 5 - Modelling


In [21]:
from load import load_data
from feature_engineering import encode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from model import train_and_evaluate_model
from model import train_and_evaluate_with_grid_search
from model import train_and_evaluate_with_random_search
import pickle


In [5]:
# Location of the raw transactions CSV on the author's machine.
raw_csv_path = r"C:\Users\ASUS VIVO\Desktop\Week_6\data (2).csv"

# Read the file, then replace ProductCategory with its encoded form, row by row.
df = load_data(raw_csv_path)
df['ProductCategory'] = df['ProductCategory'].apply(encode)

## Split the Data
Splitting the data into training and testing sets helps evaluate the model’s performance on unseen data.


In [6]:
# Target vector: the fraud flag.
y = df['FraudResult']

# Feature matrix: everything except object-dtype columns, the target itself
# and CountryCode.
columns_to_drop = [
    *df.select_dtypes(include=['object']).columns.tolist(),
    'FraudResult',
    'CountryCode',
]
x = df.drop(columns=columns_to_drop)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified even though the reports below show
# only 53 positives among 28,699 test rows — consider whether that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=999,
)

## Choose Models
   a: Logistic Regression
   


In [7]:
# Baseline classifier: logistic regression with the iteration cap raised to 1000.
model = LogisticRegression(
    max_iter=1000,
)
model.fit(x_train, y_train)

In [8]:
# Predicted labels of the baseline model for the held-out rows.
y_pred = model.predict(x_test)

In [9]:
# Compute both summaries of the baseline predictions first, then print them
# under their headings.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(" Logistic Confusion Matrix:", conf_matrix, sep="\n")
print("Logistic Classification Report:", class_report, sep="\n")

 Logistic Confusion Matrix:
[[28631    15]
 [   36    17]]
Logistic Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.53      0.32      0.40        53

    accuracy                           1.00     28699
   macro avg       0.76      0.66      0.70     28699
weighted avg       1.00      1.00      1.00     28699



b: Decision Trees

In [10]:
# Decision tree with a fixed seed so repeated runs build the same tree.
decision_tree_model = DecisionTreeClassifier(
    random_state=999,
)

In [11]:
# Run the decision tree through the shared helper from model.py; the printed
# confusion matrix and classification report appear below the cell.
train_and_evaluate_model(
    decision_tree_model,
    x_train,
    y_train,
    x_test,
    y_test,
    "Decision Tree",
)

Decision Tree Confusion Matrix:
[[28645     1]
 [    7    46]]
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.98      0.87      0.92        53

    accuracy                           1.00     28699
   macro avg       0.99      0.93      0.96     28699
weighted avg       1.00      1.00      1.00     28699



c: RandomForestClassifier

In [12]:
# Random forest of 100 trees with a fixed seed, evaluated through the same
# helper as the decision tree.
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=999,
)
train_and_evaluate_model(
    random_forest_model,
    x_train,
    y_train,
    x_test,
    y_test,
    "Random Forest",
)

Random Forest Confusion Matrix:
[[28644     2]
 [    6    47]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.96      0.89      0.92        53

    accuracy                           1.00     28699
   macro avg       0.98      0.94      0.96     28699
weighted avg       1.00      1.00      1.00     28699



## Hyperparameter Tuning
Improve model performance through hyperparameter tuning, using techniques such as:
Grid Search


Define the Hyperparameter Search

In [17]:
# Search spaces handed to the grid-search and random-search helpers below.
# Each key is a constructor argument of the matching estimator; each value
# lists the candidates to try.

# Logistic regression: C is the inverse of the regularisation strength.
lr_param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# Decision tree: depth limit and minimum sample counts for splits and leaves.
dt_param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Random forest: forest size plus the same per-tree limits.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [14]:

# Grid search for each model family. Every estimator is created fresh (seeded
# for repeatability) right before it is tuned on the training split and
# scored on the hold-out split.

logistic_model = LogisticRegression(random_state=999)
train_and_evaluate_with_grid_search(
    logistic_model, lr_param_grid,
    x_train, y_train, x_test, y_test,
    "Logistic Regression",
)

decision_tree_model = DecisionTreeClassifier(random_state=999)
train_and_evaluate_with_grid_search(
    decision_tree_model, dt_param_grid,
    x_train, y_train, x_test, y_test,
    "Decision Tree",
)

random_forest_model = RandomForestClassifier(random_state=999)
train_and_evaluate_with_grid_search(
    random_forest_model, rf_param_grid,
    x_train, y_train, x_test, y_test,
    "Random Forest",
)

Best parameters for Logistic Regression: {'C': 0.01, 'solver': 'liblinear'}
Logistic Regression Confusion Matrix:
[[28633    13]
 [   29    24]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.65      0.45      0.53        53

    accuracy                           1.00     28699
   macro avg       0.82      0.73      0.77     28699
weighted avg       1.00      1.00      1.00     28699

Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Decision Tree Confusion Matrix:
[[28645     1]
 [    7    46]]
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.98      0.87      0.92        53

    accuracy                           1.00     28699
   macro avg       0.99      0.93      0.96     28699
weighted a

## Random Search
Model Evaluation
Assess model performance using the following metrics
Accuracy: The ratio of correctly predicted observations to the total observations.
Precision: The ratio of correctly predicted positive observations to the total predicted positives.
Recall (Sensitivity): The ratio of correctly predicted positive observations to all observations that are actually positive.
F1 Score: The harmonic mean of Precision and Recall.
ROC-AUC: Area Under the Receiver Operating Characteristic Curve, which measures the ability of the model to distinguish between classes.




In [18]:
# Randomised search over the logistic-regression grid; metrics, confusion
# matrix and report for the hold-out split are printed by the helper.
train_and_evaluate_with_random_search(
    logistic_model,
    lr_param_grid,
    x_train,
    y_train,
    x_test,
    y_test,
    "Logistic Regression",
)



Best parameters for Logistic Regression: {'solver': 'liblinear', 'C': 0.01}
Logistic Regression Evaluation Metrics:
Accuracy: 0.9985
Precision: 0.6486
Recall: 0.4528
F1 Score: 0.5333
ROC-AUC: 0.9487
Logistic Regression Confusion Matrix:
[[28633    13]
 [   29    24]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.65      0.45      0.53        53

    accuracy                           1.00     28699
   macro avg       0.82      0.73      0.77     28699
weighted avg       1.00      1.00      1.00     28699



In [19]:

# Randomised search over the decision-tree grid, scored on the hold-out split.
train_and_evaluate_with_random_search(
    decision_tree_model,
    dt_param_grid,
    x_train,
    y_train,
    x_test,
    y_test,
    "Decision Tree",
)



Best parameters for Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}
Decision Tree Evaluation Metrics:
Accuracy: 0.9997
Precision: 0.9787
Recall: 0.8679
F1 Score: 0.9200
ROC-AUC: 0.9900
Decision Tree Confusion Matrix:
[[28645     1]
 [    7    46]]
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.98      0.87      0.92        53

    accuracy                           1.00     28699
   macro avg       0.99      0.93      0.96     28699
weighted avg       1.00      1.00      1.00     28699



In [20]:
# Randomised search over the random-forest grid, scored on the hold-out split.
train_and_evaluate_with_random_search(
    random_forest_model,
    rf_param_grid,
    x_train,
    y_train,
    x_test,
    y_test,
    "Random Forest",
)



Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Random Forest Evaluation Metrics:
Accuracy: 0.9997
Precision: 0.9787
Recall: 0.8679
F1 Score: 0.9200
ROC-AUC: 0.9899
Random Forest Confusion Matrix:
[[28645     1]
 [    7    46]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28646
           1       0.98      0.87      0.92        53

    accuracy                           1.00     28699
   macro avg       0.99      0.93      0.96     28699
weighted avg       1.00      1.00      1.00     28699



## Task 6 - Model Serving API Call
Create a REST API to serve the trained machine-learning models for real-time predictions.
Choose a framework:
Select a suitable framework for building REST APIs (e.g., Flask, FastAPI, Django REST framework).
Load the model:
Load the trained machine-learning model produced in the modelling task (Task 5 above).
Define API endpoints:
Create API endpoints that accept input data and return predictions.
Handle requests:
Implement logic to receive input data, preprocess it, and make predictions using the loaded model.
Return predictions:
Format the predictions and return them as a response to the API call.
Deployment:
Deploy the API to a web server or cloud platform.


In [23]:
# Persist the fitted `model` object to model.pkl so the serving code can
# reload it without retraining.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [24]:
# Round-trip check: read the pickled estimator back from model.pkl.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [25]:
pip install fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 (from fastapi)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting annotated-types>=0.6.0 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading pydantic_core-2.27.2-cp312-cp312-win_amd64.whl.metadata (6.7 kB)
Collecting typing-extensions>=4.8.0 (from fastapi)
  Using cached typing_extensions-4.12.2-py3-none-

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dash 2.18.2 requires dash_core_components==2.0.0, which is not installed.
dash 2.18.2 requires dash_html_components==2.0.0, which is not installed.
dash 2.18.2 requires dash_table==5.0.0, which is not installed.
dash 2.18.2 requires Flask<3.1,>=1.0.4, but you have flask 3.1.0 which is incompatible.


In [26]:
from fastapi import FastAPI
from pydantic import BaseModel
import pickle
import numpy as np

In [27]:
# Display the feature matrix the models were fitted on, to check which columns
# (and in what order) the prediction endpoint has to supply.
x

Unnamed: 0,ProductCategory,Amount,Value,PricingStrategy
0,0,1000.0,1000,2
1,1,-20.0,20,2
2,0,500.0,500,2
3,2,20000.0,21800,2
4,1,-644.0,644,2
...,...,...,...,...
95657,1,-1000.0,1000,2
95658,0,1000.0,1000,2
95659,1,-20.0,20,2
95660,4,3000.0,3000,2


In [28]:

# Load the classifier that was pickled to model.pkl earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a model.pkl that this project produced itself.
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

app = FastAPI()

# Training column carried by each generic request field, in the order shown
# when the feature matrix `x` is displayed.
feature1='ProductCategory'
feature2='Amount'
feature3='Value'
feature4='PricingStrategy'


# Define the input data structure
class InputData(BaseModel):
    """Request body for /predict/: one transaction's feature values.

    feature1..feature4 correspond to ProductCategory, Amount, Value and
    PricingStrategy respectively (see the module-level names above).
    """
    feature1: float
    feature2: float
    feature3: float
    feature4: float


@app.post("/predict/")
async def predict(input_data: InputData):
    """Score a single transaction with the loaded model.

    Args:
        input_data: Validated request body holding the four feature values.

    Returns:
        A dict ``{"prediction": <label>}`` whose value is a plain Python
        scalar so that the response can be JSON-encoded.
    """
    # Build one row containing *all four* features. The previous version left
    # feature4 out, so the row was one value short of what was declared.
    # NOTE(review): the displayed `x` also shows an "Unnamed: 0" header —
    # confirm whether that is just the index or an extra training column; if
    # it is a column, the model will still reject a four-value row.
    input_values = np.array([[
        input_data.feature1,
        input_data.feature2,
        input_data.feature3,
        input_data.feature4,
    ]])

    prediction = model.predict(input_values)

    # predict() returns a numpy array; its elements are numpy scalars, which
    # FastAPI cannot serialise. .item() converts to the native Python type.
    return {"prediction": prediction[0].item()}