In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the engine dataset from disk and preview its first rows.
csv_path = 'data/engine_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Engine rpm,Lub oil pressure,Fuel pressure,Coolant pressure,lub oil temp,Coolant temp,Engine Condition
0,700,2.493592,11.790927,3.178981,84.144163,81.632187,1
1,876,2.941606,16.193866,2.464504,77.640934,82.445724,0
2,520,2.961746,6.553147,1.064347,77.752266,79.645777,1
3,473,3.707835,19.510172,3.727455,74.129907,71.774629,1
4,619,5.672919,15.738871,2.052251,78.396989,87.000225,0


In [3]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs, and the label column to predict.
features = ['Engine rpm', 'Lub oil pressure', 'Fuel pressure', 'Coolant pressure', 'lub oil temp', 'Coolant temp']
target = 'Engine Condition'
X = data[features]
y = data[target]

# Share of the FULL dataset reserved for each hold-out set.
TEST_FRACTION = 0.1
VAL_FRACTION = 0.1

# Step 1: split off the test set, leaving the remainder for training + validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=31
)

# Step 2: carve the validation set out of the remainder.
# The second split sees only (1 - TEST_FRACTION) of the rows, so the share to
# request is VAL_FRACTION / (1 - TEST_FRACTION) (= 0.1 / 0.9 ≈ 0.1111 here).
# It is derived rather than typed as a rounded literal so it stays exact and
# follows the two fractions above if they are changed.
val_share_of_remainder = VAL_FRACTION / (1 - TEST_FRACTION)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=val_share_of_remainder, random_state=42
)

# Sanity check: the three partitions together must account for every row.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split lost or duplicated rows"

In [5]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Keep the notebook output quiet, but never hide a solver that stopped before
# converging: filters added later take precedence, so ConvergenceWarning is
# re-enabled on top of the blanket 'ignore'.
warnings.filterwarnings('ignore')
warnings.filterwarnings('always', category=ConvergenceWarning)

# Baseline: plain logistic regression on the raw (unscaled) features.
# max_iter is raised from the library default of 100 so the solver has room to
# converge, and so the baseline uses the same iteration budget as the
# polynomial pipelines evaluated later in this notebook.
model = LogisticRegression(max_iter=1000)

# Step 3: Train the model on the training set
model.fit(X_train, y_train)

# Step 4 (Optional): Check performance on the validation set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Step 5: Test the model on the test data
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Validation Accuracy: 0.654042988741044
Test Accuracy: 0.6427840327533265


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Model selection: standardise the features, expand them polynomially, fit a
# logistic regression, and pick the polynomial degree (1 to 3) with the best
# validation accuracy.
best_val_accuracy = 0
best_degree = 1
fitted_pipes = {}  # degree -> pipeline already fitted on the training set

for degree in range(1, 4):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('logistic', LogisticRegression(max_iter=1000))
    ])
    pipe.fit(X_train, y_train)
    fitted_pipes[degree] = pipe

    y_val_pred = pipe.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Degree {degree}: Validation Accuracy = {val_accuracy:.4f}")

    # Strict '>' means that on a tie the lowest (simplest) degree is kept.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_degree = degree

# Final model with best degree.
# The winning pipeline was already fitted on (X_train, y_train) during the
# search, so it is reused instead of repeating the pipeline definition and
# retraining an identical model on the same data.
final_pipe = fitted_pipes[best_degree]
y_test_pred = final_pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with degree {best_degree}: {test_accuracy:.4f}")

Degree 1: Validation Accuracy = 0.6540
Degree 2: Validation Accuracy = 0.6551
Degree 3: Validation Accuracy = 0.6540
Test Accuracy with degree 2: 0.6556
