<a href="https://colab.research.google.com/github/djorgeext/Predictive_Maintenance_using_Gradient_Boosted_Trees/blob/main/Predictive_Maintenance_using_Gradient_Boosted_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Predictive Maintenance Using Gradient Boosted Trees

This notebook demonstrates how to use Gradient Boosted Trees (via the YDF library) for
predictive maintenance classification tasks using the AI4I 2020 dataset.

Dataset: AI4I 2020 Predictive Maintenance Dataset
Source: UCI Machine Learning Repository
DOI: https://doi.org/10.24432/C5HS5C

In [None]:
!pip install ydf
!pip install pandas
!pip install sklearn

In [24]:
import os
import pandas as pd
import ydf
from sklearn.model_selection import train_test_split

In [25]:
def load_data(file_path):
    """Read the dataset CSV into a DataFrame and print a short summary.

    The summary lists the frame's shape, its column names, the number of
    missing values per column, and the class balance of the
    'Machine failure' column (value counts plus the mean as a percentage).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by pd.read_csv.

    Raises:
        FileNotFoundError: If nothing exists at file_path.
    """
    if os.path.exists(file_path):
        dataset = pd.read_csv(file_path)
    else:
        raise FileNotFoundError(f"Dataset not found at {file_path}")

    print("Dataset Overview:")
    print(f"Shape: {dataset.shape}")

    column_names = list(dataset.columns)
    print(f"Columns: {column_names}")

    missing_per_column = dataset.isnull().sum()
    print(f"\nMissing values:\n{missing_per_column}")

    # The target is only looked up here, after the generic overview is printed.
    target = dataset['Machine failure']
    print(f"\nTarget distribution:\n{target.value_counts()}")

    # The mean of the target column is reported as the failure rate.
    print(f"Failure rate: {target.mean():.1%}")

    return dataset


def get_features(df):
    """Return the names of the columns used as model inputs.

    Identifier columns, the target, and the failure-type columns are left
    out; every other column of df is kept in its original order. The
    resulting list is printed before it is returned.

    Args:
        df: Object exposing a `columns` iterable (e.g. a pandas DataFrame).

    Returns:
        List of the selected column names.
    """
    # IDs first, then the label, then the individual failure-type flags.
    excluded = ("UDI", "Product ID", "Machine failure", "TWF", "HDF", "PWF", "OSF", "RNF")

    features = []
    for name in df.columns:
        if name in excluded:
            continue
        features.append(name)

    print(f"Selected features: {features}")
    return features


def train_model(df_train, features):
    """Fit a YDF Gradient Boosted Trees learner on the training data.

    Args:
        df_train: Training data handed to the learner's train() method.
        features: Input feature names passed to the learner; the learner is
            created with include_all_columns=False.

    Returns:
        Whatever the learner's train() call returns (the trained model).
    """
    print("Training Gradient Boosted Trees model...")

    # 'Machine failure' is the label column the learner predicts.
    learner_options = {
        "label": "Machine failure",
        "features": features,
        "include_all_columns": False,
    }
    gbt_learner = ydf.GradientBoostedTreesLearner(**learner_options)

    return gbt_learner.train(df_train)


def evaluate_model(model, df_test):
    """Print the model's evaluation on the held-out data.

    A banner is printed first, then model.evaluate(df_test) is called and
    its result is printed. Nothing is returned.

    Args:
        model: Object exposing an evaluate(data) method (the trained model).
        df_test: Data passed to model.evaluate.
    """
    rule = "=" * 50
    # Banner goes out before the evaluation runs.
    print("\n" + rule, "MODEL EVALUATION", rule, sep="\n")

    # The evaluation object returned by YDF is printed as-is.
    results = model.evaluate(df_test)
    print("YDF Evaluation Results:")
    print(results)

In [26]:
def main(
    dataset_path="/content/drive/MyDrive/Challenges_ML-DL/ai4i2020.csv",
    test_size=0.2,
    random_state=0,
):
    """Run the predictive maintenance analysis end to end.

    Loads the dataset, makes a train/test split stratified on the
    'Machine failure' column, selects the features, trains the model and
    prints its evaluation on the test split.

    Args:
        dataset_path: Location of the dataset CSV. The default is the path
            originally hard-coded here; pass a different path instead of
            editing the function.
        test_size: Forwarded to train_test_split as test_size.
        random_state: Forwarded to train_test_split as random_state so the
            split is reproducible.

    Returns:
        The trained model returned by train_model.

    Raises:
        Exception: Any error raised along the way is reported with a printed
            message and then re-raised unchanged.
    """
    try:
        # Load data
        df = load_data(dataset_path)

        # Split dataset; stratifying keeps the failure/non-failure ratio the
        # same in both splits.
        df_train, df_test = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
            stratify=df['Machine failure']
        )

        print(f"\nTraining set size: {len(df_train)}")
        print(f"Test set size: {len(df_test)}")

        # Get features
        features = get_features(df)

        # Train model
        model = train_model(df_train, features)

        # Evaluate model
        evaluate_model(model, df_test)

        print("\n" + "="*50)
        print("ANALYSIS COMPLETE")
        print("="*50)
        return model

    except Exception as e:
        # Report the failure in the cell output, then let it propagate.
        print(f"Error occurred: {e}")
        raise

# Run the whole pipeline and keep the trained model in a top-level variable so
# that later cells (e.g. model.describe()) can use it.
if __name__ == "__main__":
    model = main()

Dataset Overview:
Shape: (10000, 14)
Columns: ['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']

Missing values:
UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

Target distribution:
Machine failure
0    9661
1     339
Name: count, dtype: int64
Failure rate: 3.4%

Training set size: 8000
Test set size: 2000
Selected features: ['Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
Training Gradient Boosted Trees model.

In [27]:
# Show the description of the trained model stored in `model` by the cell above.
model.describe()