In [1]:
import numpy as np
import pandas as pd
import os
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
def extract_features(signal):
    mean = np.mean(signal)
    max_val = np.max(signal)
    min_val = np.min(signal)
    rms = np.sqrt(np.mean(np.square(signal)))
    kurtosis = pd.Series(signal).kurtosis()
    skewness = pd.Series(signal).skew()
    std_dev = np.std(signal)
    form_factor = rms / mean
    crest_factor = max_val / rms
    
    return [mean, max_val, min_val, rms, kurtosis, skewness, std_dev, form_factor, crest_factor]


In [3]:
def process_mat_files(directory):
    feature_list = []
    label_list = []
    
    for file_name in os.listdir(directory):
        if file_name.endswith('.mat'):
            file_path = os.path.join(directory, file_name)
            mat_data = scipy.io.loadmat(file_path)
            signal = mat_data['signal'].flatten()  # Adjust based on how your data is structured
            
            features = extract_features(signal)
            feature_list.append(features)
            
            # Append the label based on the folder name
            label_list.append(directory.split('\\')[-1].lower())
    
    columns = ['mean', 'max', 'min', 'rms', 'kurtosis', 'skewness', 'std_dev', 'form_factor', 'crest_factor']
    return pd.DataFrame(feature_list, columns=columns), label_list


In [4]:
directories = {
    'inner': r'E:\Bearings\Dataset\Bearing Dataset2\Inner (1800)',
    'outer': r'E:\Bearings\Dataset\Bearing Dataset2\Outer (1800)',
    'roller': r'E:\Bearings\Dataset\Bearing Dataset2\Roller (1800)',
    'normal': r'E:\Bearings\Dataset\Bearing Dataset2\Normal (1800)'
}

# Initialize an empty DataFrame for features and an empty list for labels
all_features = pd.DataFrame()
all_labels = []

# Process each directory
for condition, directory in directories.items():
    features, labels = process_mat_files(directory)
    all_features = pd.concat([all_features, features], ignore_index=True)
    all_labels += labels

# Convert the labels to a DataFrame for easier processing
all_labels = pd.DataFrame(all_labels, columns=['label'])

print("Feature extraction complete.")


Feature extraction complete.


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels (converting 'inner', 'outer', etc. to numeric labels)
le = LabelEncoder()
all_labels_encoded = le.fit_transform(all_labels.values.ravel())

# Combine the features and labels into one DataFrame
data = pd.concat([all_features, pd.Series(all_labels_encoded, name='label')], axis=1)

# Split the data into training (70%), validation (15%), and test (15%)
X = data.drop('label', axis=1)
y = data['label']

# First, split into training+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Then, split the training+validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.176, random_state=42, stratify=y_train_val)

print(f"Training set size: {X_train.shape[0]}, Validation set size: {X_val.shape[0]}, Test set size: {X_test.shape[0]}")


Training set size: 959, Validation set size: 205, Test set size: 206


In [6]:
# Apply PCA to reduce to 5 principal components
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

# Check explained variance to see how much information is retained
explained_variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {explained_variance_ratio}")


Explained variance ratio: [9.99988786e-01 1.03789970e-05 7.38277997e-07 7.36523697e-08
 1.74598097e-08]


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

print("Evaluating classifiers on PCA-transformed data:")

# Train and evaluate each classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_pca, y_train)
    
    # Predict on validation and test sets
    y_val_pred = clf.predict(X_val_pca)
    y_test_pred = clf.predict(X_test_pca)
    
    # Print validation and test accuracy
    val_accuracy = clf.score(X_val_pca, y_val)
    test_accuracy = clf.score(X_test_pca, y_test)
    print(f"{name} - Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"{name} - Test Accuracy: {test_accuracy * 100:.2f}%")
    
    # Print classification report and confusion matrix
    print(f"Classification Report (Test Set) for {name}:")
    print(classification_report(y_test, y_test_pred, target_names=le.classes_))
    
    print(f"Confusion Matrix (Test Set) for {name}:")
    print(confusion_matrix(y_test, y_test_pred))
    print("\n")


Evaluating classifiers on PCA-transformed data:
Random Forest - Validation Accuracy: 96.59%
Random Forest - Test Accuracy: 98.54%
Classification Report (Test Set) for Random Forest:
               precision    recall  f1-score   support

 inner (1800)       0.98      0.98      0.98        56
normal (1800)       1.00      0.98      0.99        52
 outer (1800)       0.98      1.00      0.99        52
roller (1800)       0.98      0.98      0.98        46

     accuracy                           0.99       206
    macro avg       0.99      0.99      0.99       206
 weighted avg       0.99      0.99      0.99       206

Confusion Matrix (Test Set) for Random Forest:
[[55  0  0  1]
 [ 0 51  1  0]
 [ 0  0 52  0]
 [ 1  0  0 45]]


SVM - Validation Accuracy: 93.66%
SVM - Test Accuracy: 93.20%
Classification Report (Test Set) for SVM:
               precision    recall  f1-score   support

 inner (1800)       0.91      0.86      0.88        56
normal (1800)       1.00      0.98      0.99      

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np


In [9]:
classifiers = {
    'Fine Tree': DecisionTreeClassifier(max_depth=100, criterion='gini'),
    'Medium Tree': DecisionTreeClassifier(max_depth=20, criterion='gini'),
    'Coarse Tree': DecisionTreeClassifier(max_depth=4, criterion='gini'),
    'Linear Discriminant': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant': QuadraticDiscriminantAnalysis()
}


In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_train contains the original labels (either categorical or integer)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Convert class indices back to strings (or use actual class names if available)
class_names = list(map(str, label_encoder.classes_))

# Iterate through each classifier
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_pca, y_train_encoded)
    
    # Validate on validation set
    y_val_pred = clf.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val_encoded, y_val_pred)
    
    # Test on test set
    y_test_pred = clf.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
    
    # Generate classification report for the test set
    report = classification_report(y_test_encoded, y_test_pred, target_names=class_names)
    
    # Confusion matrix for the test set
    confusion = confusion_matrix(y_test_encoded, y_test_pred)
    
    # Store the results
    results[name] = {
        'Validation Accuracy': val_accuracy * 100,
        'Test Accuracy': test_accuracy * 100,
        'Classification Report': report,
        'Confusion Matrix': confusion
    }

    print(f"\n{name} - Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"{name} - Test Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Classification Report (Test Set) for {name}:\n", report)
    print(f"Confusion Matrix (Test Set) for {name}:\n", confusion)



Fine Tree - Validation Accuracy: 95.12%
Fine Tree - Test Accuracy: 96.60%
Classification Report (Test Set) for Fine Tree:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        56
           1       0.98      0.96      0.97        52
           2       0.94      0.96      0.95        52
           3       0.96      1.00      0.98        46

    accuracy                           0.97       206
   macro avg       0.97      0.97      0.97       206
weighted avg       0.97      0.97      0.97       206

Confusion Matrix (Test Set) for Fine Tree:
 [[53  0  1  2]
 [ 0 50  2  0]
 [ 1  1 50  0]
 [ 0  0  0 46]]

Medium Tree - Validation Accuracy: 94.15%
Medium Tree - Test Accuracy: 97.09%
Classification Report (Test Set) for Medium Tree:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97        56
           1       0.98      0.96      0.97        52
           2       0.94      0.96      0.95

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Add Naive Bayes and SVM variations to the classifiers dictionary
classifiers.update({
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear SVM': SVC(kernel='linear'),
    'Quadratic SVM': SVC(kernel='poly', degree=2),
    'Cubic SVM': SVC(kernel='poly', degree=3),
    'Fine Gaussian SVM': SVC(kernel='rbf', gamma=0.75),
    'Medium Gaussian SVM': SVC(kernel='rbf', gamma=3),
    'Coarse Gaussian SVM': SVC(kernel='rbf', gamma=12),
    'Fine KNN': KNeighborsClassifier(n_neighbors=1),
    'Medium KNN': KNeighborsClassifier(n_neighbors=10),
    'Weighted KNN': KNeighborsClassifier(n_neighbors=10, weights='distance')
})


In [12]:
# Iterate through each new classifier
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_pca, y_train)
    
    # Validate on validation set
    y_val_pred = clf.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    # Convert class indices back to strings (or use actual class names if available)
    class_names = list(map(str, label_encoder.classes_))
    # Test on test set
    y_test_pred = clf.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    # Generate classification report for the test set
    report = classification_report(y_test, y_test_pred, target_names=class_names)
    
    # Confusion matrix for the test set
    confusion = confusion_matrix(y_test, y_test_pred)
    
    # Store the results
    results[name] = {
        'Validation Accuracy': val_accuracy * 100,
        'Test Accuracy': test_accuracy * 100,
        'Classification Report': report,
        'Confusion Matrix': confusion
    }

    print(f"\n{name} - Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"{name} - Test Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Classification Report (Test Set) for {name}:\n", report)
    print(f"Confusion Matrix (Test Set) for {name}:\n", confusion)



Fine Tree - Validation Accuracy: 92.68%
Fine Tree - Test Accuracy: 94.66%
Classification Report (Test Set) for Fine Tree:
               precision    recall  f1-score   support

           0       0.96      0.91      0.94        56
           1       0.98      0.96      0.97        52
           2       0.94      0.94      0.94        52
           3       0.90      0.98      0.94        46

    accuracy                           0.95       206
   macro avg       0.95      0.95      0.95       206
weighted avg       0.95      0.95      0.95       206

Confusion Matrix (Test Set) for Fine Tree:
 [[51  0  1  4]
 [ 0 50  2  0]
 [ 1  1 49  1]
 [ 1  0  0 45]]

Medium Tree - Validation Accuracy: 92.68%
Medium Tree - Test Accuracy: 94.66%
Classification Report (Test Set) for Medium Tree:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        56
           1       0.96      0.98      0.97        52
           2       0.96      0.90      0.93

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Cubic SVM - Validation Accuracy: 27.32%
Cubic SVM - Test Accuracy: 28.16%
Classification Report (Test Set) for Cubic SVM:
               precision    recall  f1-score   support

           0       0.29      1.00      0.46        56
           1       0.12      0.04      0.06        52
           2       0.00      0.00      0.00        52
           3       0.00      0.00      0.00        46

    accuracy                           0.28       206
   macro avg       0.10      0.26      0.13       206
weighted avg       0.11      0.28      0.14       206

Confusion Matrix (Test Set) for Cubic SVM:
 [[56  0  0  0]
 [50  2  0  0]
 [42 10  0  0]
 [42  4  0  0]]

Fine Gaussian SVM - Validation Accuracy: 64.88%
Fine Gaussian SVM - Test Accuracy: 63.11%
Classification Report (Test Set) for Fine Gaussian SVM:
               precision    recall  f1-score   support

           0       0.55      0.84      0.66        56
           1       0.65      0.67      0.66        52
           2       0.57  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

# Define Ensemble methods with the correct 'estimator' argument
ensemble_classifiers = {
    'Ensemble Bagged Tree': BaggingClassifier(estimator=DecisionTreeClassifier(max_features='sqrt'), n_estimators=30, max_samples=0.9),
    'Ensemble Subspace Discriminant': BaggingClassifier(estimator=LinearDiscriminantAnalysis(), n_estimators=30, max_features=0.3),
    'Ensemble Subspace KNN': BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=5), n_estimators=30, max_features=0.5),
}
class_names = list(map(str, label_encoder.classes_))

# Train and evaluate each ensemble method
for name, clf in ensemble_classifiers.items():
    clf.fit(X_train_pca, y_train)
    
    # Evaluate on validation and test sets
    y_val_pred = clf.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    y_test_pred = clf.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    # Classification report and confusion matrix
    report = classification_report(y_test, y_test_pred, target_names=class_names)
    confusion = confusion_matrix(y_test, y_test_pred)
    
    print(f"\n{name} - Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"{name} - Test Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Classification Report (Test Set) for {name}:\n", report)
    print(f"Confusion Matrix (Test Set) for {name}:\n", confusion)



Ensemble Bagged Tree - Validation Accuracy: 97.56%
Ensemble Bagged Tree - Test Accuracy: 98.54%
Classification Report (Test Set) for Ensemble Bagged Tree:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        56
           1       1.00      0.98      0.99        52
           2       0.98      1.00      0.99        52
           3       0.98      0.98      0.98        46

    accuracy                           0.99       206
   macro avg       0.99      0.99      0.99       206
weighted avg       0.99      0.99      0.99       206

Confusion Matrix (Test Set) for Ensemble Bagged Tree:
 [[55  0  0  1]
 [ 0 51  1  0]
 [ 0  0 52  0]
 [ 1  0  0 45]]

Ensemble Subspace Discriminant - Validation Accuracy: 82.44%
Ensemble Subspace Discriminant - Test Accuracy: 83.50%
Classification Report (Test Set) for Ensemble Subspace Discriminant:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77       

In [14]:
from sklearn.neural_network import MLPClassifier

# Define Neural Networks configurations
nn_classifiers = {
    'Narrow NN': MLPClassifier(hidden_layer_sizes=(25,), max_iter=1000, activation='relu', solver='adam'),
    'Medium NN': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, activation='relu', solver='adam'),
    'Wide NN': MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, activation='relu', solver='adam'),
    'Bilayered NN': MLPClassifier(hidden_layer_sizes=(15, 20), max_iter=1000, activation='relu', solver='adam'),
    'Trilayered NN': MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, activation='relu', solver='adam'),
}
class_names = list(map(str, label_encoder.classes_))

# Train and evaluate each neural network
for name, clf in nn_classifiers.items():
    clf.fit(X_train_pca, y_train)
    
    # Evaluate on validation and test sets
    y_val_pred = clf.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    y_test_pred = clf.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    # Classification report and confusion matrix
    report = classification_report(y_test, y_test_pred, target_names=class_names)
    confusion = confusion_matrix(y_test, y_test_pred)
    
    print(f"\n{name} - Validation Accuracy: {val_accuracy * 100:.2f}%")
    print(f"{name} - Test Accuracy: {test_accuracy * 100:.2f}%")
    print(f"Classification Report (Test Set) for {name}:\n", report)
    print(f"Confusion Matrix (Test Set) for {name}:\n", confusion)



Narrow NN - Validation Accuracy: 82.44%
Narrow NN - Test Accuracy: 81.55%
Classification Report (Test Set) for Narrow NN:
               precision    recall  f1-score   support

           0       0.71      0.93      0.81        56
           1       0.87      0.87      0.87        52
           2       0.81      0.81      0.81        52
           3       1.00      0.63      0.77        46

    accuracy                           0.82       206
   macro avg       0.85      0.81      0.81       206
weighted avg       0.84      0.82      0.81       206

Confusion Matrix (Test Set) for Narrow NN:
 [[52  1  3  0]
 [ 0 45  7  0]
 [ 4  6 42  0]
 [17  0  0 29]]

Medium NN - Validation Accuracy: 73.66%
Medium NN - Test Accuracy: 73.79%
Classification Report (Test Set) for Medium NN:
               precision    recall  f1-score   support

           0       0.70      0.75      0.72        56
           1       0.69      1.00      0.82        52
           2       0.69      0.56      0.62      