In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Purpose of this cell: compare a decision tree trained on the original
# (imbalanced) training data against one trained on a SMOTE-oversampled copy,
# evaluating both on the same untouched test split.

# Load the dataset
file_path = 'hdddata.csv'
df = pd.read_csv(file_path)

# Separate features (X) and target variable (y)
X = df.drop('failure', axis=1)
y = df['failure']

# Split the data into training and testing sets.
# This happens BEFORE any preprocessing so that no statistic computed from the
# held-out rows can leak into training. The row partition depends only on the
# number of samples and random_state, so it is the same split as before.
# NOTE(review): the split is not stratified; with a rare positive class,
# passing stratify=y may be worth considering -- confirm before changing,
# since it alters which rows land in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle NaN values using SimpleImputer (replace NaN with the column mean).
# The means are learned from the training rows only and then re-used, unchanged,
# to fill the test rows. Both splits are imputed exactly once.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train a decision tree classifier on the original dataset
original_model = DecisionTreeClassifier(random_state=42)
original_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_original = original_model.predict(X_test)

# Evaluate the performance of the original model
print("Results before SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_original))
print("Classification Report:\n", classification_report(y_test, y_pred_original))

# Apply SMOTE to balance the dataset.
# Only the training split is resampled; the test split keeps its original
# class distribution so both models are scored on identical data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train a decision tree classifier on the SMOTE dataset
smote_model = DecisionTreeClassifier(random_state=42)
smote_model.fit(X_train_smote, y_train_smote)

# Make predictions on the test set using the model trained on SMOTE dataset.
# X_test was already imputed above, so it is used as-is here.
y_pred_smote = smote_model.predict(X_test)

# Evaluate the performance of the model after SMOTE
print("\nResults after SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print("Classification Report:\n", classification_report(y_test, y_pred_smote))


Results before SMOTE:
Accuracy: 0.9990133656072406
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     73447
           1       0.99      0.98      0.99      2569

    accuracy                           1.00     76016
   macro avg       0.99      0.99      0.99     76016
weighted avg       1.00      1.00      1.00     76016


Results after SMOTE:
Accuracy: 0.9858713954956851
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     73447
           1       0.79      0.79      0.79      2569

    accuracy                           0.99     76016
   macro avg       0.89      0.89      0.89     76016
weighted avg       0.99      0.99      0.99     76016





In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Purpose of this cell: compare a random forest trained on the original
# (imbalanced) training data against one trained on a SMOTE-oversampled copy,
# evaluating both on the same untouched test split.

# Load the dataset
file_path = 'hdddata.csv'
df = pd.read_csv(file_path)

# Separate features (X) and target variable (y)
X = df.drop('failure', axis=1)
y = df['failure']

# Split the data into training and testing sets.
# Splitting comes first so the imputation statistics below are computed
# without ever seeing the held-out rows. The row partition depends only on the
# number of samples and random_state, so it is the same split as before.
# NOTE(review): the split is not stratified; with a rare positive class,
# passing stratify=y may be worth considering -- confirm before changing,
# since it alters which rows land in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle NaN values using SimpleImputer (replace NaN with the column mean).
# Fit on the training rows only; the learned means are then applied to the
# test rows. Both splits are imputed exactly once.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train a Random Forest classifier on the original dataset
original_model = RandomForestClassifier(random_state=42)
original_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_original = original_model.predict(X_test)

# Evaluate the performance of the original model
print("Results before SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_original))
print("Classification Report:\n", classification_report(y_test, y_pred_original))

# Apply SMOTE to balance the dataset.
# Only the training split is resampled; the test split keeps its original
# class distribution so both models are scored on identical data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train a Random Forest classifier on the SMOTE dataset
smote_model = RandomForestClassifier(random_state=42)
smote_model.fit(X_train_smote, y_train_smote)

# Make predictions on the test set using the model trained on SMOTE dataset.
# X_test was already imputed above, so it is used as-is here.
y_pred_smote = smote_model.predict(X_test)

# Evaluate the performance of the model after SMOTE
print("\nResults after SMOTE:")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print("Classification Report:\n", classification_report(y_test, y_pred_smote))


Results before SMOTE:
Accuracy: 0.9992106924857924
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     73447
           1       1.00      0.98      0.99      2569

    accuracy                           1.00     76016
   macro avg       1.00      0.99      0.99     76016
weighted avg       1.00      1.00      1.00     76016






Results after SMOTE:
Accuracy: 0.9986187118501368
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     73447
           1       1.00      0.96      0.98      2569

    accuracy                           1.00     76016
   macro avg       1.00      0.98      0.99     76016
weighted avg       1.00      1.00      1.00     76016

