In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the APS failure training and test sets; the literal string 'na' in the
# CSVs is parsed as a missing value.
try:
    train_df = pd.read_csv('data/aps_failure_training_set.csv', na_values='na')
    test_df = pd.read_csv('data/aps_failure_test_set.csv', na_values='na')
    print("--- Datasets loaded successfully ---")
    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
except FileNotFoundError:
    print("\n---! ERROR !---")
    print("Dataset files not found. Please ensure your folder structure is correct.")
    print("The script expects a 'data' folder in the same directory, containing the CSV files.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down, whereas re-raising halts execution here with the original
    # traceback so later cells never run against missing frames.
    raise

In [None]:
# Stack the training rows on top of the test rows; ignore_index=True discards
# the per-file row labels and gives the combined frame one fresh running index.
combined_df = pd.concat(
    objs=[train_df, test_df],
    ignore_index=True,
)

In [None]:
# Map target variable 'class' to numerical values
class_mapping = {'neg': 0, 'pos': 1}

# Only map while the column still holds the raw labels. Once it is numeric the
# mapping has already been applied; running .map() again would look up 0/1 in a
# dict keyed by 'neg'/'pos' and silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(combined_df['class']):
    combined_df['class'] = combined_df['class'].map(class_mapping)

print("\n--- Target variable 'class' mapped to 0s and 1s ---")

In [None]:
# Separate the label column (y) from the remaining feature columns (X).
y = combined_df['class']
X = combined_df.drop(columns=['class'])

# Check for class imbalance: share of rows belonging to each class.
class_shares = y.value_counts(normalize=True)
print("\n--- Initial Class Distribution ---")
print(class_shares)

In [None]:
# Learn the per-column medians from the training rows only. X stacks the
# training rows first and the test rows after them, so the first len(train_df)
# rows are the training set. Fitting on all of X would let test-set statistics
# leak into the values imputed for the training features.
imputer = SimpleImputer(strategy='median')
imputer.fit(X.iloc[:len(train_df)])

# Fill missing values in every row (train and test) with the training medians.
X_imputed = imputer.transform(X)

# Convert back to a DataFrame to keep column names
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

print("\n--- Missing values handled using Median Imputation ---")


In [None]:
# Undo the earlier stacking: the first train_len rows are the training set,
# everything after them is the held-out test set.
train_len = len(train_df)
X_train_orig, X_test = X_imputed_df.iloc[:train_len], X_imputed_df.iloc[train_len:]
y_train_orig, y_test = y.iloc[:train_len], y.iloc[train_len:]

# --- Apply SMOTE only to the training data ---
# The test rows are never passed to the resampler.
print("\n--- Applying SMOTE to the training data to handle class imbalance... ---")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_orig, y_train_orig)
print("--- SMOTE applied successfully ---")

# Report the class shares of the resampled training labels.
resampled_shares = pd.Series(y_train_smote).value_counts(normalize=True)
print("\n--- Class Distribution After SMOTE ---")
print(resampled_shares)

In [None]:
# Fit the scaler on the resampled training features only and reuse those
# statistics for the test rows; fitting on training data alone is what keeps
# test information out of the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_smote)

# Build the scaled test matrix from the unscaled imputed frame rather than from
# X_test itself: overwriting X_test with a transform of X_test would scale the
# test features a second time whenever this cell is re-run.
X_test = scaler.transform(X_imputed_df.iloc[train_len:])

In [None]:
# Candidate classifiers, all seeded for reproducibility.
# NOTE: XGBClassifier no longer takes `use_label_encoder`; the argument is
# deprecated and newer XGBoost releases only warn that it is unused, so it is
# omitted here. The labels are already encoded as 0/1.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# Maps model name -> F1 score on the test set.
results = {}

print("\n--- Training and evaluating models... ---")
for name, model in models.items():
    # Train on the SMOTE-balanced, scaled training data
    model.fit(X_train, y_train_smote)

    # Make predictions on the untouched (scaled) test set
    y_pred = model.predict(X_test)

    # Calculate F1 score (f1_score scores the positive class, label 1, by default)
    f1 = f1_score(y_test, y_pred)
    results[name] = f1

    print(f"\n--- Model: {name} ---")
    print(f"F1-Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Store and display confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix - {name}')
    # You can save this figure to your 'images/' folder
    # plt.savefig(f'images/confusion_matrix_{name.replace(" ", "_")}.png')
    plt.show()


In [None]:
print("\n--- Final Model Comparison (based on F1-Score) ---")

# One row per model, ordered from highest to lowest F1 score.
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['F1-Score'])
results_df = results_df.sort_values(by='F1-Score', ascending=False)
print(results_df)

# After the descending sort the first row is the winner.
best_model_name = results_df.index[0]
best_model_score = results_df['F1-Score'].iloc[0]
print(f"\nBest performing model is '{best_model_name}' with an F1-Score of {best_model_score:.4f}.")
print("\n--- Project 1 script finished ---")