In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Generate synthetic data
# A 3-class problem with 1000 samples and 5 features (3 informative, 1 redundant),
# 2 clusters per class. The fixed random_state makes the draw repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=3,
    n_redundant=1,
    n_classes=3,
    n_clusters_per_class=2,
    random_state=42,
)


In [3]:
# Split the data into training and testing sets
# 20% of the rows are held out for testing; stratify=y asks for the class
# proportions of y to be kept in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [4]:
# Scale the features
# The scaler learns its statistics from the training split only; the same
# fitted transform is then applied to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Define classification models
# Maps a display name to an unfitted estimator. Every estimator is given
# random_state=42 so repeated runs of the notebook produce the same fits.
models = {
    # One-vs-rest logistic regression. The `multi_class='ovr'` constructor
    # argument is deprecated in recent scikit-learn releases; wrapping the
    # estimator in OneVsRestClassifier is the documented replacement.
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(random_state=42)),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', random_state=42)
}


In [8]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels handed to ``model.fit``.
    X_test, y_test
        Held-out features to predict on and the true labels to compare against.

    Returns
    -------
    tuple
        ``(accuracy, report, matrix)``: the results of ``accuracy_score``,
        ``classification_report`` and ``confusion_matrix`` (in that order),
        each called with ``y_test`` and the test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Each metric takes (true labels, predicted labels); keep the original order.
    metric_functions = (accuracy_score, classification_report, confusion_matrix)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

In [9]:
# Train and evaluate models
# Build the splitter once: with a fixed random_state it produces the same
# folds for every model, so the cross-validation scores are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = {}
for name, model in models.items():
    # Hold-out evaluation; this fits `model` in place on the scaled training split.
    accuracy, report, cm = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

    # Perform stratified k-fold cross-validation on the unscaled training data,
    # with scaling done inside the pipeline so the scaler is re-fit on each
    # training fold only. Cross-validating X_train_scaled would let the
    # validation folds influence the scaling statistics (data leakage).
    # cross_val_score clones the estimator it is given, so the fitted `model`
    # stored in `models` is not modified here.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    results[name] = {
        "Accuracy": accuracy,
        "Report": report,
        "Confusion Matrix": cm,
        "CV_Accuracy": cv_scores.mean(),
    }



In [10]:
# Print results
# One section per model: hold-out accuracy, mean cross-validation accuracy,
# then the text classification report and the confusion matrix.
print("Model Evaluation Results:")
for name, metrics in results.items():
    # A single print with a newline separator emits the same text as
    # printing each item on its own line.
    print(
        f"\n{name}:",
        f"  Accuracy: {metrics['Accuracy']:.4f}",
        f"  Cross-validation Accuracy: {metrics['CV_Accuracy']:.4f}",
        "  Classification Report:",
        metrics['Report'],
        "  Confusion Matrix:",
        metrics['Confusion Matrix'],
        sep="\n",
    )


Model Evaluation Results:

Logistic Regression:
  Accuracy: 0.6800
  Cross-validation Accuracy: 0.6488
  Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.79      0.70        67
           1       0.60      0.39      0.48        66
           2       0.79      0.85      0.82        67

    accuracy                           0.68       200
   macro avg       0.67      0.68      0.66       200
weighted avg       0.67      0.68      0.67       200

  Confusion Matrix:
[[53  9  5]
 [30 26 10]
 [ 2  8 57]]

Decision Tree:
  Accuracy: 0.8150
  Cross-validation Accuracy: 0.8313
  Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83        67
           1       0.76      0.85      0.80        66
           2       0.89      0.75      0.81        67

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.82       200
weighted avg

In [11]:
# Feature importance for Random Forest
# Fit the forest on the scaled training split (fit returns the estimator
# itself), then print one importance per feature using 1-based labels.
rf_model = models["Random Forest"].fit(X_train_scaled, y_train)
feature_importance = rf_model.feature_importances_

print("\nRandom Forest Feature Importances:")
for i, importance in enumerate(feature_importance):
    print(f"Feature {i+1}: {importance:.4f}")


Random Forest Feature Importances:
Feature 1: 0.2066
Feature 2: 0.2432
Feature 3: 0.3084
Feature 4: 0.0475
Feature 5: 0.1944


In [12]:
# Predictions on new data
# Draw the demo inputs from a seeded generator so the predictions are the same
# on every Restart & Run All; the previous np.random.randn call was unseeded.
rng = np.random.default_rng(42)
new_data = rng.standard_normal((5, 5))  # 5 samples with 5 features
# Apply the scaler that was fitted on the training split.
new_data_scaled = scaler.transform(new_data)

print("\nPredictions on new data:")
# Requires every estimator in `models` to have been fitted already.
for name, model in models.items():
    predictions = model.predict(new_data_scaled)
    print(f"\n{name} predictions:")
    print(predictions)


Predictions on new data:

Logistic Regression predictions:
[0 0 2 0 0]

Decision Tree predictions:
[1 0 1 1 2]

Random Forest predictions:
[0 0 1 1 0]

SVM predictions:
[0 0 1 1 0]
