### Setup and Common Code

In [14]:
# Import all necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Import Datasets
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, make_classification

# Function to quickly evaluate a model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, predictions))

### Model-Specific Code

In [15]:
# Option A: Dataset 1 - load_iris() (Multi-class)
data = load_iris()
X, y = data.data, data.target
dataset_name = "Iris"

# Option B: Dataset 2 - load_wine() (Multi-class)
data = load_wine()
X, y = data.data, data.target
dataset_name = "Wine"

# Option C: Dataset 3 - load_breast_cancer() (Binary)
data = load_breast_cancer()
X, y = data.data, data.target
dataset_name = "Breast Cancer"

# Option D: Dataset 4 - make_classification() (Customizable)
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
dataset_name = "Synthetic Classification"

# Classification Models

### 1. Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Load data (Choose one from Options A, B, C, or D above)
data = load_iris()
X, y = data.data, data.target
dataset_name = "Iris"

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Logistic Regression on {dataset_name}:")
model = LogisticRegression(max_iter=100, random_state=42) # max_iter increased for convergence
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Logistic Regression on Iris:
Accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



### 2. Support Vector Classifier (SVC)

In [17]:
# Note: SVMs often benefit from feature scaling.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline that scales the data then applies SVC
print(f"Support Vector Classifier (SVC) on {dataset_name}:")
model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', random_state=42)) # 'rbf' kernel is common for non-linear problems
])
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Support Vector Classifier (SVC) on Iris:
Accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



### 3. Decision Tree Classifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Decision Tree Classifier on {dataset_name}:")
model = DecisionTreeClassifier(max_depth=4, random_state=42) # Limiting depth prevents overfitting
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Decision Tree Classifier on Iris:
Accuracy: 0.9333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.90      0.90      0.90        10
           2       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



### 4. Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Random Forest Classifier on {dataset_name}:")
model = RandomForestClassifier(n_estimators=100, random_state=42) # n_estimators = number of trees
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Random Forest Classifier on Iris:
Accuracy: 0.9000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.82      0.90      0.86        10
           2       0.89      0.80      0.84        10

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30



### 5. Gradient Boosting Classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Gradient Boosting Classifier on {dataset_name}:")
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Gradient Boosting Classifier on Iris:
Accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



### 6. K-Nearest Neighbors Classifier (KNN)

In [21]:
# Note: KNN also benefits greatly from feature scaling.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline that scales the data then applies KNN
print(f"K-Nearest Neighbors Classifier (KNN) on {dataset_name}:")
model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)) # n_neighbors is the 'k' value
])
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

K-Nearest Neighbors Classifier (KNN) on Iris:
Accuracy: 0.9333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.83      1.00      0.91        10
           2       1.00      0.80      0.89        10

    accuracy                           0.93        30
   macro avg       0.94      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



### 7. Gaussian Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Gaussian Naive Bayes on {dataset_name}:")
model = GaussianNB()
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Gaussian Naive Bayes on Iris:
Accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



### 8. Linear Discriminant Analysis (LDA)

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create, train, and evaluate the model
print(f"Linear Discriminant Analysis (LDA) on {dataset_name}:")
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Linear Discriminant Analysis (LDA) on Iris:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## Complete Example Run

In [28]:
# --- COMPLETE EXAMPLE: All 4 datasets ---
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, make_classification
import warnings

warnings.filterwarnings('ignore')

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, predictions))

# Define the datasets
datasets = {
    "Iris": load_iris(),
    "Wine": load_wine(),
    "Breast Cancer": load_breast_cancer(),
    "Synthetic": make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
}

all_models = {}

# Initialize the models
model = LogisticRegression(max_iter=100, random_state=42) # max_iter increased for convergence
all_models.update({'Logistic Regression' : model})

model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', random_state=42)) # 'rbf' kernel is common for non-linear problems
])
all_models.update({'Support Vector Classifier (SVC)' : model})

model = DecisionTreeClassifier(max_depth=4, random_state=42) # Limiting depth prevents overfitting
all_models.update({'Decision Tree Classifier' : model})

model = RandomForestClassifier(n_estimators=100, random_state=42) # n_estimators = number of trees
all_models.update({'Random Forest Classifier' : model})

model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
all_models.update({'Gradient Boosting Classifier' : model})

model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)) # n_neighbors is the 'k' value
])
all_models.update({'K-Nearest Neighbors Classifier (KNN)' : model})

model = GaussianNB()
all_models.update({'Gaussian Naive Bayes' : model})

model = LinearDiscriminantAnalysis()
all_models.update({'Linear Discriminant Analysis (LDA)' : model})

# Train and evaluate on each dataset
for model_name, model in all_models.items(): 
    print(f"{model_name}")
    for name, data in datasets.items():
        if name == "Synthetic":
            X, y = data
        else:
            X, y = data.data, data.target
            
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        model.fit(X_train, y_train)
        
        print(f"{name} Dataset:")
        evaluate_model(model, X_test, y_test)
    print("*"*75)

Logistic Regression
Iris Dataset:
Accuracy: 0.9667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

Wine Dataset:
Accuracy: 0.9722
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.93      1.00      0.97        14
           2       1.00      0.90      0.95        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36

Breast Cancer Dataset:
Accuracy: 0.9649
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        42
       