<a href="https://colab.research.google.com/github/ganashreecs22/ml_lab/blob/main/k-cluster_ND_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Weighted misclassification rate (an error, not an accuracy)
def calculate_weighted_error(y_true, y_pred, weights):
    """Return the weighted fraction of samples whose prediction is wrong.

    Args:
        y_true: Array of reference labels.
        y_pred: Array of predicted labels, same length as ``y_true``.
        weights: Per-sample weights; they do not need to sum to 1 because
            the result is divided by their total.

    Returns:
        Sum of the weights of mismatching samples divided by the sum of all
        weights.
    """
    misclassified = (y_true != y_pred)
    weighted_misses = np.sum(weights * misclassified)
    total_weight = np.sum(weights)
    return weighted_misses / total_weight

# Function to update (and renormalise) the sample weights after a boosting round
def update_weights(weights, alpha, y_true, y_pred):
    """Re-weight samples after one boosting round and renormalise them.

    Misclassified samples are multiplied by ``exp(alpha)`` and correctly
    classified ones by ``exp(-alpha)``. For labels in {-1, +1} this equals
    ``weights * exp(-alpha * y_true * y_pred)``, the update that pairs with
    ``alpha = 0.5 * log((1 - err) / err)``. The result is then scaled to sum
    to 1 so the weights cannot grow without bound over many rounds.

    Args:
        weights: Current per-sample weights.
        alpha: Vote weight of the weak learner that produced ``y_pred``.
        y_true: Reference labels.
        y_pred: Labels predicted by the weak learner.

    Returns:
        New weight array of the same length, summing to 1 whenever the
        un-normalised total is positive and finite.
    """
    weights = np.asarray(weights, dtype=float)
    misclassified = np.asarray(y_true) != np.asarray(y_pred)

    # Up-weight mistakes, down-weight hits.
    new_weights = weights * np.exp(np.where(misclassified, alpha, -alpha))

    # Normalise; skip when the total is degenerate to avoid dividing by 0/inf.
    total = new_weights.sum()
    if total > 0 and np.isfinite(total):
        new_weights = new_weights / total
    return new_weights

# AdaBoost implementation
def adaboost(X, y, n_estimators):
    """Train an AdaBoost ensemble of decision stumps.

    Each round exhaustively searches every feature, every distinct value of
    that feature (used as a threshold) and both polarities for the stump with
    the lowest weighted error, then re-weights the samples.

    Args:
        X: 2-D NumPy array of shape (n_samples, n_features).
        y: 1-D array of labels. The stumps only ever predict -1 or +1, so
            labels must be encoded as -1 / +1 for the error to be meaningful.
        n_estimators: Maximum number of boosting rounds.

    Returns:
        ``(estimators, alphas)`` where ``estimators`` is a list of
        ``(feature, threshold, polarity)`` tuples and ``alphas`` the matching
        vote weights. Fewer than ``n_estimators`` entries are returned when a
        stump separates the training data perfectly, because no further round
        could change the sample weights or the chosen stump.

    Raises:
        ValueError: If ``X`` has no samples or no features.
    """
    n_samples, n_features = X.shape
    if n_samples == 0 or n_features == 0:
        raise ValueError("adaboost requires at least one sample and one feature")

    weights = np.ones(n_samples) / n_samples
    estimators = []
    alphas = []

    for _ in range(n_estimators):
        # Train a weak learner (Decision Stump)
        best_feature, best_threshold, best_polarity, best_error = None, None, None, float('inf')

        for feature in range(n_features):
            column = X[:, feature]

            for threshold in np.unique(column):
                for polarity in (1, -1):
                    # polarity=1: values below the threshold -> -1;
                    # polarity=-1: values above the threshold -> -1.
                    y_pred = np.ones(n_samples)
                    y_pred[polarity * column < polarity * threshold] = -1

                    error = calculate_weighted_error(y, y_pred, weights)

                    if error < best_error:
                        best_feature = feature
                        best_threshold = threshold
                        best_polarity = polarity
                        best_error = error

        # Calculate alpha (model weight); the 1e-10 keeps the ratio finite
        # when the best stump makes no weighted mistakes.
        alpha = 0.5 * np.log((1 - best_error) / (best_error + 1e-10))

        estimators.append((best_feature, best_threshold, best_polarity))
        alphas.append(alpha)

        # A perfect stump misclassifies nothing, so the relative sample
        # weights would stay the same and every later round would select this
        # exact stump again. Stop instead of repeating the search.
        if best_error <= 0:
            break

        # Update weights
        y_pred = np.ones(n_samples)
        y_pred[best_polarity * X[:, best_feature] < best_polarity * best_threshold] = -1
        weights = update_weights(weights, alpha, y, y_pred)

    return estimators, alphas

# Prediction function
def predict(X, estimators, alphas):
    """Predict -1 / +1 labels with a weighted vote of decision stumps.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        estimators: Iterable of ``(feature, threshold, polarity)`` stumps.
        alphas: Vote weight for each stump, in the same order.

    Returns:
        Float array of length n_samples containing only -1.0 and 1.0. A vote
        total of exactly zero (including the case of no estimators) is
        resolved to +1, matching the stumps' own default prediction, so the
        result never contains the non-label value 0 that ``np.sign`` yields.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]
    final_prediction = np.zeros(n_samples)

    for (feature, threshold, polarity), alpha in zip(estimators, alphas):
        prediction = np.ones(n_samples)
        prediction[polarity * X[:, feature] < polarity * threshold] = -1
        final_prediction += alpha * prediction

    # Break ties towards +1 instead of emitting 0.
    return np.where(final_prediction >= 0, 1.0, -1.0)

# Load dataset
# NOTE(review): absolute path at the filesystem root — presumably where the
# CSV was uploaded in the Colab session; adjust when running elsewhere.
iris = pd.read_csv('/Iris.csv')

# Prepare features and target
# The four measurement columns become a plain NumPy array, which adaboost()
# and predict() index with X[:, feature].
X = iris[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values
y = iris['Species']

# Convert target to numerical labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Collapse the encoded classes into a binary one-vs-rest problem: encoded
# class 0 becomes -1 and every other class becomes +1. LabelEncoder numbers
# classes in sorted order, so class 0 is the alphabetically first species
# (presumably Iris-setosa for the standard dataset — verify with le.classes_).
y = np.where(y == 0, -1, 1)  # Convert labels to -1 and 1

# Train-test split
# 30% of the rows are held out; random_state fixes the shuffle so the split
# is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train AdaBoost
n_estimators = 50
estimators, alphas = adaboost(X_train, y_train, n_estimators)

# Make predictions
y_pred = predict(X_test, estimators, alphas)

# Evaluate
# 'weighted' averages the per-class scores weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("AdaBoost Model Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_test, y_pred))

AdaBoost Model Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[19  0]
 [ 0 26]]
Classification Report:
               precision    recall  f1-score   support

          -1       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        26

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score

# K-Means Clustering Implementation
def kmeans(X, n_clusters, max_iters=300, tol=1e-4, seed=42):
    """Cluster the rows of ``X`` with Lloyd's K-Means algorithm.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        n_clusters: Number of clusters; must not exceed n_samples because the
            initial centroids are distinct rows drawn without replacement.
        max_iters: Maximum number of assign/update iterations.
        tol: Stop once the Frobenius norm of the centroid shift is below this.
        seed: Seed for the random choice of initial centroids. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        ``(centroids, cluster_assignments)``: an (n_clusters, n_features)
        array and an (n_samples,) integer array giving, for every sample, the
        index of the nearest returned centroid.
    """
    X = np.asarray(X, dtype=float)
    n_samples, _ = X.shape

    # Randomly initialize cluster centers
    rng = np.random.default_rng(seed=seed)
    centroids = X[rng.choice(n_samples, n_clusters, replace=False)]

    def nearest_centroid(centers):
        # Pairwise Euclidean distances, shape (n_samples, n_clusters).
        distances = np.linalg.norm(X[:, np.newaxis] - centers, axis=2)
        return np.argmin(distances, axis=1)

    for _ in range(max_iters):
        # Assign samples to nearest centroid
        cluster_assignments = nearest_centroid(centroids)

        # Calculate new centroids. A cluster that received no samples keeps
        # its previous centroid; averaging an empty slice would yield NaN and
        # corrupt every later distance computation.
        new_centroids = centroids.copy()
        for k in range(n_clusters):
            members = X[cluster_assignments == k]
            if len(members) > 0:
                new_centroids[k] = members.mean(axis=0)

        # Check for convergence
        if np.linalg.norm(new_centroids - centroids) < tol:
            break

        centroids = new_centroids
    else:
        # Iteration budget exhausted (or max_iters == 0): make sure the
        # returned assignments refer to the centroids actually returned.
        cluster_assignments = nearest_centroid(centroids)

    return centroids, cluster_assignments

# Load dataset
# NOTE(review): absolute path at the filesystem root — presumably where the
# CSV was uploaded in the Colab session; adjust when running elsewhere.
iris = pd.read_csv('/Iris.csv')

# Prepare features
# Only the four measurement columns are clustered; the Species column is not
# passed to kmeans().
X = iris[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values

# Number of clusters
n_clusters = 3

# Apply K-Means
centroids, cluster_assignments = kmeans(X, n_clusters)

# Evaluate clustering using silhouette score
# (mean silhouette coefficient over all samples; higher means better-separated
# clusters)
silhouette_avg = silhouette_score(X, cluster_assignments)

# Print results
print("Centroids:")
print(centroids)
print("\nCluster Assignments:")
print(cluster_assignments)
print("\nSilhouette Score:", silhouette_avg)

# Add cluster assignments to the original dataset
# This adds a 'Cluster' column to the `iris` DataFrame in place. Cluster
# indices are arbitrary and are not aligned with the species labels.
iris['Cluster'] = cluster_assignments
print("\nDataset with Clusters:")
print(iris.head())


Centroids:
[[5.88360656 2.74098361 4.38852459 1.43442623]
 [5.006      3.418      1.464      0.244     ]
 [6.85384615 3.07692308 5.71538462 2.05384615]]

Cluster Assignments:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]

Silhouette Score: 0.5509643746707443

Dataset with Clusters:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4        