SMOTE

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter


In [3]:
df=sns.load_dataset("iris")

In [4]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
(df["species"]).unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [6]:
# Map species to numeric
class_map = {"setosa": 0, "versicolor": 1, "virginica": 2}
class_unmap = {0:"setosa",1: "versicolor", 2:"virginica"}
df["species"] = df["species"].replace(class_map)

  df["species"] = df["species"].replace(class_map)


# model

In [7]:
def linear_regression(X, y, alpha=0.1, epochs=1000):
    n_samples, n_features = X.shape
    w = np.zeros((n_features, 1))       # column vector
    y = y.reshape(-1, 1)                # ensure column vector
    loss_history = []

    for _ in range(epochs):
        y_pred = X @ w                  # (n_samples,1)
        error = y_pred - y              # (n_samples,1)
        gradient = (1 / n_samples) * (X.T @ error)  # (n_features,1)
        w -= alpha * gradient           # update
        loss = (1 / (2 * n_samples)) * np.sum(error ** 2)
        loss_history.append(loss)

    return w, loss_history

In [8]:
# Prediction
def predict(X, w):
    # X_b = np.c_[X, np.ones(len(X))]
    return X.dot(w)

# Convert regression outputs → class labels
def classify(preds):
    return np.round(preds).astype(int).clip(0, 2)

In [9]:
# Evaluation metrics
def evaluate(y_true, y_pred, name="Model"):
    accuracy = np.mean(y_true == y_pred)
    precision_list, recall_list, f1_list = [], [], []

    for c in np.unique(y_true):
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))
        fn = np.sum((y_pred != c) & (y_true == c))

        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {np.mean(precision_list):.2f}")
    print(f"Recall: {np.mean(recall_list):.2f}")
    print(f"F1 Score: {np.mean(f1_list):.2f}")


In [10]:
def evaluate_model(X, y, w, name="Model"):
    predictions = [sum(w[j] * x[j] for j in range(len(x))) + w[-1] for x in X]
    avg_error = np.mean(np.abs(np.array(predictions) - y))
    print(f"{name} Average Absolute Error: {avg_error:.2f}")

# split

In [11]:
def one_hot_encode(labels):
    classes = sorted(set(labels))
    class_to_idx = {c: i for i, c in enumerate(classes)}
    Y = np.zeros((len(labels), len(classes)))
    for i, label in enumerate(labels):
        Y[i, class_to_idx[label]] = 1
    return Y, class_to_idx

In [12]:
def split(X, y, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1."

    np.random.seed(42)
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    # Ensure NumPy arrays
    X = np.array(X)
    y = np.array(y)

    X = X[indices]
    y = y[indices]

    n_train = int(train_ratio * len(X))
    n_val = int(val_ratio * len(X))

    X_train, y_train = X[:n_train], y[:n_train]
    X_val, y_val = X[n_train:n_train+n_val], y[n_train:n_train+n_val]
    X_test, y_test = X[n_train+n_val:], y[n_train+n_val:]

    return X_train, y_train, X_val, y_val, X_test, y_test


In [13]:
def remove(X_train,y_train,remove_class,frac_remove = 0.5):
    
    mask = np.ones(len(y_train), dtype=bool)
    cls_idx = np.where(y_train == remove_class)[0]
    remove_n = int(len(cls_idx) * frac_remove)
    remove_idx = np.random.choice(cls_idx, remove_n, replace=False)
    mask[remove_idx] = False
    X_train_reduced, y_train_reduced = X_train[mask], y_train[mask]

    return X_train_reduced, y_train_reduced

In [None]:
def smote(X, y, k=5): #Synthetic Minority Over-sampling Technique
    from sklearn.neighbors import NearestNeighbors
    counts = Counter(y) #each class ka max samples
    max_count = max(counts.values())  
    new_X, new_y = [X], [y]
    for cls in counts:
        n_samples = counts[cls]
        if n_samples < max_count:
            X_cls = X[y==cls]
            n_to_generate = max_count - n_samples
            # Nearest neighbors
            nn = NearestNeighbors(n_neighbors=min(k, len(X_cls))).fit(X_cls)
            synth = []
            for _ in range(n_to_generate):
                i = np.random.randint(0, len(X_cls))
                x = X_cls[i]
                nn_idx = nn.kneighbors([x], return_distance=False)[0]
                x_nn = X_cls[np.random.choice(nn_idx)]
                lam = np.random.rand()
                new_point = x + lam*(x_nn - x)  #x+λ(xnn-x)
                synth.append(new_point)
            new_X.append(np.array(synth))
            new_y.append(np.full(n_to_generate, cls))
    return np.vstack(new_X), np.hstack(new_y)

In [15]:
X=df.drop(columns="species")
y=df["species"]

In [16]:
Xcol=X.columns
# ycol=y.columns


In [17]:
X_train, y_train, X_val, y_val, X_test, y_test = split(X, y)

In [18]:
X.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [20]:
X2 ,y2 = smote(X,y,2)

In [21]:
X2=pd.DataFrame(X2,columns=Xcol)

In [22]:
X2.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [23]:
X_train_reduced, y_train_reduced= remove(X_train,y_train,remove_class=2,frac_remove = 0.5) 

In [24]:
print("Before SMOTE:", Counter(y_train_reduced))

Before SMOTE: Counter({np.int64(0): 40, np.int64(1): 33, np.int64(2): 16})


In [25]:
X_train_bal, y_train_bal = smote(X_train_reduced, y_train_reduced,2)

In [26]:
print("After SMOTE:", Counter(y_train_bal))


After SMOTE: Counter({np.int64(1): 40, np.int64(0): 40, np.int64(2): 40})


In [27]:
w,_ = linear_regression(X_train_bal, y_train_bal,alpha=0.01, epochs=2000)


In [30]:
labels=y.unique()

In [31]:
acc_val = np.mean(classify(predict(X_val, w)) == y_val)
acc_test = np.mean(classify(predict(X_test, w)) == y_test)
acc_train = np.mean(classify(predict(X_train, w)) == y_test)


print("Validation accuracy:", acc_val)
print("Test accuracy:", acc_test)
print("Train accuracy:", acc_train)


Validation accuracy: 0.359504132231405
Test accuracy: 0.3686200378071834
Train accuracy: 0.32919254658385094
