In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

# ------------ Load dataset ------------
# Only two columns of the CSV are kept: v1 (renamed to "label") and
# v2 (renamed to "text").
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "text"})
)

# ------------ Convert label spam=1, ham=0 ------------
# Any value other than 'spam' / 'ham' would come out of the mapping as NaN.
df = df.assign(label=df['label'].map({'spam': 1, 'ham': 0}))

# ------------ Text cleaning ------------
def clean_text(t):
    """Normalise one message before it is vectorised.

    The text is lower-cased, then every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted.

    Parameters
    ----------
    t : str
        Raw message text.

    Returns
    -------
    str
        Lower-cased text with punctuation stripped; whitespace and
        underscores are left exactly as they were.
    """
    lowered = t.lower()
    # [^\w\s] matches anything that is not a word character or whitespace.
    return re.sub(r'[^\w\s]', '', lowered)

df['text'] = df['text'].apply(clean_text)

# ------------ Train–test split ------------
# Split the cleaned text BEFORE vectorising, so the TF-IDF vocabulary and IDF
# weights are learned from training messages only. Fitting the vectorizer on
# the full corpus would leak test-set statistics into the features.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# ------------ TF-IDF Vectorization ------------
tfidf = TfidfVectorizer(stop_words='english')
X_train = tfidf.fit_transform(text_train)  # learn vocabulary + IDF on train only
X_test = tfidf.transform(text_test)        # project test text onto that vocabulary

# Whole corpus in the same (train-fitted) feature space; kept so the name `X`
# stays defined. It is not used for training.
X = tfidf.transform(df['text'])

# ------------ Class distribution ------------
print("Class distribution:")
print(df['label'].value_counts())


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline weak learner: a single depth-1 tree on the TF-IDF features.
stump = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)

# Test-set predictions are computed once and reused for accuracy and the
# confusion matrix.
y_pred = stump.predict(X_test)

# Plain (unweighted) accuracy on each split.
train_acc = accuracy_score(y_train, stump.predict(X_train))
test_acc = accuracy_score(y_test, y_pred)

print("\n=== Decision Stump Performance ===")
print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

print("\nWhy stump performs poorly?")
print("Because text data is high-dimensional & nonlinear; a depth-1 stump cannot split complex patterns.")


In [None]:
import math
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# ----- Manual AdaBoost with decision stumps -----
T = 15                      # number of boosting rounds
n = X_train.shape[0]
weights = np.ones(n) / n    # start from a uniform distribution over samples

# The AdaBoost update needs labels AND predictions in {-1, +1}. A plain NumPy
# array is used so every operation below is positional (no pandas index
# alignment against the shuffled training index).
y_signed = np.where(np.asarray(y_train) == 1, 1, -1)

alphas = []   # vote weight of each round's stump
errors = []   # weighted training error of each round's stump
stumps = []   # the fitted stumps themselves, reused at prediction time

for t in range(T):

    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X_train, y_train, sample_weight=weights)
    pred = stump.predict(X_train)

    # Stump outputs are {0, 1}; map them to {-1, +1} before using them in the
    # exponent. With raw 0/1 outputs, y * pred is 0 whenever pred == 0, so
    # those samples would never be reweighted.
    pred_signed = np.where(pred == 1, 1, -1)
    miss = pred_signed != y_signed

    # weighted error
    err = np.sum(weights[miss])

    # keep err strictly inside (0, 1) so the log below is always finite
    err = min(max(err, 1e-10), 1 - 1e-10)

    alpha = 0.5 * np.log((1 - err) / err)

    # Print iteration output
    print(f"\n=== Iteration {t+1} ===")
    print("Weighted error:", err)
    print("Alpha:", alpha)

    misclassified_idx = np.where(miss)[0]
    print("Misclassified sample indices:", misclassified_idx)
    print("Weights of misclassified samples:", weights[misclassified_idx][:10])

    # update weights: misclassified samples (y * h = -1) are scaled up,
    # correctly classified ones (y * h = +1) are scaled down
    weights = weights * np.exp(-alpha * y_signed * pred_signed)
    weights /= np.sum(weights)

    alphas.append(alpha)
    errors.append(err)
    stumps.append(stump)

# ----- Final boosted model prediction -----
def predict_boost(X):
    """Predict 0/1 labels with the boosted ensemble trained above.

    Each stored stump casts a {-1, +1} vote scaled by its alpha; the sign of
    the weighted sum decides the class.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Features in the same space the stumps were trained on.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        1 where the weighted vote is strictly positive, otherwise 0.
    """
    final = np.zeros(X.shape[0])
    # Reuse the stumps fitted during boosting; refitting here would not
    # reproduce the models that the alphas were computed for.
    for alpha, fitted_stump in zip(alphas, stumps):
        final += alpha * np.where(fitted_stump.predict(X) == 1, 1, -1)
    return (final > 0).astype(int)

y_pred_train = predict_boost(X_train)
y_pred_test = predict_boost(X_test)

print("\n=== Manual AdaBoost Results ===")
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

# ----- Plots -----
# 1-based x positions so the plots line up with the "Iteration k" printouts.
rounds = np.arange(1, len(errors) + 1)

fig, ax = plt.subplots()
ax.plot(rounds, errors, marker='o')
ax.set_title("Iteration vs Weighted Error")
ax.set_xlabel("Iteration")
ax.set_ylabel("Weighted error")
plt.show()

fig, ax = plt.subplots()
ax.plot(rounds, alphas, marker='o')
ax.set_title("Iteration vs Alpha")
ax.set_xlabel("Iteration")
ax.set_ylabel("Alpha")
plt.show()


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# The weak learner is left at AdaBoostClassifier's default, which is already
# DecisionTreeClassifier(max_depth=1). The keyword for passing it explicitly
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so omitting it works on every version.
ada = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.6,
    random_state=42,  # reproducible run-to-run
)

ada.fit(X_train, y_train)

print("\n=== Sklearn AdaBoost ===")
print("Train Accuracy:", ada.score(X_train, y_train))
print("Test Accuracy:", ada.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, ada.predict(X_test)))

print("\nComparison:")
print("Sklearn AdaBoost is more stable and usually more accurate than manual implementation.")


In [None]:
# Load a tabular heart-disease dataset and fit a depth-1 tree as the weak baseline.
# NOTE(review): `load_heart_disease` does not appear among scikit-learn's
# documented dataset loaders — confirm this import resolves in the target
# environment; if it does not, this cell fails before anything below runs.
from sklearn.datasets import load_heart_disease
data = load_heart_disease()

# Rebinds X and y, replacing the spam TF-IDF features defined in earlier cells.
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# 80/20 train-test split with a fixed seed; this also rebinds
# X_train / X_test / y_train / y_test for the cells that follow.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single decision stump (one split on one feature) as the baseline.
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X_train, y_train)

print("\nWeak Stump — Heart Disease")
print("Train Accuracy:", stump.score(X_train, y_train))
print("Test Accuracy:", stump.score(X_test, y_test))
print("Confusion:\n", confusion_matrix(y_test, stump.predict(X_test)))


In [None]:
# Grid of ensemble sizes and learning rates to compare.
n_estimators_list = [5,10,25,50,100]
learning_rates = [0.1, 0.5, 1.0]

# results[lr] is a list of accuracies, one per entry of n_estimators_list
# (same order).
results = {}

for lr in learning_rates:
    results[lr] = []
    for ne in n_estimators_list:
        # Weak learner left at the default (a depth-1 decision tree): the
        # `base_estimator` keyword was removed in scikit-learn 1.4 in favour
        # of `estimator`, and the default works on every version.
        ada = AdaBoostClassifier(
            n_estimators=ne,
            learning_rate=lr,
            random_state=42,  # same seed for every grid point -> comparable runs
        )
        ada.fit(X_train, y_train)
        # Accuracy is measured on the test split, so picking a setting from
        # these numbers gives an optimistic estimate of its performance.
        acc = ada.score(X_test, y_test)
        results[lr].append(acc)

print("\nHyperparameter Results:")
print(results)


In [None]:
# Hand-picked configuration (hard-coded, not computed from `results`).
best_lr = 1.0
best_ne = 100

# Weak learner left at the default (a depth-1 decision tree): the
# `base_estimator` keyword was removed in scikit-learn 1.4 in favour of
# `estimator`, and the default works on every version.
ada_best = AdaBoostClassifier(
    n_estimators=best_ne,
    learning_rate=best_lr,
    random_state=42,
)
ada_best.fit(X_train, y_train)

# Unweighted training error of each individual weak learner, in boosting order.
errors = [
    np.mean(est.predict(X_train) != y_train)
    for est in ada_best.estimators_
]

fig, ax = plt.subplots()
ax.plot(np.arange(1, len(errors) + 1), errors, marker='o')
ax.set_title("Weak Learner Error vs Iteration")
ax.set_xlabel("Iteration")
ax.set_ylabel("Training error of weak learner")
plt.show()

# `estimator_weights_` holds one weight per weak learner (its say in the final
# vote), not per-sample weights, so the figure is titled accordingly.
fig, ax = plt.subplots()
ax.hist(ada_best.estimator_weights_)
ax.set_title("AdaBoost Estimator Weights")
ax.set_xlabel("Estimator weight")
ax.set_ylabel("Number of estimators")
plt.show()


In [None]:
# Pair each column of X with the importance the boosted ensemble assigned to
# it, then order the table from most to least important.
importances = ada_best.feature_importances_
feat_imp = pd.DataFrame({"feature": X.columns, "importance": importances})
feat_imp = feat_imp.sort_values(by="importance", ascending=False)

print("\nTop 5 Important Features:")
print(feat_imp.head(5))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# -------------------- Load Dataset --------------------
file = "WISDM_ar_v1.1_raw.txt"

cols = ["user", "activity", "timestamp", "x", "y", "z"]
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True and parses identically.
# NOTE(review): the published WISDM raw file is, as far as I know,
# comma-separated with a trailing ';' on each record — confirm the delimiter
# against the actual file; a whitespace split would then yield no usable rows.
df = pd.read_csv(file, header=None, names=cols, sep=r"\s+")

# Keep only the activity name and the accelerometer columns. The explicit
# copy makes the column assignments below operate on an independent frame.
df = df[["activity", "x", "y", "z"]].copy()

# Remove missing/dirty rows: cells that are just ';' become NaN, stray
# trailing ';' characters are stripped from the readings, anything that still
# is not numeric is coerced to NaN, and incomplete rows are dropped.
df = df.replace(";", np.nan)
for axis in ("x", "y", "z"):
    df[axis] = pd.to_numeric(
        df[axis].astype(str).str.rstrip(";"), errors="coerce"
    )
df = df.dropna()

# -------------------- Binary Label: vigorous vs light --------------------
vigorous = ["Jogging", "Upstairs"]
# `light` documents the remaining classes; every activity that is not in
# `vigorous` gets label 0.
light = ["Walking", "Sitting", "Standing", "Downstairs"]

df["label"] = df["activity"].isin(vigorous).astype(int)

X = df[["x", "y", "z"]]
y = df["label"]

# Train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Dataset shape:", df.shape)
print(df["label"].value_counts())


In [None]:
# Baseline: one depth-1 tree on the three accelerometer axes.
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X_train, y_train)

# Predictions for both splits, in (train, test) order.
y_pred_train, y_pred_test = (
    stump.predict(features) for features in (X_train, X_test)
)

print("\n=== Decision Stump Results ===")
for caption, truth, guess in (
    ("Train Accuracy:", y_train, y_pred_train),
    ("Test Accuracy:", y_test, y_pred_test),
):
    print(caption, accuracy_score(truth, guess))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

print("\nInterpretation:")
for sentence in (
    "A stump can only split based on one threshold,",
    "So it struggles with complex continuous sensor data.",
):
    print(sentence)


In [None]:
T = 20                      # number of boosting rounds
n = X_train.shape[0]

weights = np.ones(n) / n    # uniform starting distribution over samples

alphas = []                 # vote weight of each round's stump
errors = []                 # weighted training error per round
stumps = []                 # fitted stumps, reused at prediction time
misclassified_history = []  # positional indices of misclassified samples per round
weight_history = []         # sample weights each round's stump was trained with

# Convert labels from {0,1} → {-1,+1} for Adaboost math
y_train_mod = y_train.replace({0: -1, 1: 1})
# Positional NumPy view of the signed labels. Mixing the NumPy `weights` with
# a pandas Series would turn `weights` into a Series carrying the shuffled
# training index, after which `weights[mc]` does label lookup, not positional.
y_signed = y_train_mod.to_numpy()

for t in range(T):

    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X_train, y_signed, sample_weight=weights)

    pred = stump.predict(X_train)   # values in {-1, +1}
    miss = pred != y_signed

    # weighted error, kept strictly inside (0, 1) so the log stays finite
    err = np.sum(weights[miss])
    err = min(max(err, 1e-10), 1 - 1e-10)

    alpha = 0.5 * np.log((1 - err) / err)

    # Print required outputs
    print(f"\n===== Round {t+1} =====")
    print("Weighted Error:", err)
    print("Alpha:", alpha)

    mc = np.where(miss)[0]
    print("Misclassified Indices:", mc[:20])
    print("Weights of Misclassified Samples:", weights[mc][:10])

    # Save history for plots
    errors.append(err)
    alphas.append(alpha)
    stumps.append(stump)
    misclassified_history.append(mc)
    weight_history.append(weights.copy())

    # Update weights: y * h is -1 for mistakes (weight grows) and +1 for
    # correct predictions (weight shrinks); then renormalise to sum to 1.
    weights = weights * np.exp(-alpha * y_signed * pred)
    weights = weights / np.sum(weights)

# ---------- Final strong classifier ----------
def predict_manual_boost(X):
    """Predict 0/1 labels with the boosted ensemble trained above.

    Sums the {-1, +1} outputs of the stored stumps, each scaled by its alpha,
    and thresholds the total at zero.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Features in the same layout as the training data.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        1 where the weighted vote is strictly positive, otherwise 0.
    """
    final_pred = np.zeros(X.shape[0])
    # Reuse the stumps fitted during boosting instead of refitting them:
    # refitting is wasted work and, without a fixed seed, is not guaranteed
    # to reproduce the model each alpha was computed for.
    for alpha, fitted_stump in zip(alphas, stumps):
        final_pred += alpha * fitted_stump.predict(X)
    return (final_pred > 0).astype(int)

y_pred_train = predict_manual_boost(X_train)
y_pred_test = predict_manual_boost(X_test)

print("\n=== Manual AdaBoost Final Results ===")
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Confusion:\n", confusion_matrix(y_test, y_pred_test))


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# The weak learner is left at AdaBoostClassifier's default, which is already
# DecisionTreeClassifier(max_depth=1). The keyword for passing it explicitly
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so omitting it works on every version.
ada = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,  # reproducible run-to-run
)

ada.fit(X_train, y_train)

print("\n=== Sklearn AdaBoost ===")
print("Train Accuracy:", ada.score(X_train, y_train))
print("Test Accuracy:", ada.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, ada.predict(X_test)))

print("\nComparison:")
print("Sklearn AdaBoost is more stable because:")
print("✔ Perfect weight normalization")
print("✔ Better stump-based sampling")
print("✔ Optimized implementation")
