In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import re
import string
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# Load the SMS spam CSV, keep only the label/message columns (v1, v2), give
# them readable names and encode the label as 1 = spam, 0 = ham.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'spam': 1, 'ham': 0}))
)

# Quick sanity check: first rows, overall shape and class balance.
print(df.head(), df.shape, df['label'].value_counts(), sep='\n')

# English stop words as a set for O(1) membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stopword_set=None):
    """Lower-case a message, drop stop words and strip punctuation.

    Stop words are matched both against the punctuation-free token and
    against the token with only its surrounding punctuation removed. The
    second check is what lets contraction stop words such as "don't" match:
    stripping every apostrophe first turns them into "dont", which is not in
    the stop-word list and would otherwise survive.

    Parameters
    ----------
    text : str
        Raw message text.
    stopword_set : collection of str, optional
        Lower-case stop words to remove. When omitted, the module-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The remaining tokens, punctuation-free, joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for token in text.lower().split():
        # Same character filter as before, applied per token.
        bare = re.sub(r'[^\w\s]', '', token)
        if not bare:
            # Token was pure punctuation (e.g. "...").
            continue
        if bare in stopword_set:
            continue
        if token.strip(string.punctuation) in stopword_set:
            # Catches "don't", "you're," etc. whose bare form is not listed.
            continue
        kept.append(bare)
    return ' '.join(kept)

df['cleaned_text'] = df['text'].apply(preprocess_text)

y = df['label'].values

# Split row positions first so the vectoriser can be fitted on training rows
# only. The split settings (test_size, random_state, stratify) are unchanged.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y)

# Fit the vocabulary and IDF weights on the training messages only; fitting on
# the full corpus would leak test-set statistics into the features.
tfidf = TfidfVectorizer(max_features=3000)
tfidf.fit(df['cleaned_text'].iloc[train_idx])

# Dense feature matrix for every row, then sliced by the split positions.
X = tfidf.transform(df['cleaned_text']).toarray()
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(X_train.shape)
print(X_test.shape)

# Baseline: a single depth-1 decision tree (decision stump).
stump = DecisionTreeClassifier(max_depth=1, random_state=42)
stump.fit(X_train, y_train)

y_train_pred = stump.predict(X_train)
y_test_pred = stump.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
cm = confusion_matrix(y_test, y_test_pred)

print(train_acc)
print(test_acc)
print(cm)

class ManualAdaBoost:
    """Hand-written binary AdaBoost over depth-1 decision trees.

    Labels are expected to be 0/1; internally they are mapped to -1/+1 for
    the weight update and the final weighted vote.

    Parameters
    ----------
    T : int, default 15
        Number of boosting rounds (one stump per round).
    verbose : bool, default True
        When true, print the weighted error and alpha of every round.

    Attributes
    ----------
    alphas : list of float
        Vote weight of each stump, in training order.
    stumps : list
        The fitted depth-1 trees.
    errors : list of float
        Weighted training error of each stump (clipped away from 0 and 1).
    """

    def __init__(self, T=15, verbose=True):
        self.T = T
        self.verbose = verbose
        self.alphas = []
        self.stumps = []
        self.errors = []

    def fit(self, X, y):
        """Train ``T`` stumps on ``X`` / ``y`` and return ``self``.

        Any previously fitted stumps are discarded, so calling ``fit`` again
        retrains from scratch instead of appending to the old ensemble.
        """
        # Plain ndarray labels keep the sample weights an ndarray as well
        # (a pandas Series would otherwise propagate its index into ``w``).
        y = np.asarray(y)
        n_samples = X.shape[0]
        w = np.ones(n_samples) / n_samples
        y_signed = 2 * y - 1

        self.alphas = []
        self.stumps = []
        self.errors = []

        for _ in range(self.T):
            stump = DecisionTreeClassifier(max_depth=1, random_state=42)
            stump.fit(X, y, sample_weight=w)
            predictions = stump.predict(X)
            pred_signed = 2 * predictions - 1

            # Weighted error; clipped so the log below stays finite.
            error = np.sum(w[predictions != y])
            error = np.clip(error, 1e-10, 1 - 1e-10)

            alpha = 0.5 * np.log((1 - error) / error)

            # Up-weight mistakes, down-weight hits, then renormalise.
            w = w * np.exp(-alpha * y_signed * pred_signed)
            w = w / np.sum(w)

            if self.verbose:
                print(error)
                print(alpha)

            self.alphas.append(alpha)
            self.stumps.append(stump)
            self.errors.append(error)

        return self

    def predict(self, X):
        """Return 0/1 predictions from the alpha-weighted vote of all stumps.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if not self.stumps:
            raise ValueError("ManualAdaBoost must be fitted before calling predict")
        stump_preds = np.array([2 * stump.predict(X) - 1 for stump in self.stumps])
        weighted_sum = np.dot(self.alphas, stump_preds)
        # A tie (sum == 0) falls to class 0.
        return (weighted_sum > 0).astype(int)

# ---- Hand-written AdaBoost, 15 rounds ----
manual_ada = ManualAdaBoost(T=15)
manual_ada.fit(X_train, y_train)

y_train_pred_manual, y_test_pred_manual = (
    manual_ada.predict(features) for features in (X_train, X_test)
)

train_acc_manual = accuracy_score(y_train, y_train_pred_manual)
test_acc_manual = accuracy_score(y_test, y_test_pred_manual)
cm_manual = confusion_matrix(y_test, y_test_pred_manual)

print(train_acc_manual, test_acc_manual, cm_manual, sep='\n')

# ---- scikit-learn AdaBoost over the same kind of weak learner ----
weak_learner = DecisionTreeClassifier(max_depth=1)
sklearn_ada = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=0.6,
    random_state=42,
)
sklearn_ada.fit(X_train, y_train)

y_train_pred_sklearn, y_test_pred_sklearn = (
    sklearn_ada.predict(features) for features in (X_train, X_test)
)

train_acc_sklearn = accuracy_score(y_train, y_train_pred_sklearn)
test_acc_sklearn = accuracy_score(y_test, y_test_pred_sklearn)
cm_sklearn = confusion_matrix(y_test, y_test_pred_sklearn)

print(train_acc_sklearn, test_acc_sklearn, cm_sklearn, sep='\n')

# Side-by-side [train, test] accuracy: single stump, manual boost, sklearn boost.
print(
    [train_acc, test_acc],
    [train_acc_manual, test_acc_manual],
    [train_acc_sklearn, test_acc_sklearn],
    sep='\n',
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
(5572, 2)
label
0    4825
1     747
Name: count, dtype: int64
(4457, 3000)
(1115, 3000)
0.8837783262284048
0.8923766816143498
[[930  36]
 [ 84  65]]
0.11622167377159526
1.0143534603557114
0.26087922870101093
0.5207018789750049
0.3512557431516268
0.3067620053374075
0.36581609850424546
0.275104116902884
0.4429219882660905
0.11465581563866874
0.41177269499634184
0.17832098020348391
0.4286849954067594
0.14360917827958558
0.423971825276512
0.15324478537279745
0.43552512780910013
0.129671687911075
0.4381784558754228
0.12427900070986571
0.44595125533486046
0.10852150828043412
0.44044492057667584
0.119678283888183
0.44752638756957897
0.1053

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt


df = pd.read_csv('heart.csv')
print(df.head())
print(df.shape)
print(df.isnull().sum().sum())

# Drop incomplete rows, separate the features and binarise the target
# (any positive value counts as disease).
df = df.dropna()
X = df.drop('target', axis=1)
y = (df['target'].astype(int) > 0).astype(int)

# Split row positions first so the scaler only sees training rows. The split
# settings (test_size, random_state, stratify) are unchanged.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only; fitting on all rows would leak
# test-set means and variances into the features.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Baseline: a single decision stump.
stump = DecisionTreeClassifier(max_depth=1, random_state=42)
stump.fit(X_train, y_train)

y_train_pred = stump.predict(X_train)
y_test_pred = stump.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
cm = confusion_matrix(y_test, y_test_pred)

print(train_acc)
print(test_acc)
print(cm)
print(classification_report(y_test, y_test_pred, target_names=['No Disease','Disease']))


# Grid over ensemble size and learning rate.
n_estimators_list = [5, 10, 25, 50, 100]
learning_rates = [0.1, 0.5, 1.0]

# results[lr] holds the test accuracies in n_estimators_list order.
results = {}
best_acc = -1.0
best_cfg = None

# NOTE(review): the best configuration is chosen by test-set accuracy, so
# best_acc is an optimistic estimate. Consider selecting on a validation
# split or with cross-validation on the training data.
for lr in learning_rates:
    accs = []
    for n_est in n_estimators_list:
        ada = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=1),
            n_estimators=n_est,
            learning_rate=lr,
            random_state=42,
            algorithm='SAMME'
        )
        ada.fit(X_train, y_train)
        acc = accuracy_score(y_test, ada.predict(X_test))
        accs.append(acc)
        # Strict '>' keeps the first configuration reached on ties.
        if acc > best_acc:
            best_acc = acc
            best_cfg = {'learning_rate': lr, 'n_estimators': n_est, 'model': ada}
    results[lr] = accs

for lr in learning_rates:
    print(results[lr])

print(best_cfg['learning_rate'])
print(best_cfg['n_estimators'])
print(best_acc)

best_model = best_cfg['model']

class AdaBoostTracker:
    """Re-run AdaBoost with decision stumps while recording its internals.

    Labels are expected to be 0/1; they are mapped to -1/+1 for the weight
    update. No fitted stumps are kept, only the per-round diagnostics.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds.
    learning_rate : float
        Multiplier applied to each round's alpha.

    Attributes
    ----------
    estimator_errors : list of float
        Weighted error of each round (clipped away from 0 and 1).
    estimator_alphas : list of float
        Learning-rate-scaled alpha of each round.
    sample_weights_history : list of numpy.ndarray
        Copy of the sample weights used to train each round's stump.
    final_weights : numpy.ndarray or None
        Normalised sample weights after the last round; ``None`` before
        ``fit`` has been called. Positionally aligned with the rows of ``X``.
    """

    def __init__(self, n_estimators, learning_rate):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimator_errors = []
        self.estimator_alphas = []
        self.sample_weights_history = []
        self.final_weights = None

    def fit(self, X, y):
        """Run the boosting rounds on ``X`` / ``y`` and return ``self``.

        Histories from any earlier ``fit`` call are discarded.
        """
        # Coerce labels to a plain ndarray. With a pandas Series the weight
        # update below would yield a Series carrying the original row labels,
        # and later integer indexing of final_weights would be label-based
        # rather than positional.
        y = np.asarray(y)
        n = X.shape[0]
        w = np.ones(n) / n
        y_signed = 2*y - 1

        self.estimator_errors = []
        self.estimator_alphas = []
        self.sample_weights_history = []

        for i in range(self.n_estimators):
            stump = DecisionTreeClassifier(max_depth=1, random_state=42+i)
            stump.fit(X, y, sample_weight=w)
            pred = stump.predict(X)
            pred_signed = 2*pred - 1

            # Weighted error; clipped so the log below stays finite.
            err = np.sum(w[pred != y])
            err = np.clip(err, 1e-10, 1-1e-10)

            alpha = self.learning_rate * 0.5 * np.log((1-err)/err)

            self.estimator_errors.append(err)
            self.estimator_alphas.append(alpha)
            # Snapshot of the weights this round's stump was trained with.
            self.sample_weights_history.append(w.copy())

            # Up-weight mistakes, down-weight hits, then renormalise.
            w = w * np.exp(-alpha * y_signed * pred_signed)
            w = w / np.sum(w)

        self.final_weights = w
        return self

# Replay boosting with the best configuration to inspect errors and weights.
tracker = AdaBoostTracker(best_cfg['n_estimators'], best_cfg['learning_rate'])
tracker.fit(X_train, y_train)

# Work on a plain ndarray: if y_train is a pandas Series the tracker's weights
# may come back as a Series with the original row labels, in which case
# final_weights[top_idx] below would be a label lookup, not a positional one.
final_weights = np.asarray(tracker.final_weights)

print(tracker.estimator_errors)
print(final_weights[:10])

# Weighted error of the weak learner at every boosting round.
plt.figure(figsize=(10,4))
plt.plot(range(1, len(tracker.estimator_errors)+1), tracker.estimator_errors, marker='o')
plt.xlabel('iteration'); plt.ylabel('error')
plt.title('Weighted error per boosting round')
plt.tight_layout(); plt.show()

# Distribution of the final sample weights.
plt.figure(figsize=(6,4))
plt.hist(final_weights, bins=40)
plt.xlabel('final sample weight'); plt.ylabel('count')
plt.title('Final sample weight distribution')
plt.tight_layout(); plt.show()

# Positions (within the training split) of the heaviest samples, heaviest first.
top_k = 10
top_idx = np.argsort(final_weights)[-top_k:][::-1]
print(top_idx)
print(final_weights[top_idx])


# Feature importances of the best sklearn model, most important first.
fi = best_model.feature_importances_
names = X.columns if isinstance(X, pd.DataFrame) else df.drop('target', axis=1).columns
order = np.argsort(fi)[::-1]
top_k = min(10, len(fi))
top_fi = fi[order][:top_k]
top_names = names[order][:top_k]
print(top_fi)
print(top_names)

plt.figure(figsize=(8,5))
# Plot in descending order and invert the axis once, so the most important
# feature is at the top. Reversing the data as well as inverting the axis
# cancelled out and put it at the bottom.
plt.barh(range(top_k), top_fi)
plt.yticks(range(top_k), top_names)
plt.gca().invert_yaxis()
plt.xlabel('importance')
plt.title('Top feature importances')
plt.tight_layout(); plt.show()
