**SVM & Naive Bayes**

In [None]:
# ✅ Q1: What is a Support Vector Machine (SVM), and how does it work?
# Answer:
# SVM is a supervised learning algorithm that finds the optimal hyperplane separating classes
# with the maximum margin. Only the training points on the edges of the margin (support vectors)
# influence the decision boundary. For non-linearly separable data, SVM uses kernels to project
# data into higher-dimensional feature spaces where a linear separator can exist.

# ✅ Q2: Difference between Hard Margin and Soft Margin SVM
# Answer:
# - Hard Margin: Assumes data are perfectly linearly separable; no misclassification allowed.
#   Maximizes margin under strict constraints, so it is very sensitive to outliers (prone to
#   overfitting) and has no solution at all when the classes overlap.
# - Soft Margin: Allows some violations controlled by C (regularization). Balances margin size
#   and classification errors; robust to outliers and noise.

# ✅ Q3: Kernel Trick in SVM + Example
# Answer:
# The kernel trick computes inner products in a high-dimensional (possibly infinite) feature space
# without explicitly transforming the data. Example: RBF (Gaussian) kernel k(x, x') = exp(-γ||x - x'||²).
# Use case: When classes are not linearly separable in the original space; RBF flexibly models complex, curved boundaries.

# ✅ Q4: What is a Naïve Bayes Classifier, and why “naïve”?
# Answer:
# Naïve Bayes applies Bayes’ Theorem with the assumption that features are conditionally independent
# given the class label — that simplifying assumption is the “naïve” part. It is fast and works well with
# high-dimensional, sparse text data. Note that its predicted probabilities are usually poorly calibrated
# (pushed toward 0 or 1), even when the predicted class labels are accurate.

# ✅ Q5: Gaussian, Multinomial, and Bernoulli Naïve Bayes — when to use which?
# Answer:
# - Gaussian NB: Continuous, approximately normal features (e.g., sensor readings).
# - Multinomial NB: Count features (e.g., word counts in documents via CountVectorizer; fractional
#   TF-IDF weights are not true counts but also work well in practice).
# - Bernoulli NB: Binary features (presence/absence of words, or thresholded features).

# =========================
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer

# -------------------------
# Q6: Linear-kernel SVM on Iris -- report test accuracy and support vectors
# -------------------------
print("\n" + "=" * 15 + " Q6: SVM (Linear) on Iris " + "=" * 15)

# Load Iris and hold out a stratified 30% test split (fixed seed for repeatability).
iris = datasets.load_iris()
X_i = iris.data
y_i = iris.target
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_i,
    y_i,
    test_size=0.3,
    random_state=42,
    stratify=y_i,
)

# Variant 1: linear SVC trained directly on the raw (unscaled) features.
svm_linear_noscale = SVC(kernel='linear', random_state=42).fit(X_train_i, y_train_i)
y_pred_i_ns = svm_linear_noscale.predict(X_test_i)
acc_i_ns = accuracy_score(y_test_i, y_pred_i_ns)

# Variant 2: same classifier, but features are standardized inside a pipeline so
# the scaler is fitted on the training split only.
pipe_linear_scaled = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel='linear', random_state=42)),
    ]
).fit(X_train_i, y_train_i)
y_pred_i_sc = pipe_linear_scaled.predict(X_test_i)
acc_i_sc = accuracy_score(y_test_i, y_pred_i_sc)

print(f"Accuracy (no scaling): {acc_i_ns:.4f}")
print(f"Accuracy (with scaling): {acc_i_sc:.4f}")

# Support-vector details are reported for the unscaled model, which is a bare SVC.
# (The scaled variant keeps its own SVC inside the pipeline under the "svm" step.)
n_support = svm_linear_noscale.n_support_
svectors_shape = svm_linear_noscale.support_vectors_.shape
print(f"Support vectors per class (no scaling model): {n_support}")
print(f"Support vectors array shape (no scaling model): {svectors_shape}")

# -------------------------
# Q7: Gaussian Naive Bayes on Breast Cancer -- classification report
# -------------------------
print("\n" + "=" * 15 + " Q7: GaussianNB on Breast Cancer " + "=" * 15)

# Load the dataset and hold out a stratified 30% test split.
bc = datasets.load_breast_cancer()
X_b = bc.data
y_b = bc.target
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b,
    y_b,
    test_size=0.3,
    random_state=42,
    stratify=y_b,
)

# Fit GaussianNB on the training split and predict the held-out labels.
gnb = GaussianNB().fit(X_train_b, y_train_b)
y_pred_b = gnb.predict(X_test_b)

# Per-class precision / recall / F1, labelled with the dataset's class names.
report_b = classification_report(y_test_b, y_pred_b, target_names=bc.target_names)
print(report_b)

# -------------------------
# Q8: RBF-kernel SVM on Wine, tuning C and gamma with GridSearchCV
# -------------------------
print("\n" + "=" * 15 + " Q8: GridSearch SVM (RBF) on Wine " + "=" * 15)

# Load the dataset and hold out a stratified 30% test split.
wine = datasets.load_wine()
X_w = wine.data
y_w = wine.target
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_w,
    y_w,
    test_size=0.3,
    random_state=42,
    stratify=y_w,
)

# Scaling lives inside the pipeline so each CV fold fits its own scaler on that
# fold's training portion only.
pipe_rbf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel='rbf', probability=False, random_state=42)),
    ]
)

# Search space; the "svm__" prefix routes each parameter to the pipeline's SVC step.
param_grid = {
    "svm__C": [0.1, 1, 10, 100],
    "svm__gamma": ["scale", 0.1, 0.01, 0.001],
}

# 5-fold cross-validated search over the grid, using all available cores.
grid = GridSearchCV(estimator=pipe_rbf, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_w, y_train_w)

# Score the best pipeline found by the search on the untouched test split.
best_model = grid.best_estimator_
y_pred_w = best_model.predict(X_test_w)
acc_w = accuracy_score(y_test_w, y_pred_w)

print(f"Best Params: {grid.best_params_}")
print(f"CV Best Score: {grid.best_score_:.4f}")
print(f"Test Accuracy with Best Params: {acc_w:.4f}")

# -------------------------
# Q9: Naive Bayes on text (20 Newsgroups subset) + ROC-AUC
# -------------------------
print("\n" + "="*15 + " Q9: Text NB + ROC-AUC " + "="*15)
# Two newsgroup topics are used so the task is binary and ROC-AUC is well defined.
# If the download fails (e.g. no internet), fall back to a tiny synthetic
# spam-vs-ham corpus so this section still runs end to end.
#
# Only the data acquisition is guarded by try/except. Errors raised while fitting
# or scoring the model are real bugs and must surface, not silently trigger the
# synthetic fallback.
try:
    cats = ['sci.space', 'talk.politics.mideast']  # two classes -> binary ROC-AUC
    train = datasets.fetch_20newsgroups(subset='train', categories=cats, remove=('headers','footers','quotes'))
    test  = datasets.fetch_20newsgroups(subset='test',  categories=cats, remove=('headers','footers','quotes'))
except Exception as e:
    # Deliberately broad: this is a best-effort download, and any failure while
    # obtaining the data should lead to the fallback. The reason is always reported.
    print("Fetch failed, using synthetic tiny text dataset. Reason:", str(e))
    X_text = [
        "earn money fast limited offer buy now",
        "investment opportunity returns guaranteed",
        "meeting schedule for next week and agenda",
        "project update attached please review",
        "cheap meds available without prescription",
        "family dinner plan this weekend at home"
    ]
    y_text = np.array([1,1,0,0,1,0])  # 1=spam-like, 0=ham
    # Stratify so both classes appear in the test half (ROC-AUC needs both).
    X_tr_text, X_te_text, y_tr, y_te = train_test_split(X_text, y_text, test_size=0.5, random_state=42, stratify=y_text)
    # Default vectorizer: min_df / n-gram pruning would leave almost no vocabulary
    # on a corpus this small.
    tfidf = TfidfVectorizer()
    dataset_label = "synthetic text"
else:
    X_tr_text, y_tr = train.data, train.target
    X_te_text, y_te = test.data,  test.target
    # Unigrams + bigrams; drop terms seen in fewer than 3 documents.
    tfidf = TfidfVectorizer(min_df=3, ngram_range=(1,2))
    dataset_label = "20 Newsgroups subset"

# Shared modelling path for both data sources.
text_pipe = Pipeline([
    ("tfidf", tfidf),
    ("nb", MultinomialNB())
])
text_pipe.fit(X_tr_text, y_tr)
# ROC-AUC needs a continuous score: use the predicted probability of class 1.
proba = text_pipe.predict_proba(X_te_text)[:, 1]
auc = roc_auc_score(y_te, proba)
print(f"ROC-AUC (MultinomialNB on {dataset_label}): {auc:.4f}")

# =========================
# Q10: Spam vs Not-Spam — Full Approach (Theory + Practical Guidance)
# =========================
# Scenario:
# - Text emails with diverse vocabulary
# - Class imbalance (ham >> spam)
# - Missing/incomplete data

# Approach:
# 1) Preprocessing
#    - Handle missing: replace missing subject/body with empty string; drop rows with no usable text if needed.
#    - Text cleaning: lowercasing, optional punctuation/URL removal (preserve numbers if useful).
#    - Vectorization: TfidfVectorizer with n-grams (1–2), min_df to remove rare noise, optionally sublinear_tf=True.
#    - Optional feature selection: chi-square to keep top-k features for speed.
#
# 2) Model choice & justification (SVM vs Naive Bayes)
#    - MultinomialNB:
#        * Pros: Very fast, robust with high-dimensional sparse text, good baseline, outputs probabilities
#                natively (though they are often poorly calibrated).
#        * Cons: Assumes feature independence; decision boundary is simpler.
#    - Linear SVM (e.g., LinearSVC or SVC(kernel='linear')):
#        * Pros: Strong performance on text, handles high-dimensional sparse data well.
#        * Cons: Needs probability calibration (CalibratedClassifierCV) if you need probabilities.
#    - Recommendation: Start with MultinomialNB as baseline, then train Linear SVM; pick the better one via CV.
#
# 3) Address class imbalance
#    - Use class_weight='balanced' for SVM; for NB, adjust decision threshold post-hoc.
#    - Resampling strategies: RandomUnderSampler/SMOTE (on TF-IDF can be tricky; usually adjust threshold + class weights first).
#    - Use stratified folds in cross-validation.
#
# 4) Evaluation metrics
#    - Accuracy can be misleading with imbalance; focus on Precision, Recall, F1 (particularly Recall for spam),
#      PR-AUC (Precision-Recall AUC), ROC-AUC.
#    - Tune the decision threshold to meet business objectives (e.g., minimize false negatives or false positives depending on cost).
#
# 5) Business impact
#    - Reduced exposure to spam & phishing, improved employee productivity.
#    - Cost control by reducing manual triage; explainable thresholds for compliance.
#    - Continuous monitoring: track drift (new spam tactics) and retrain periodically.

# (Optional) Mini reference pipeline sketch (no execution, just for clarity):
# from sklearn.calibration import CalibratedClassifierCV
# from sklearn.svm import LinearSVC
# pipe = Pipeline([
#     ("tfidf", TfidfVectorizer(min_df=3, ngram_range=(1,2), sublinear_tf=True)),
#     ("clf", LinearSVC(class_weight="balanced", random_state=42))
# ])
# clf = CalibratedClassifierCV(pipe, method="sigmoid", cv=5)  # to get probabilities if needed
# clf.fit(X_train_text, y_train)
# y_proba = clf.predict_proba(X_test_text)[:,1]
# Evaluate: precision, recall, f1, roc_auc, pr_auc; choose threshold by business cost curve.


