<a href="https://colab.research.google.com/github/ernykth/SF2935-projekt/blob/main/RBF_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RBF SVM



## Binary class case

### 1. Import

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.inspection import permutation_importance

from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, log_loss


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sklearn.preprocessing import LabelEncoder
from matplotlib.colors import ListedColormap, BoundaryNorm

from sklearn.metrics import average_precision_score
from functools import partial



# Load the data directly (when running normally, with the workbook in the working directory)
#df = pd.read_excel("Project SF2935.xlsx", sheet_name="Wine data")


# 1) Clone the repository locally in Colab
# NOTE(review): the paths below are relative and `%cd` moves the kernel into the clone,
# so re-running this cell removes/clones relative to the previous clone and nests a new
# copy inside it. Restart the runtime (or cd back) before re-running.
!rm -rf SF2935-projekt
!git clone https://github.com/ernykth/SF2935-projekt.git
%cd SF2935-projekt

# 2) Find the Excel file(s) - print the paths that were found
import glob, os
xlsx_paths = glob.glob("**/*.xlsx", recursive=True)
print("Hittade Excel-filer:", xlsx_paths)

# 3) Pick the right file: prefer one whose file name contains "Project",
#    otherwise fall back to the first workbook found (None if there is none)
target = None
candidates = [p for p in xlsx_paths if "Project" in os.path.basename(p)]
target = candidates[0] if candidates else (xlsx_paths[0] if xlsx_paths else None)
print("Vald fil:", target)

# Fail early with a clear message instead of letting pandas fail on a None path
if target is None:
    raise FileNotFoundError("Hittade ingen .xlsx i repot. Kontrollera filnamn/placering i GitHub.")

# 4) Inspect which sheets exist and read the data
import pandas as pd
# NOTE(review): load_workbook is not referenced anywhere in this cell
from openpyxl import load_workbook

xls = pd.ExcelFile(target, engine="openpyxl")
print("Tillgängliga ark:", xls.sheet_names)

# Try to read the sheet "Wine data"; fall back to the first sheet if the name does not match
sheet = "Wine data" if "Wine data" in xls.sheet_names else 0
df = pd.read_excel(target, sheet_name=sheet, engine="openpyxl")
# Last expression of the cell: show the first rows as a sanity check
df.head()







Cloning into 'SF2935-projekt'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 22 (delta 5), reused 5 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 8.45 MiB | 14.64 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/SF2935-projekt
Hittade Excel-filer: ['Project SF2935.xlsx']
Vald fil: Project SF2935.xlsx
Tillgängliga ark: ['Wine data', 'Correlation report']


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


### 2. Features & target

In [None]:
# Feature matrix: every column except the two label columns
# ("type" is the target here; "quality" is itself a label, so it is excluded too).
X = df.drop(["type", "quality"], axis=1)

# Binary target: wine type encoded as red=0, white=1.
y = df["type"].map(dict(red=0, white=1))



### 3. Train/test split and scaling

In [None]:
# Stratified 80/20 hold-out split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Learn the standardisation statistics from the training part only, then apply
# the same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))




### 4. Linear SVM

In [None]:
# Linear-kernel SVC on the standardised features; fit() returns the estimator itself.
linear_svm = SVC(kernel="linear", C=1, class_weight="balanced", random_state=42).fit(
    X_train_scaled, y_train
)
y_pred_lin = linear_svm.predict(X_test_scaled)

# Confusion matrix followed by the per-class precision/recall/F1 report.
print("=== Linear kernel performance ===")
print(
    confusion_matrix(y_test, y_pred_lin),
    classification_report(y_test, y_pred_lin),
    sep="\n",
)



=== Linear kernel performance ===
[[319   1]
 [  4 976]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       320
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1300
   macro avg       0.99      1.00      0.99      1300
weighted avg       1.00      1.00      1.00      1300



### 5. Binary classification comparison pipeline
#### SVM: RBF Kernel classifier, nested cross-validation, y-stratified data:

Compares four ways of handling the class imbalance in the target (white wines greatly outnumber red wines), using the following strategies:
1) Plain (data as-is)
2) Balanced: reweights classes inversely proportional to their prevalence
3) Undersampled: randomly drops samples from the majority class until it matches the size of the minority class
4) SMOTE: oversamples the minority class by creating synthetic points, each placed at a random position on the line segment between a real minority sample and one of its k nearest minority-class neighbours in feature space.

The analysis pipeline uses nested cross-validation. In a conventional setup the dataset is split once into a training and a test set, the hyperparameters are tuned with CV on the training set, and the selected model is evaluated on the single held-out test set. Here, the train/test split is itself repeated k times in an outer k-fold CV loop. In each outer fold, an inner CV loop on the training part tunes the hyperparameters, and the tuned model is scored on that fold's held-out part. Finally, the performance metrics are averaged over the outer folds and reported.

This minimizes data leakage and bias from the hyperparameter tuning, which is essential when the purpose is to benchmark the model against other methods (in our case, against neural networks).


In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Local import keeps this cell runnable on its own (it already re-imports everything else it uses).
from sklearn.preprocessing import StandardScaler

# --- 1. Read df and encode labels ---
# Features: every column except the two label columns.
X = df.drop(['type','quality'], axis=1)
y = df['type']
# Red is encoded as 1, so every pos_label=1 metric below refers to red wines.
y_num = y.map({"red": 1, "white": 0})

# --- 2. Define scorers ---
# NOTE(review): this dict is not referenced by the grid search in this cell, which is
# given scoring="f1_macro" directly; it is kept for interactive use.
scorers = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, pos_label=1),
    "recall": make_scorer(recall_score, pos_label=1),
    "f1_red": make_scorer(f1_score, pos_label=1),
    "macro_f1": make_scorer(f1_score, average="macro"),
    "roc_auc": make_scorer(roc_auc_score, response_method="predict_proba")
}

# --- 3. Define parameter grid ---
# Keys are prefixed with the pipeline step name "svc".
param_grid = {
    "svc__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "svc__gamma": ["scale", 0.1, 0.01],
    "svc__kernel": ["rbf"]
}

# --- 4. Define models ---
# Every pipeline starts with a StandardScaler: the RBF kernel is distance based, so
# unscaled features with large numeric ranges would dominate the kernel. Because the
# scaler is a pipeline step, it is re-fitted on the training part of every CV split,
# so no held-out statistics leak into the scaling. Resampling (where used) happens
# after scaling, i.e. SMOTE's neighbour search runs in the standardised space.
pipelines = {
    "Plain": Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, random_state=42))
    ]),
    "Balanced": Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, class_weight="balanced", random_state=42))
    ]),
    "Undersampled": Pipeline([
        ("scaler", StandardScaler()),
        ("under", RandomUnderSampler(random_state=42)),
        ("svc", SVC(probability=True, random_state=42))
    ]),
    "SMOTE": Pipeline([
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("svc", SVC(probability=True, random_state=42))
    ])
}

# --- 5. Outer & Inner CV ---
# Outer folds estimate generalisation performance; inner folds tune hyperparameters.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

# One summary row per balancing strategy is appended here by the nested-CV loop.
results = []

def get_mode(lst):
    """Return the most frequent element of ``lst``, or None when ``lst`` is empty.

    When several elements share the highest count, the one that appears first
    in ``lst`` is returned.
    """
    if not lst:
        return None
    tally = Counter(lst)
    # max() keeps the first maximal item, and Counter iterates in first-seen order.
    winner, _count = max(tally.items(), key=lambda pair: pair[1])
    return winner

# Nested cross-validation: for every balancing strategy, tune C/gamma in the inner
# loop and score the refitted winner on the held-out part of each outer fold.
for name, pipe in pipelines.items():
    print(f"Running nested CV for: {name}...")
    best_params_list = []  # one human-readable "C / gamma" string per outer fold
    c_list = []            # winning C per outer fold
    gamma_list = []        # winning gamma setting per outer fold
    outer_scores = []      # dict of test metrics per outer fold

    for train_idx, test_idx in outer_cv.split(X, y_num):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_num.iloc[train_idx], y_num.iloc[test_idx]

        # Inner loop sees the outer-training part only; a fresh clone per fold makes
        # sure no fitted state is carried over between folds.
        grid = GridSearchCV(
            estimator=clone(pipe),
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_params = grid.best_params_
        c_val = best_params["svc__C"]
        gamma_val = best_params["svc__gamma"]
        c_list.append(c_val)
        gamma_list.append(gamma_val)

        # Format gamma for display
        if gamma_val == "scale":
            # gamma="scale" resolves to 1 / (n_features * X.var()), where X is the array
            # the SVC was actually fitted on (after any earlier pipeline steps) and
            # X.var() is the variance over all its elements. Averaging pandas'
            # per-column sample variances of the raw fold gives a different number, so
            # report the value stored on the refitted SVC instead (private attribute
            # `_gamma`, set by SVC.fit; NaN if a future sklearn version drops it).
            svc = grid.best_estimator_.named_steps["svc"]
            scale_value = getattr(svc, "_gamma", np.nan)
            gamma_str = f"gamma: scale = {scale_value:.5g}"
        else:
            gamma_str = f"gamma: {gamma_val}"

        params_str = f"C: {c_val}, {gamma_str}"
        best_params_list.append(params_str)

        # Score the refitted best model on the untouched outer test part.
        y_pred = grid.predict(X_test)
        outer_scores.append({
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, pos_label=1),
            "recall": recall_score(y_test, y_pred, pos_label=1),
            "f1_red": f1_score(y_test, y_pred, pos_label=1),
            "macro_f1": f1_score(y_test, y_pred, average="macro"),
            # Column 1 of predict_proba is the probability of class 1 (red).
            "roc_auc": roc_auc_score(y_test, grid.predict_proba(X_test)[:,1])
        })

    # Aggregate metrics: plain mean over the outer folds
    avg_scores = {metric: np.mean([fold[metric] for fold in outer_scores]) for metric in outer_scores[0]}
    params_str_all = "; ".join(best_params_list)
    most_common_c = get_mode(c_list)
    most_common_gamma = get_mode(gamma_list)

    results.append({
        "Model": name,
        "Accuracy": avg_scores["accuracy"],
        "Precision (red)": avg_scores["precision"],
        "Recall (red)": avg_scores["recall"],
        "F1 (red)": avg_scores["f1_red"],
        "Macro F1": avg_scores["macro_f1"],
        "ROC-AUC": avg_scores["roc_auc"],
        "Most Common C": most_common_c,
        "Most Common gamma": most_common_gamma,
        "Best Params (per fold)": params_str_all
    })

# One row per balancing strategy, indexed by its name.
df_results = pd.DataFrame(results).set_index("Model")
pd.set_option("display.precision", 3)
print(df_results)

# --- 6. Plot results ---
# Grouped bars: one group per balancing strategy, one bar per metric.
fig, ax = plt.subplots(figsize=(8, 5))
df_results[["Macro F1", "ROC-AUC"]].plot(kind="bar", ax=ax)
ax.set_title("Nested CV Performance Comparison")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)  # both metrics live in [0, 1]
plt.show()

# Export results to LaTeX table
# with open("df_results_binart.tex", "w") as f:
#   f.write(df_results.to_latex(float_format="%.3f"))

## Multiclass case

### 1. Multiclass classification comparison pipeline
#### SVM: RBF Kernel classifier (One-versus-Rest), nested cross-validation, y-stratified data:

This pipeline compares four strategies for handling class imbalance in multiclass wine quality prediction:
1) **Plain:** Uses the data as-is, without any balancing.
2) **Balanced:** Reweights classes inversely proportional to their prevalence.
3) **Undersampled:** Randomly undersamples the dominant classes to match the number of underrepresented samples.
4) **SMOTE:** Oversamples minority classes by generating synthetic samples along the line segments joining minority class instances and their nearest neighbors.

Three different strategies are available for defining the multiclass target variable:
- **Strategy 1:** Uses the original wine quality labels (integers 3–9) as class labels.
- **Strategy 2:** Manually buckets the quality scores into three classes: 'bad' (3–4), 'medium' (5–7), and 'good' (8–9).
- **Strategy 3:** Splits the quality scores into three buckets ('bad', 'medium', 'good') at the 1/3 and 2/3 quantiles. Because quality takes only a few integer values, the buckets are only approximately equal in size.

The analysis uses nested cross-validation: the dataset is split *k* times in an outer k-fold loop, and for each outer fold, an inner cross-validation loop is used to tune hyperparameters. The selected hyperparameters are evaluated on the held-out outer test fold, and the average performance metrics across all outer folds are reported. This approach minimizes data leakage and bias from hyperparameter tuning, ensuring a fair comparison between balancing strategies.

First run the code block immediately below; choose the bucketing strategy by setting the variable `choose_strategy` to 1, 2 or 3.

In [None]:
# Drop rows with quality 1 or 2 (original note: "Trims empty classes"). The explicit
# .copy() makes df_trimmed an independent frame, so adding the bucket columns below is
# an unambiguous write to df_trimmed and does not raise pandas' SettingWithCopyWarning.
df_trimmed = df[(df['quality'] != 1) & (df['quality'] != 2)].copy()

# 1 = raw quality scores, 2 = manual three-way buckets, 3 = quantile-based buckets
choose_strategy = 3

if choose_strategy == 1:  # Use current quality labels as class labels
    X = df_trimmed.drop(['type','quality'], axis=1)
    y = df_trimmed['quality'].astype(int)
    print(df_trimmed['quality'].value_counts())   # shows numbers of datapoints that goes into each class

elif choose_strategy == 2:   # Manual bucketing into 3 classes
    quality_map = {3: 'bad', 4: 'bad', 5: 'medium', 6: 'medium', 7: 'medium', 8: 'good', 9: 'good'}
    df_trimmed['quality_bucket_manual'] = df_trimmed['quality'].map(quality_map)
    X = df_trimmed.drop(['type', 'quality', 'quality_bucket_manual'], axis=1)
    # String labels become categories in sorted order, so the integer codes are
    # bad=0, good=1, medium=2 (not ordered by quality).
    y = df_trimmed['quality_bucket_manual'].astype('category').cat.codes
    print(df_trimmed['quality_bucket_manual'].value_counts())   # shows numbers of datapoints that goes into each class

elif choose_strategy == 3:   # Stratified bucketing into 3 classes
    labels = ['bad', 'medium', 'good']
    # qcut cuts at the 1/3 and 2/3 quantiles; the categories keep the order of `labels`,
    # so the integer codes are bad=0, medium=1, good=2.
    df_trimmed['quality_bucket_stratified'] = pd.qcut(df_trimmed['quality'], q=3, labels=labels)
    X = df_trimmed.drop(['type', 'quality', 'quality_bucket_stratified'], axis=1)
    y = df_trimmed['quality_bucket_stratified'].astype('category').cat.codes
    print(df_trimmed['quality_bucket_stratified'].value_counts())  # shows numbers of datapoints that goes into each class
    # Print which quality scores go into each bucket
    for label in labels:
        scores = df_trimmed.loc[df_trimmed['quality_bucket_stratified'] == label, 'quality'].unique()
        print(f"{label}: {np.sort(scores)}")

else:
    # Without this branch an invalid choice would silently leave X and y as whatever an
    # earlier cell defined, and the next cell would run on the wrong target.
    raise ValueError(f"choose_strategy must be 1, 2 or 3, got {choose_strategy!r}")



In [None]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


# Local import keeps this cell runnable on its own (it already re-imports everything else it uses).
from sklearn.preprocessing import StandardScaler

# --- 2. Define scorers ---
# NOTE(review): this dict is not referenced by the grid search in this cell, which is
# given scoring="f1_macro" directly; it is kept for interactive use.
# The ROC-AUC scorer uses response_method="predict_proba" (as the binary section does)
# instead of the deprecated needs_proba=True flag.
scorers = {
    "accuracy": make_scorer(accuracy_score),
    "macro_f1": make_scorer(f1_score, average="macro"),
    "weighted_f1": make_scorer(f1_score, average="weighted"),
    "roc_auc_ovr": make_scorer(roc_auc_score, response_method="predict_proba", multi_class="ovr")
}

# --- 3. Define parameter grid ---
# Keys are prefixed with the pipeline step name "svc".
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", 0.1, 0.01],
    "svc__kernel": ["rbf"]
}

# --- 4. Define models (pipelines) ---
# Every pipeline starts with a StandardScaler: the RBF kernel is distance based, so
# unscaled features with large numeric ranges would dominate the kernel. Because the
# scaler is a pipeline step, it is re-fitted on the training part of every CV split,
# so no held-out statistics leak into the scaling. Resampling (where used) happens
# after scaling, i.e. SMOTE's neighbour search runs in the standardised space.
pipelines = {
    "Plain": Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, random_state=42))
    ]),
    "Balanced": Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=True, class_weight="balanced", random_state=42))
    ]),
    "Undersampled": Pipeline([
        ("scaler", StandardScaler()),
        ("under", RandomUnderSampler(random_state=42)),
        ("svc", SVC(probability=True, random_state=42))
    ]),
    "SMOTE": Pipeline([
        ("scaler", StandardScaler()),
        # k_neighbors=1: SMOTE interpolates towards the single nearest same-class neighbour
        ("smote", SMOTE(random_state=42, k_neighbors=1)),
        ("svc", SVC(probability=True, random_state=42))
    ])
}

# --- 5. Outer & Inner CV ---
# Outer folds estimate generalisation performance; inner folds tune hyperparameters.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# One summary row per balancing strategy is appended here by the nested-CV loop.
results = []

def get_mode(lst):
    """Return the most frequent element of ``lst``, or None when ``lst`` is empty.

    When several elements share the highest count, the one that appears first
    in ``lst`` is returned.
    """
    if not lst:
        return None
    tally = Counter(lst)
    # max() keeps the first maximal item, and Counter iterates in first-seen order.
    winner, _count = max(tally.items(), key=lambda pair: pair[1])
    return winner

# Nested cross-validation: for every balancing strategy, tune C/gamma in the inner
# loop and score the refitted winner on the held-out part of each outer fold.
for name, pipe in pipelines.items():
    print(f"Running nested CV for: {name}...")
    best_params_list = []  # one human-readable "C / gamma" string per outer fold
    c_list = []            # winning C per outer fold
    gamma_list = []        # winning gamma setting per outer fold
    outer_scores = []      # dict of test metrics per outer fold

    # Stratify the outer folds on the multiclass target `y` that is being predicted
    # (not on the binary wine-type labels `y_num` left over from the binary section).
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Inner loop sees the outer-training part only; a fresh clone per fold makes
        # sure no fitted state is carried over between folds.
        grid = GridSearchCV(
            estimator=clone(pipe),
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_params = grid.best_params_
        c_val = best_params["svc__C"]
        gamma_val = best_params["svc__gamma"]
        c_list.append(c_val)
        gamma_list.append(gamma_val)

        # Format gamma for display
        if gamma_val == "scale":
            # gamma="scale" resolves to 1 / (n_features * X.var()), where X is the array
            # the SVC was actually fitted on (after any earlier pipeline steps) and
            # X.var() is the variance over all its elements. Averaging pandas'
            # per-column sample variances of the raw fold gives a different number, so
            # report the value stored on the refitted SVC instead (private attribute
            # `_gamma`, set by SVC.fit; NaN if a future sklearn version drops it).
            fitted_svc = grid.best_estimator_.named_steps["svc"]
            scale_value = getattr(fitted_svc, "_gamma", np.nan)
            gamma_str = f"gamma: scale = {scale_value:.5g}"
        else:
            gamma_str = f"gamma: {gamma_val}"

        params_str = f"C: {c_val}, {gamma_str}"
        best_params_list.append(params_str)

        # Score the refitted best model on the untouched outer test part.
        y_pred = grid.predict(X_test)
        y_proba = grid.predict_proba(X_test)
        # For multiclass, roc_auc_score needs y_test and y_proba for all classes
        try:
            roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
        except ValueError as err:
            # Keep the fold (best effort) but say why its ROC-AUC is missing, rather
            # than letting a silent NaN propagate into the reported average.
            print(f"  ROC-AUC (OvR) undefined for this fold: {err}")
            roc_auc = np.nan

        outer_scores.append({
            "accuracy": accuracy_score(y_test, y_pred),
            "macro_f1": f1_score(y_test, y_pred, average="macro"),
            "weighted_f1": f1_score(y_test, y_pred, average="weighted"),
            "roc_auc_ovr": roc_auc
        })

    # Aggregate metrics: plain mean over the outer folds (NaN if any fold's metric is NaN)
    avg_scores = {metric: np.mean([fold[metric] for fold in outer_scores]) for metric in outer_scores[0]}
    most_common_c = get_mode(c_list)
    most_common_gamma = get_mode(gamma_list)

    results.append({
        "Model": name,
        "Accuracy": avg_scores["accuracy"],
        "Macro F1": avg_scores["macro_f1"],
        "Weighted F1": avg_scores["weighted_f1"],
        "ROC-AUC (OvR)": avg_scores["roc_auc_ovr"],
        "Most Common C": most_common_c,
        "Most Common gamma": most_common_gamma
    })

# One row per balancing strategy, indexed by its name.
df_results = pd.DataFrame(results).set_index("Model")
pd.set_option("display.precision", 3)
print(df_results)

# --- 6. Plot ---
# Grouped bars: one group per balancing strategy, one bar per metric.
fig, ax = plt.subplots(figsize=(8, 5))
df_results[["Macro F1", "ROC-AUC (OvR)"]].plot(kind="bar", ax=ax)
ax.set_title("Multiclass SVC Performance (Nested CV)")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)  # both metrics live in [0, 1]
plt.show()

# Export results to LaTeX table
# with open(f"df_results_multiclass_{str(choose_strategy)}.tex", "w") as f:
#    f.write(df_results.to_latex(float_format="%.3f"))

In the next part, a linear SVM is prepared and run to create a baseline for the RBF SVM.

### 2. Create three classes

In [None]:
# (A) Fixed bins (interpretable)
# 3–4 = low, 5–6 = medium, 7–9 = high
def label_quality_fixed(q):
    """Map a quality score to a fixed three-level label.

    Scores up to 4 are "low", scores above 4 and up to 6 are "medium",
    and everything else is "high".
    """
    return "low" if q <= 4 else ("medium" if q <= 6 else "high")

# Fixed-bin target: label every quality score with label_quality_fixed.
df["quality_3_fixed"] = df["quality"].map(label_quality_fixed)

# (B) Quantile bins (often more balanced)
# The two inner quantiles (1/3 and 2/3) are the cut points; the outer bin edges are
# left open so every score falls into some bin.
labels_q = ["low_q", "mid_q", "high_q"]
q_edges = df["quality"].quantile([0, 1/3, 2/3, 1]).to_numpy()
lower_cut, upper_cut = q_edges[1], q_edges[2]
df["quality_3_quant"] = pd.cut(
    df["quality"],
    bins=[-np.inf, lower_cut, upper_cut, np.inf],
    labels=labels_q,
    include_lowest=True
)

# Class sizes of both candidate targets.
for heading, column in (
    ("Counts (fixed):", "quality_3_fixed"),
    ("\nCounts (quantile):", "quality_3_quant"),
):
    print(heading)
    print(df[column].value_counts().sort_index())

# --- choose target column here ---
target_col = "quality_3_fixed"   # or "quality_3_quant"

Counts (fixed):
quality_3_fixed
high      1277
low        246
medium    4974
Name: count, dtype: int64

Counts (quantile):
quality_3_quant
low_q     2384
mid_q     2836
high_q    1277
Name: count, dtype: int64


### 3. Prepare train/test split

In [None]:
# === Prepare X, y, split (stratified) ===
from sklearn.model_selection import train_test_split

# Toggle: add the wine type as one-hot columns next to the numeric features?
INCLUDE_TYPE = True

# Numeric columns only, without the original 'quality' score the target was derived from.
X_num = df.select_dtypes(include=[np.number]).drop(columns=["quality"], errors="ignore")

if not (INCLUDE_TYPE and "type" in df.columns):
    X = X_num.copy()
else:
    # One indicator column per wine type (both levels kept).
    type_dummies = pd.get_dummies(df["type"], prefix="type", drop_first=False)
    X = pd.concat([X_num, type_dummies], axis=1)

y = df[target_col]

# Stratified 80/20 hold-out split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

# Cell output: shapes of both parts, then the class counts of both parts.
(
    X_train.shape,
    X_test.shape,
    *(part.value_counts().to_dict() for part in (y_train, y_test)),
)


((5197, 13),
 (1300, 13),
 {'medium': 3979, 'high': 1021, 'low': 197},
 {'medium': 995, 'high': 256, 'low': 49})

### 4. Linear SVM

In [None]:
# === Linear SVM (OvR) with probability for log-loss ===
# One binary linear SVC per class, with standardisation inside the pipeline.
base_svc = SVC(kernel="linear", C=1, class_weight="balanced", probability=True, random_state=42)
lin_svm_ovr = make_pipeline(StandardScaler(), OneVsRestClassifier(base_svc))
lin_svm_ovr.fit(X_train, y_train)

y_pred_lin = lin_svm_ovr.predict(X_test)
y_proba_lin = lin_svm_ovr.predict_proba(X_test)

print("=== Linear SVM (OvR, 3-class) ===")
print(
    confusion_matrix(y_test, y_pred_lin),
    classification_report(y_test, y_pred_lin),
    sep="\n",
)

# The SVM itself optimises per-class hinge losses (OvR); cross-entropy is reported on
# its predicted probabilities. The label order is taken from the fitted OvR step so it
# matches the columns of y_proba_lin.
ovr_step = lin_svm_ovr.named_steps["onevsrestclassifier"]
ce_lin = log_loss(y_test, y_proba_lin, labels=ovr_step.classes_)
print(f"Cross-entropy (log-loss): {ce_lin:.4f}")



=== Linear SVM (OvR, 3-class) ===
[[203   6  47]
 [  8  17  24]
 [295  89 611]]
              precision    recall  f1-score   support

        high       0.40      0.79      0.53       256
         low       0.15      0.35      0.21        49
      medium       0.90      0.61      0.73       995

    accuracy                           0.64      1300
   macro avg       0.48      0.58      0.49      1300
weighted avg       0.77      0.64      0.67      1300

Cross-entropy (log-loss): 0.5268
