# Imports

In [11]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.svm import SVC

# Data Load and exploration

In [2]:
# Public URL of the pulsar_stars CSV used throughout this notebook.
data_path = "https://storage.googleapis.com/edulabs-public-datasets/pulsar_stars.csv"
# Read the remote CSV into a DataFrame (needs network access when the cell runs).
df = pd.read_csv(data_path)

In [3]:
# Display the loaded frame; the notebook renders a truncated preview (first and last rows).
df

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [4]:
# Class balance of the label column: number of rows per target class.
df["target_class"].value_counts()

Unnamed: 0_level_0,count
target_class,Unnamed: 1_level_1
0,16259
1,1639


In [10]:
# prompt: create plotly graph with distributions of all the features

# One histogram (with a marginal box plot) per feature, coloured by class.
# The label column itself is skipped: plotting target_class coloured by
# target_class only repeats the class counts and is not a feature distribution.
# `px` comes from the imports cell at the top of the notebook.
for column in df.columns.drop('target_class'):
    fig = px.histogram(df, x=column, color='target_class', marginal="box",
                       title=f'Distribution of {column} by Target Class',
                       labels={'target_class': 'Target Class'})
    fig.show()


Output hidden; open in https://colab.research.google.com to view.

# Data Split
**Splitting into train and test sets only, for demo purposes - in a real project you must either split into three data sets (train / validation / test) or use cross-validation!**

In [6]:
# Separate the label column from the feature columns.
X = df.drop(columns="target_class")
y = df["target_class"]

# Hold out 20% of the rows, stratified on the label so both parts keep the
# same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=77,
)

# Normalize the data - a must for SVM!

In [7]:
# prompt: normalize train and test and store normalized data in X_train_normalized and X_test_normalized dataframes

# Learn the scaling statistics from the training split only, then apply the same
# transformation to the test split so no test information leaks into the scaler.
scaler = StandardScaler()
# The scaler returns plain arrays; rebuild DataFrames that keep both the column
# names and the original row index, so rows stay label-aligned with y_train / y_test.
X_train_normalized = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_normalized = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)


# Run SVM with default hyperparameters

In [13]:
# Baseline model: an SVC with every hyperparameter left at its default.
svc=SVC()
# Train on the standardized training features.
svc.fit(X_train_normalized,y_train)

In [15]:
# Predicted class labels for the first ten rows of the standardized test split.
svc.predict(X_test_normalized.iloc[:10])

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [None]:
# An SVC built without probability=True does not offer predict_proba,
# so the call below is intentionally left commented out.
# svc.predict_proba(X_test_normalized[:10])

In [18]:
# Per-class precision / recall / F1 of the current model on the held-out split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=svc.predict(X_test_normalized),
    )
)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3252
           1       0.93      0.82      0.87       328

    accuracy                           0.98      3580
   macro avg       0.96      0.91      0.93      3580
weighted avg       0.98      0.98      0.98      3580



In [19]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
print(
    metrics.confusion_matrix(
        y_true=y_test,
        y_pred=svc.predict(X_test_normalized),
    )
)

[[3232   20]
 [  60  268]]


# Tune Parameters - C

In [23]:
# Refit with a very large C (much weaker regularization than the default).
svc = SVC(C=10000)
svc.fit(X_train_normalized, y_train)

# Report TEST first, then TRAIN, so the two can be compared for overfitting.
# Each split is predicted once and the predictions feed both metrics,
# instead of running the (costly) SVC prediction twice per split.
for split_name, features, labels in (
    ("TEST", X_test_normalized, y_test),
    ("TRAIN", X_train_normalized, y_train),
):
    predictions = svc.predict(features)
    print(split_name)
    print(metrics.classification_report(labels, predictions))
    print(metrics.confusion_matrix(labels, predictions))

TEST
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3252
           1       0.91      0.82      0.86       328

    accuracy                           0.98      3580
   macro avg       0.95      0.90      0.93      3580
weighted avg       0.98      0.98      0.98      3580

[[3227   25]
 [  60  268]]
TRAIN
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     13007
           1       0.98      0.88      0.93      1311

    accuracy                           0.99     14318
   macro avg       0.98      0.94      0.96     14318
weighted avg       0.99      0.99      0.99     14318

[[12980    27]
 [  151  1160]]


# Tune parameters - Choice of kernel

In [24]:
# Refit with a linear kernel (all other hyperparameters at their defaults).
svc = SVC(kernel="linear")
svc.fit(X_train_normalized, y_train)

# Report TEST first, then TRAIN, so the two can be compared for overfitting.
# Each split is predicted once and the predictions feed both metrics,
# instead of running the SVC prediction twice per split.
for split_name, features, labels in (
    ("TEST", X_test_normalized, y_test),
    ("TRAIN", X_train_normalized, y_train),
):
    predictions = svc.predict(features)
    print(split_name)
    print(metrics.classification_report(labels, predictions))
    print(metrics.confusion_matrix(labels, predictions))

TEST
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3252
           1       0.94      0.84      0.89       328

    accuracy                           0.98      3580
   macro avg       0.96      0.92      0.94      3580
weighted avg       0.98      0.98      0.98      3580

[[3234   18]
 [  53  275]]
TRAIN
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     13007
           1       0.94      0.82      0.88      1311

    accuracy                           0.98     14318
   macro avg       0.96      0.91      0.93     14318
weighted avg       0.98      0.98      0.98     14318

[[12941    66]
 [  232  1079]]


In [25]:
# Refit with a polynomial kernel (all other hyperparameters at their defaults).
svc = SVC(kernel="poly")
svc.fit(X_train_normalized, y_train)

# Report TEST first, then TRAIN, so the two can be compared for overfitting.
# Each split is predicted once and the predictions feed both metrics,
# instead of running the SVC prediction twice per split.
for split_name, features, labels in (
    ("TEST", X_test_normalized, y_test),
    ("TRAIN", X_train_normalized, y_train),
):
    predictions = svc.predict(features)
    print(split_name)
    print(metrics.classification_report(labels, predictions))
    print(metrics.confusion_matrix(labels, predictions))

TEST
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3252
           1       0.93      0.80      0.86       328

    accuracy                           0.98      3580
   macro avg       0.95      0.90      0.92      3580
weighted avg       0.98      0.98      0.98      3580

[[3232   20]
 [  66  262]]
TRAIN
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     13007
           1       0.95      0.80      0.87      1311

    accuracy                           0.98     14318
   macro avg       0.96      0.90      0.93     14318
weighted avg       0.98      0.98      0.98     14318

[[12949    58]
 [  261  1050]]


# Exercise




## Task 1

**Hyperparameter Optimization using GridSearch CV**

Use GridSearch to find the best model.

Use the following to set parameters for your grid search:






---

✅ **Most commonly tried values in grid search for SVM:**

### 📚 1. $C$ (Regularization parameter)

- Typical values (exponentially spaced):
$$
[0.001, 0.01, 0.1, 1, 10, 100, 1000]
$$
- You often search across a wide range (very small to very large) because $C$ can dramatically change model behavior.

---

### 📚 2. $\gamma$ (RBF Kernel parameter)

- Typical values (also exponentially spaced):
$$
[0.0001, 0.001, 0.01, 0.1, 1, 10]
$$
- $\gamma$ is very sensitive: small $\gamma$ = smoother decision boundary; large $\gamma$ = tighter, more complex boundary.

---

### 📚 3. $d$ (Degree of Polynomial Kernel)

- Typical values:
$$
[2, 3, 4, 5]
$$
- Rarely go higher than degree 5.
  - Higher degrees → massive model complexity → high risk of overfitting.
- In practice, degrees 2 (quadratic) or 3 (cubic) are the most common.

---



| Parameter | Common Values to Try |
|:---|:---|
| $C$ | [0.001, 0.01, 0.1, 1, 10, 100, 1000] |
| $\gamma$ | [0.0001, 0.001, 0.01, 0.1, 1, 10] |
| $d$ (Polynomial degree) | [2, 3, 4, 5] |

---

✅ **Pro Tip:**
- **Use exponential grids** first to explore rough values.
- **Zoom in** with finer grids if needed after a rough best region is found.

✅ **Another tip:**
- In scikit-learn, `gamma='scale'` is often a **good default** — it automatically sets $\gamma = \frac{1}{\text{n\_features} \cdot \mathrm{Var}(X)}$.



# Decision functions and thresholds



🔵 **By default in SVM (`sklearn.svm.SVC`):**

- After training, SVM predicts based on the **sign** of the decision function:
$$
\text{decision}(x) = w \cdot x + b
$$
- The standard threshold is **0**:
  - If decision(x) > 0  → predict class 1
  - If decision(x) < 0  → predict class -1 (or 0 depending on labels)

✅ **But** — you can **manually adjust** the threshold to change the sensitivity or specificity of the classifier!

---

# 📚 How to tune the threshold manually:

1. **Train the SVM normally.**
2. **Get the decision function values** (continuous scores) instead of class labels:
```python
scores = model.decision_function(X_val)
```
3. **Set a custom threshold**:
```python
threshold = 0.2  # example: shift the threshold
predictions = (scores > threshold).astype(int)
```
4. **Evaluate your metrics** (precision, recall, F1, etc.) based on this new threshold.

---

✅ **Why would you want to tune the threshold?**
- To **favor recall** over precision (e.g., in medical diagnosis, fraud detection).
- To **favor precision** over recall (e.g., spam detection where false positives are bad).
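- To **optimize a custom metric** (like F1-score). Note that ROC AUC itself does not depend on the threshold, but the ROC curve it summarizes is a good tool for choosing one.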

---


✅ **Extra note:**  
- If you use `probability=True` when training `SVC`, you can also use `predict_proba()` and apply thresholding on probabilities instead of raw scores.
- But: **Enabling `probability=True` makes SVM slower**, because it fits an extra **Platt scaling** (logistic regression on top of decision scores) to calibrate probabilities.

---

In [27]:
# Raw decision scores for the first twenty test rows. `svc` is whichever model
# was fitted most recently (the poly-kernel SVC when cells run top to bottom).
svc.decision_function(X_test_normalized.iloc[:20])

array([-1.2706558 , -1.12831177, -1.28466672,  7.68344566, -1.12272161,
       -0.17210062, -1.12595275, -1.08901699, -1.26026516, -1.67947468,
       -1.64102178, -1.20854751, -0.70164275, -1.11081022, -1.06175537,
       -1.12206921, -1.09400118, -1.10220987, -1.10137691, -1.07025863])

In [28]:
# Class labels for the same twenty rows, to compare against the sign of the
# decision scores shown in the previous cell.
svc.predict(X_test_normalized.iloc[:20])

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Task 2

* Calculate AUC
* Display the ROC curve and find the optimal threshold


# Task 3

As we've seen, the dataset is unbalanced.

Try improving the performance by:


* using weighted learning with the `class_weight` parameter
* using an oversampling technique to synthesize samples for the under-represented (minority) class
* combining both weighted learning and oversampling

