In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from mlutils import decision_regions

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Mis-steps in machine learning

First we'll import some data. I'm using an extract from the [Rock Property Catalog](https://subsurfwiki.org/wiki/Rock_Property_Catalog).

In [None]:
# Read the CSV straight from its URL into a DataFrame (this needs network access).
url = "https://raw.githubusercontent.com/scienxlab/datasets/refs/heads/main/rpc/rpc-4-lithologies.csv"
df = pd.read_csv(url)

# Preview the first few rows.
df.head()

We're going to predict lithology from `Vp` and `rho`.

It's usually easier to look at a plot:

### ❓ What do we think of this dataset?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

## A linear model: SVM

The **support vector machine** or SVM is a good model to start supervised classification with. With a linear kernel, it attempts to separate the classes with straight lines (hyperplanes, in more than two dimensions).

We will make a prediction called `y_pred`...

In [None]:
# Plot window: [x_min, x_max, y_min, y_max], also handed to decision_regions.
extent = [1400, 6500, 1.9, 2.9]
grid_step = (2, 0.005)

# NOTE(review): decision_regions comes from mlutils; presumably y_pred holds the
# predictions for X and y_all the values over a grid spanning `extent` -- confirm.
y_pred, y_all = decision_regions(svc, X, y, extent, step=grid_step)

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 6))
plt.imshow(y_all <= 0.0, extent=extent, **region_style)
plt.scatter(*X.T, c=is_sand(y), s=80, cmap='bwr')  # Large markers, coloured from the labels y.
plt.scatter(*X.T, c=is_sand(y_pred))               # Default-size markers, coloured from y_pred.
plt.show()

And we'd like an accuracy score:

### ❓ What do we think of this?

- What is 'good performance'?
- What could make a good benchmark?
- What would make a good lower bound? (Check out [the imbalance notebook](Balance_classes_with_SMOTE.ipynb).)
- Is there an upper bound?


<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

----

## Validation

We only tested the model performance against the training data. We need to check against some rocks the model has not seen before.

### ❓ What could we do about it? What rocks can we use? How many do we need?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

Split off the first 300 rows:

In [None]:
# Sequential split: the first 300 rows become the training set, the remainder
# the validation set.
n_train = 300
X_train, X_val = X[:n_train], X[n_train:]
y_train, y_val = y[:n_train], y[n_train:]

# Draw train first and val second, so they come out as blue and orange points.
for subset, name in ((X_train, 'train'), (X_val, 'val')):
    plt.scatter(*subset.T, label=name)
plt.legend()

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

That's no good. We need a random split instead.

In [None]:
# Show where the current training and validation points sit in feature space.
# NOTE(review): this cell only plots; it relies on X_train / X_val having been
# reassigned beforehand -- nothing in this cell performs the split.
for subset, name in ((X_train, 'train'), (X_val, 'val')):
    plt.scatter(*subset.T, label=name)
plt.legend()

### ❓ Can you think of some occasions when a random split might not be okay?

- Certain kinds of data?
- Certain proportions of classes?
- Do we really want to roll the dice every time?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

In [None]:
# Show where the current training and validation points sit in feature space.
# NOTE(review): this cell only plots; it relies on X_train / X_val having been
# reassigned beforehand -- nothing in this cell performs the split.
for subset, name in ((X_train, 'train'), (X_val, 'val')):
    plt.scatter(*subset.T, label=name)
plt.legend()

### ❓ Will the model be better or worse now?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

Because we're using less data to train, the model will probably be slightly less predictive; and because we're no longer scoring it against data it was trained on, the reported accuracy will probably drop a little. But we can trust that score more as an estimate of accuracy on future data.

In [None]:
# Fit a linear-kernel SVC on the training split (fit returns the estimator itself).
svc = SVC(kernel='linear').fit(X_train, y_train)

# Score it on the held-out validation split.
y_pred = svc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_accuracy)

Do we care about the score against the training data?

Would we expect it to be lower or higher than the score against the validation data?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

In [None]:
# Accuracy on the data the model was fitted to, for comparison with the
# validation score.
y_pred_train = svc.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print(train_accuracy)

Let's check how we're doing.

In [None]:
# NOTE(review): decision_regions comes from mlutils; presumably y_pred holds the
# predictions for X_val and y_all the values over a grid spanning `extent` -- confirm.
grid_step = (2, 0.005)
y_pred, y_all = decision_regions(svc, X_val, y_val, extent, step=grid_step)

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 6))
plt.imshow(y_all <= 0.0, extent=extent, **region_style)
# Training points as faint '+' markers; validation points as large markers
# (labels) with default-size markers (predictions) drawn on top.
plt.scatter(*X_train.T, c=is_sand(y_train), marker='+', cmap='bwr', alpha=0.5)
plt.scatter(*X_val.T, c=is_sand(y_val), s=80, cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_pred))

---

## A more complex model

Notice that the model is linear. This makes the possibly big assumption that the decision boundary is linear in the feature space.


### ❓ Will a non-linear model do better or worse?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

Let's try the default non-linear SVM, which uses a 'radial basis function' kernel.

In [None]:
# Fit an RBF-kernel SVC on the training split (fit returns the estimator itself).
svc = SVC(kernel='rbf').fit(X_train, y_train)

# Score it on the held-out validation split.
y_pred = svc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_accuracy)

In [None]:
# NOTE(review): decision_regions comes from mlutils; presumably y_pred holds the
# predictions for X_val and y_all the values over a grid spanning `extent` -- confirm.
grid_step = (2, 0.005)
y_pred, y_all = decision_regions(svc, X_val, y_val, extent, step=grid_step)

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 6))
plt.imshow(y_all <= 0.0, extent=extent, **region_style)
# Training points as faint '+' markers; validation points as large markers
# (labels) with default-size markers (predictions) drawn on top.
plt.scatter(*X_train.T, c=is_sand(y_train), marker='+', cmap='bwr', alpha=0.5)
plt.scatter(*X_val.T, c=is_sand(y_val), s=80, cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_pred))

### ❓ The model is terrible. Why?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

In [None]:
# Colour and size the markers from the same is_sand(y) values: size is 20 where
# the value is 0 and 40 where it is 1.
sand_flags = is_sand(y)
plt.scatter(*X.T, c=sand_flags, s=20 * (1 + sand_flags), cmap='bwr')
plt.axis('equal')  # <-- So we can see the data space as sklearn 'sees' it.

In [None]:
# A much wider window than the data occupies, used for both the grid and the image.
wide_extent = [-500, 8000, -2500, 2500]
y_pred, y_all = decision_regions(svc, X_val, y_val, wide_extent, step=(20, 20))

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 6))
plt.imshow(y_all <= 0.0, extent=wide_extent, **region_style)
# Colour and size the training markers from the same is_sand(y_train) values.
train_sand_flags = is_sand(y_train)
plt.scatter(*X_train.T, c=train_sand_flags, s=20 * (1 + train_sand_flags), cmap='bwr')
plt.axis('equal')  # <-- So we can see the data space as sklearn 'sees' it.

### ❓ What can we do about it?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

---

## Standardization

We gave the model our raw data. We get away with it with the linear model, but any algorithm that depends on distance — whether in the learning algorithm itself, the cost function, or the regularization — works better with standardized data. That way, it works in 'Z-score' space. (When you plot with `matplotlib`, it does a min/max scaling on both axes so that the points look reasonable. It's a similar idea.)

In [None]:
# Create the scaler only; nothing in this cell fits it or applies it to the data.
# NOTE(review): later cells plot within [-3, 3], so they presumably expect
# X_train / X_val to have been standardized by this point -- confirm.
scaler = StandardScaler()



### ❓ Now what do I do?

<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>

In [None]:
# Show the current training and validation points with equal axis scaling.
for subset, name in ((X_train, 'train'), (X_val, 'val')):
    plt.scatter(*subset.T, label=name)
plt.legend()
plt.axis('equal')  # <-- So we can see the data space as sklearn 'sees' it.
plt.show()

The linear model should do about as well as before:

In [None]:
# Refit the linear-kernel SVC on the current training split (fit returns the
# estimator itself).
svc = SVC(kernel='linear').fit(X_train, y_train)

# Score it on the validation split.
y_pred = svc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_accuracy)

But the non-linear model will be much better:

In [None]:
# Refit the RBF-kernel SVC on the current training split (fit returns the
# estimator itself).
svc = SVC(kernel='rbf').fit(X_train, y_train)

# Score it on the validation split.
y_pred = svc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_accuracy)

In [None]:
# New plot window; this rebinds the notebook-level `extent` used by later cells.
extent = [-3, 3, -3, 3]
grid_step = 0.01

y_pred, y_all = decision_regions(svc, X_val, y_val, extent, step=(grid_step, grid_step))

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 10))
plt.imshow(y_all <= 0.0, extent=extent, **region_style)
# Training points as faint '+' markers; validation points as large markers
# (labels) with default-size markers (predictions) drawn on top.
plt.scatter(*X_train.T, c=is_sand(y_train), marker='+', cmap='bwr', alpha=0.5)
plt.scatter(*X_val.T, c=is_sand(y_val), s=80, cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_pred))
plt.axis('equal')
# Crop the view slightly inside the gridded window.
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)

Notice at the bottom that the model is still not suitable for extrapolation.

---

## Untuned model

We didn't try to adjust hyperparameters to get a better fit. Turns out, if you do this, the model does better with a different value for `C`:

In [None]:
# Sweep the SVC parameter C over 15 log-spaced values from 1e-3 to 1e4 and
# record the validation accuracy for each. SVC is otherwise left at its defaults.
Cs = np.logspace(-3, 4, 15)

vals = []
for C in Cs:
    # `svc` is rebound on every pass, so it ends up as the last model fitted.
    svc = SVC(C=C).fit(X_train, y_train)
    vals.append(svc.score(X_val, y_val))

plt.plot(Cs, vals, 'o-')
plt.xscale('log')
plt.ylim(0.9, 1.0)  # Zoom in on the top of the range; scores below 0.9 fall off the plot.
plt.xlabel('C')
plt.ylabel('Validation accuracy')
plt.show()

In [None]:
# Fit an SVC with C=100 (other parameters left at their defaults) on the
# training split; fit returns the estimator itself.
svc = SVC(C=100).fit(X_train, y_train)

# Score it on the validation split.
y_pred = svc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(val_accuracy)

# Large markers coloured from the labels, default-size markers from the predictions.
plt.scatter(*X_val.T, c=is_sand(y_val), s=80, cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_pred))

In [None]:
# NOTE(review): `step` is a scalar here, whereas other cells pass an (x, y) tuple;
# presumably decision_regions accepts both forms -- confirm.
grid_step = 0.02
y_pred, y_all = decision_regions(svc, X_val, y_val, extent, step=grid_step)

# Styling for the background image of the decision regions.
region_style = dict(origin='lower', aspect='auto', alpha=0.5, interpolation='none')

plt.figure(figsize=(10, 10))
plt.imshow(y_all <= 0.0, extent=extent, **region_style)
# Training points as '+' markers; validation points as large markers (labels)
# with default-size markers (predictions) drawn on top.
plt.scatter(*X_train.T, c=is_sand(y_train), marker='+', cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_val), s=80, cmap='bwr')
plt.scatter(*X_val.T, c=is_sand(y_pred))
plt.axis('equal')
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.show()

Now we have to make a decision about what we think is more reasonable.

After that, there are plenty more gotchas:

- We have assumed that the labels are correct and the data is accurate.
- A few hundred records is not much data; we should be careful about where we apply this model.
- We have only tried one model type, and have not tuned all of its hyperparameters.
- As we add more features, we have to remember the curse of dimensionality.
- As we try more things, we need to start using a `test` dataset.

---

&copy; 2023 Matt Hall, licensed CC BY