# Splitting autocorrelated data

We'll load some autocorrelated data (well logs) and then try learning on it, with and without leakage.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Zipped CSV of well logs; the first column of the file is used as the index.
well_logs = 'https://geocomp.s3.amazonaws.com/data/Colorado_wells.csv.zip'
df = pd.read_csv(filepath_or_buffer=well_logs, index_col=0)

# Summary statistics of the columns, shown as the cell output.
df.describe()

In [None]:
# Keep only fully populated rows: a row with any missing value is discarded.
df = df.dropna(axis=0, how='any')

In [None]:
# Retain only the rows whose DT value is above 50.
dt_above_50 = df['DT'] > 50
df = df.loc[dt_above_50]

In [None]:
# Add the base-10 logarithm of RES as a new column. `assign` returns a new
# frame rather than writing into `df`, which is the result of boolean filtering
# and can therefore raise pandas' SettingWithCopyWarning on column assignment.
# The resulting frame is the same, and the cell stays safe to re-run.
df = df.assign(log_RES=np.log10(df['RES']))

In [None]:
# Preview the first five rows of the cleaned frame, including the new column.
df.head(n=5)

In [None]:
# Number of distinct values in the Well column (missing values, if any, count as one).
df['Well'].nunique(dropna=False)

## Reduce dataset size and make holdout

The effect I want to illustrate will show up more obviously in a relatively small dataset. Let's start with only 10 wells, but we can change it later and run again.

In [None]:
# Partition the rows on the well number: wells numbered 80 and up form the
# working set for training/validation; the lower-numbered wells are the holdout.
well_number = df['Well']
df_ = df.loc[well_number >= 80]
test = df.loc[well_number < 80]

In [None]:
# Distinct well numbers present in the working set, in order of first appearance.
pd.unique(df_['Well'])

## Split naively

In [None]:
from sklearn.model_selection import train_test_split

# Four columns are the model inputs; DT is the regression target.
features = ['GR', 'log_RES', 'NPHI', 'RHOB']

# Arrays for the holdout wells.
X_test = test[features].to_numpy()
y_test = test['DT'].to_numpy()

# Arrays for the working wells.
X = df_[features].to_numpy()
y = df_['DT'].to_numpy()

# Row-wise random split with 60% of the rows for training: rows from the same
# well can land on both sides of the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, LinearRegression

# Standardise the features, then fit a ridge regression; both use default settings.
steps = (StandardScaler(), Ridge())
pipe = make_pipeline(*steps)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Train on the training split, then predict the validation split.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)

# Cell output is the (RMSE, R2) pair on the validation split.
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
r2_val = r2_score(y_val, y_pred)
rmse_val, r2_val

Let's look at the performance on the holdout:

In [None]:
# Predict the holdout wells with the currently fitted pipeline and report the scores.
y_pred_test = pipe.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
print("RMSE:", rmse_test)
print("R2:  ", r2_test)

## Split by well

Note that wells can still be spatially correlated; we don't have location info in this dataset.

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# A single split that keeps each well wholly on one side, with 60% of the
# wells assigned to training.
gss = GroupShuffleSplit(n_splits=1, train_size=0.6, random_state=42)

# Only one split is generated, so take the first (and only) index pair.
train_idx, val_idx = next(gss.split(X, y, groups=df_['Well']))

In [None]:
# The indices produced by the group split are positions into X and y (the
# arrays built from the working subset df_), so they must be applied to those
# arrays. Indexing the full frame df with them would select unrelated rows,
# possibly including rows from the holdout wells.
X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [None]:
# Refit the same pipeline on the current training arrays and predict the
# current validation arrays.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)

# Cell output is the (RMSE, R2) pair on the validation split.
val_scores = (
    np.sqrt(mean_squared_error(y_val, y_pred)),
    r2_score(y_val, y_pred),
)
val_scores

In [None]:
# Predict the holdout wells with the refitted pipeline and report the scores.
y_pred_test_correct = pipe.predict(X_test)
rmse_test_correct = np.sqrt(mean_squared_error(y_test, y_pred_test_correct))
r2_test_correct = r2_score(y_test, y_pred_test_correct)
print("RMSE:", rmse_test_correct)
print("R2:  ", r2_test_correct)

## Compare

How do they compare qualitatively?

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 3))

# Overlay the first 500 holdout samples: the actual DT values and the
# predictions stored from each of the two fitted models.
n_show = 500
ax.plot(y_test[:n_show], label='Actual')
ax.plot(y_pred_test[:n_show], label='Leaky')
ax.plot(y_pred_test_correct[:n_show], label='Correct')

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    title='Holdout wells: actual vs. predicted DT',
    xlabel='Sample index',
    ylabel='DT',
)
ax.legend();  # trailing semicolon hides the Legend repr in the cell output

---

&copy; 2023 Matt Hall, licensed CC BY