In [2]:
import pandas as pd
import creep_event_picker as cep
import numpy as np
from tqdm import tqdm
import scipy




In [3]:
# Load the data from the CSV file
data = pd.read_csv('../../Data/DATA_tidied/CSV/xhr2.csv', parse_dates=['Time'])

# Assuming 'Time' is the timestamp column and 'Slip' is the value column
time = pd.to_datetime(data['Time'])
slip = data['Slip']

# Upsample the data to 1-minute intervals using cep.interpolate
tm_int, slip_int, upsampled = cep.interpolate(time, slip, 10)  # 1-minute interval

sos = scipy.signal.butter(4,[1/7200,1/120], 'band',output = 'sos',fs=0.10) #bandpass filter for 2hrs and 5days
creep_data  = scipy.signal.sosfiltfilt(sos,slip_int) # filter the data
time_series_data = pd.Series(creep_data, index=pd.to_datetime(tm_int))

In [4]:
event_catalogue = pd.read_csv('../../Data/all_creep_event_picks_new_qc_Oct_02_2024.csv',index_col=0)
event_catalogue.drop(event_catalogue[event_catalogue['Creepmeter_abbrv']!='XHR2'].index,inplace=True)
event_catalogue.reset_index(inplace=True,drop=True)
event_catalogue['start_time'] = pd.to_datetime(event_catalogue['ST'])
event_catalogue['end_time'] = pd.to_datetime(event_catalogue['ET'])

In [5]:
# Assuming `time_series_data` is a Pandas Series with the time series values
# and `event_catalogue` is a DataFrame with the event start and end times

# Initialize a labels array with zeros
labels = np.zeros(len(time_series_data), dtype=int)

# Label the data based on event start and end times
for _, row in event_catalogue.iterrows():
    start_time = row['start_time']
    end_time = row['end_time']
    
    # Find the index positions for start and end times
    start_index = time_series_data.index.searchsorted(start_time, side='left')
    end_index = time_series_data.index.searchsorted(end_time, side='right') - 1
    
    # Ensure indices are within bounds
    start_index = min(start_index, len(time_series_data) - 1)
    end_index = min(end_index, len(time_series_data) - 1)
    
    # Label the range between start and end indices
    if start_index <= end_index:
        labels[start_index:end_index + 1] = 1  # +1 to include the end index

# Combine into a DataFrame if needed
data_with_labels = pd.DataFrame({
    'value': time_series_data,
    'label': labels
})

# Optional: Display the first few rows of the labeled data
print(data_with_labels.head())


                        value  label
1991-11-24 00:00:00 -0.009379      0
1991-11-24 00:10:00 -0.015654      0
1991-11-24 00:20:00 -0.021321      0
1991-11-24 00:30:00 -0.025900      0
1991-11-24 00:40:00 -0.029126      0


In [6]:
# Assume 'data_with_labels' is your DataFrame with 'value' and 'label' columns

# Create rolling features
data_with_labels['rolling_mean'] = data_with_labels['value'].rolling(window=60).mean()  # Mean over the last hour
data_with_labels['rolling_std'] = data_with_labels['value'].rolling(window=60).std()   # Std over the last hour

# Create lag features
data_with_labels['lag_1'] = data_with_labels['value'].shift(1)  # Previous value
data_with_labels['lag_2'] = data_with_labels['value'].shift(2)  # Value from two time steps back

# Cumulative features
data_with_labels['cumulative_sum'] = data_with_labels['value'].cumsum()
data_with_labels['cumulative_mean'] = data_with_labels['value'].expanding().mean()


# Fill NaNs created by rolling or lag features
data_with_labels.fillna(method='bfill', inplace=True)  # Backfill NaNs
data_with_labels.fillna(method='ffill', inplace=True)  # Forward fill if needed

  data_with_labels.fillna(method='bfill', inplace=True)  # Backfill NaNs
  data_with_labels.fillna(method='ffill', inplace=True)  # Forward fill if needed


In [7]:
data_with_labels

Unnamed: 0,value,label,rolling_mean,rolling_std,lag_1,lag_2,cumulative_sum,cumulative_mean
1991-11-24 00:00:00,-0.009379,0,-0.028697,0.003331,-0.009379,-0.009379,-0.009379,-0.009379
1991-11-24 00:10:00,-0.015654,0,-0.028697,0.003331,-0.009379,-0.009379,-0.025034,-0.012517
1991-11-24 00:20:00,-0.021321,0,-0.028697,0.003331,-0.015654,-0.009379,-0.046355,-0.015452
1991-11-24 00:30:00,-0.025900,0,-0.028697,0.003331,-0.021321,-0.015654,-0.072254,-0.018064
1991-11-24 00:40:00,-0.029126,0,-0.028697,0.003331,-0.025900,-0.021321,-0.101381,-0.020276
...,...,...,...,...,...,...,...,...
2005-06-26 23:40:00,0.004005,0,0.000900,0.006366,0.004169,0.003741,-1.366874,-0.000002
2005-06-26 23:50:00,0.003129,0,0.000808,0.006291,0.004005,0.004169,-1.363744,-0.000002
2005-06-27 00:00:00,0.001537,0,0.000683,0.006199,0.003129,0.004005,-1.362208,-0.000002
2005-06-27 00:10:00,-0.000644,0,0.000514,0.006092,0.001537,0.003129,-1.362851,-0.000002


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Step 1: Define features and target
X = data_with_labels.drop(columns=['value', 'label'])
y = data_with_labels['label']

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Reshape the data for LSTM
# Assuming your data has a single timestep, adjust if necessary
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Step 4: Build the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Step 5: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 6: Train the model
model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Step 7: Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


In [12]:
del event_catalogue, creep_data,tm_int,slip_int,slip, time, data

In [13]:


def create_windows(data, labels, window_size):
    # Ensure the data and labels are numpy arrays
    data = data.values  # Convert to numpy array if it's not already
    labels = labels.values  # Convert to numpy array if it's not already

    # Calculate the number of windows
    num_windows = len(data) - window_size + 1

    # Create 3D array for the windows
    X = np.empty((num_windows, window_size))  # Pre-allocate memory for efficiency

    # Use tqdm to display progress
    for i in tqdm(range(num_windows), desc="Creating windows"):
        X[i] = data[i:i + window_size]  # Fill the window

    # Create labels for each window using the last label in each window
    y = labels[window_size - 1:]  # Get labels for the last point in each window
    
    return X, y

# Define your window size (e.g., 1440 for 1 day if data is at 1-minute intervals)
window_size = 1440  # Change as needed

# Create windows of data
X, y = create_windows(data_with_labels['value'], data_with_labels['label'], window_size)

# Check the shape of the output
print("Shape of X:", X.shape)  # Should be (num_samples, window_size)
print("Shape of y:", y.shape)  # Should be (num_samples,)



Creating windows: 100%|██████████| 7146742/7146742 [00:45<00:00, 157531.17it/s]

Shape of X: (7146742, 1440)
Shape of y: (7146742,)





  data_with_labels.fillna(method='bfill', inplace=True)  # Backfill NaNs
  data_with_labels.fillna(method='ffill', inplace=True)  # Forward fill if needed


In [7]:
# Reshape X to be 3D for LSTM (samples, time steps, features)
X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))  # Assuming 1 feature


In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42, shuffle=False)

# Check the shapes of the resulting datasets
print("Training data shape:", X_train.shape)  # Should be (num_samples_train, window_size, 1)
print("Testing data shape:", X_test.shape)    # Should be (num_samples_test, window_size, 1)
print("Training labels shape:", y_train.shape)  # Should match num_samples_train
print("Testing labels shape:", y_test.shape)    # Should match num_samples_test


Training data shape: (570704, 1440, 1)
Testing data shape: (142676, 1440, 1)
Training labels shape: (570704,)
Testing labels shape: (142676,)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Build the LSTM model
model = Sequential()

# Add LSTM layer
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))  # 50 units, returns sequences for the next layer
model.add(Dropout(0.2))  # Dropout layer to prevent overfitting

# Add a second LSTM layer
model.add(LSTM(50, return_sequences=False))  # Last LSTM layer does not return sequences
model.add(Dropout(0.2))

# Add the output layer
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()
