In [6]:
import pandas as pd
import numpy as np
import h5py
import creep_event_picker as cep
from sklearn.preprocessing import StandardScaler
import scipy
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:


# Load the HDF5 data
with h5py.File('../../Data/DATA_tidied/HDF5/CWN1.h5', 'r') as hdf:
    # Access a specific dataset
    dataset = hdf['CWN1_10.0mins']
    slip = dataset['Slip_CWN1_10.0mins'][:]
    time = dataset['Time_CWN1_10.0mins'][:]
    
    # Decode the time data
    decoded_time = [byte_str.decode('utf-8') for byte_str in time]
    decoded_time = pd.to_datetime(decoded_time)

    # Interpolate data
    tm_int, creep_int, upsampled = cep.interpolate(decoded_time, slip, 10)
    sos = scipy.signal.butter(4,[1/7200,1/120], 'band',output = 'sos',fs=0.10) #bandpass filter for 2hrs and 5days
    creep_data  = scipy.signal.sosfiltfilt(sos,creep_int) # filter the data
    time_series_data = pd.Series(creep_data, index=pd.to_datetime(tm_int))

# Load the event catalogue
event_catalogue = pd.read_csv('../../Data/all_creep_event_picks_new_qc_Oct_02_2024.csv',index_col=0)
event_catalogue.drop(event_catalogue[event_catalogue['Creepmeter_abbrv']!='CWN1'].index,inplace=True)
event_catalogue.drop(event_catalogue[event_catalogue['File_code']!='cwn_0'].index,inplace=True)
event_catalogue.reset_index(inplace=True,drop=True)
event_catalogue['start_time'] = pd.to_datetime(event_catalogue['ST'])
event_catalogue['end_time'] = pd.to_datetime(event_catalogue['ET'])
event_catalogue

# Initialize a labels array with zeros
labels = np.zeros(len(time_series_data), dtype=int)

# Label the data based on event start and end times
for _, row in event_catalogue.iterrows():
    start_time = row['start_time']
    end_time = row['end_time']
    
    # Find the start index for the event
    start_index = time_series_data.index.searchsorted(start_time, side='left')
    # Find the end index for the event (add one to include the last point in the event)
    end_index = time_series_data.index.searchsorted(end_time, side='right') - 1
    
    # Ensure indices are within bounds
    start_index = min(start_index, len(time_series_data) - 1)
    end_index = min(end_index, len(time_series_data) - 1)
    
    # Label the range between start and end indices
    if start_index <= end_index:
        labels[start_index:end_index + 1] = 1  # +1 to include the end index

# Combine into a DataFrame if needed
data_with_labels = pd.DataFrame({
    'value': time_series_data,
    'label': labels
})

# Optional: Display the first few rows of the labeled data
print(data_with_labels)  # Display more rows for better context


                        value  label
1991-11-24 00:00:00 -0.007280      0
1991-11-24 00:10:00 -0.000823      0
1991-11-24 00:20:00  0.004949      0
1991-11-24 00:30:00  0.009505      0
1991-11-24 00:40:00  0.012568      0
...                       ...    ...
2020-08-28 06:00:00  0.012500      0
2020-08-28 06:10:00  0.011360      0
2020-08-28 06:20:00  0.009665      0
2020-08-28 06:30:00  0.007517      0
2020-08-28 06:40:00  0.005115      0

[1512761 rows x 2 columns]


In [7]:
'''def extract_features(time_series, labels, window_size):
    features = []
    extracted_labels = []

    # Iterate through the time series data using a sliding window
    for i in range(len(time_series) - window_size + 1):
        window = time_series[i:i + window_size]
        
        # Calculate features for the current window
        mean = window.mean()
        std = window.std()
        min_val = window.min()
        max_val = window.max()
        median = window.median()
        q25 = np.percentile(window, 25)
        q75 = np.percentile(window, 75)
        
        features.append([mean, std, min_val, max_val, median, q25, q75])
        
        # Access labels using .iloc to avoid the FutureWarning
        extracted_labels.append(labels.iloc[i + window_size - 1])  # Use the label for the last point in the window

    # Convert to numpy arrays
    features = np.array(features)
    extracted_labels = np.array(extracted_labels)

    return features, extracted_labels'''



def extract_features(time_series, labels, window_size):
    features = []
    extracted_labels = []

    # Use tqdm to create a progress bar for the loop
    for i in tqdm(range(len(time_series) - window_size + 1), desc="Extracting features"):
        window = time_series[i:i + window_size]
        
        # Calculate features
        mean = window.mean()
        std = window.std()
        min_val = window.min()
        max_val = window.max()
        median = window.median()
        q25 = np.percentile(window, 25)
        q75 = np.percentile(window, 75)
        rms = np.sqrt(np.mean(window**2))  # Root Mean Square
        skewness = scipy.stats.skew(window)  # Skewness
        kurtosis = scipy.stats.kurtosis(window)  # Kurtosis
        
        features.append([mean, std, min_val, max_val, median, q25, q75, rms, skewness, kurtosis])
        
        extracted_labels.append(labels.iloc[i + window_size - 1])

    features = np.array(features)
    extracted_labels = np.array(extracted_labels)

    return features, extracted_labels



# Define window size
window_size = 432  # Example: 3 days worth of data

# Extract features
X, y = extract_features(data_with_labels['value'], data_with_labels['label'], window_size)

# Optionally, standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Combine features and labels into a DataFrame
features_df = pd.DataFrame(X_scaled, columns=['mean', 'std', 'min', 'max', 'median', 'q25', 'q75'])
features_df['label'] = y

# Display the first few rows of the features DataFrame
print(features_df.head())


Extracting features:  17%|█▋        | 251455/1512330 [05:17<26:34, 790.76it/s]


KeyboardInterrupt: 

In [4]:

# Assuming X and y are already defined and the model has been trained

# Step 2.4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Print the sizes of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Step 3.5: Train a Random Forest Model
rf_model = RandomForestClassifier(class_weight='balanced',n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
print("Random Forest model trained successfully.")

# Step 3.6: Evaluate the Model's Performance
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}\n")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Optionally, you can also display the individual metrics
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]  # True Positives
TN = cm[0, 0]  # True Negatives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives

# Calculate Precision, Recall, and F1-Score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("\nCustom Metrics Calculation:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")


Training set size: 1209864 samples
Testing set size: 302466 samples
Random Forest model trained successfully.
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    278386
           1       1.00      1.00      1.00     24080

    accuracy                           1.00    302466
   macro avg       1.00      1.00      1.00    302466
weighted avg       1.00      1.00      1.00    302466

Confusion Matrix:
[[278327     59]
 [    79  24001]]

Custom Metrics Calculation:
Precision: 1.00
Recall: 1.00
F1-Score: 1.00


In [5]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.close('all')

# Assuming 'data_with_labels' is your DataFrame with 'value', 'label', and 'predictions'
data_with_labels['predictions'] = np.zeros(len(data_with_labels), dtype=int)  # Initialize with zeros
data_with_labels['predictions'][:len(y_pred)] = y_pred  # Assign predictions to the corresponding length

plt.figure(figsize=(15, 6))

# Plot the entire time series in blue
plt.plot(data_with_labels.index, data_with_labels['value'], color='blue', label='Creep Data', alpha=0.5)

# Highlight the actual event times in red
actual_event_times = data_with_labels.index[data_with_labels['label'] == 1]
plt.scatter(actual_event_times, 
            data_with_labels['value'][data_with_labels['label'] == 1],
            color='red', label='Actual Events', marker='o', s=50)  # Use markers for visibility

# Highlight the predicted event times in green
predicted_event_times = data_with_labels.index[data_with_labels['predictions'] == 1]
plt.scatter(predicted_event_times, 
            data_with_labels['value'][data_with_labels['predictions'] == 1],
            color='green', label='Predicted Events', marker='o', s=50,alpha=0.5)  # Use markers for visibility

# Add labels and title
plt.xlabel('Time')
plt.ylabel('Creep Value')
plt.title('Time Series with Highlighted Actual and Predicted Event Times')
plt.legend()
plt.grid()

# Show the plot
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_labels['predictions'][:len(y_pred)] = y_pred  # Assign predictions to the corresponding length


In [9]:
import pandas as pd
import numpy as np
import h5py
import creep_event_picker as cep
from sklearn.preprocessing import StandardScaler
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import scipy.stats

# Load the HDF5 data
with h5py.File('../../Data/DATA_tidied/HDF5/XHR2.h5', 'r') as hdf:
    # Access a specific dataset
    dataset = hdf['XHR2_10.0mins']
    slip = dataset['Slip_XHR2_10.0mins'][:]
    time = dataset['Time_XHR2_10.0mins'][:]
    
    # Decode the time data
    decoded_time = [byte_str.decode('utf-8') for byte_str in time]
    decoded_time = pd.to_datetime(decoded_time)

    # Interpolate data
    tm_int, creep_int, upsampled = cep.interpolate(decoded_time, slip, 10)
    sos = scipy.signal.butter(4,[1/7200,1/120], 'band',output='sos',fs=0.10)  # bandpass filter for 2hrs and 5days
    creep_data  = scipy.signal.sosfiltfilt(sos, creep_int)  # filter the data
    time_series_data = pd.Series(creep_data, index=pd.to_datetime(tm_int))

# Load the event catalogue
event_catalogue = pd.read_csv('../../Data/all_creep_event_picks_new_qc_Oct_02_2024.csv', index_col=0)
event_catalogue.drop(event_catalogue[event_catalogue['Creepmeter_abbrv'] != 'XHR2'].index, inplace=True)
event_catalogue.reset_index(inplace=True, drop=True)
event_catalogue['start_time'] = pd.to_datetime(event_catalogue['ST'])
event_catalogue['end_time'] = pd.to_datetime(event_catalogue['ET'])

# Initialize a labels array with zeros
labels = np.zeros(len(time_series_data), dtype=int)

# Label the data based on event start and end times
for _, row in event_catalogue.iterrows():
    start_time = row['start_time']
    end_time = row['end_time']
    
    # Find the start index for the event
    start_index = time_series_data.index.searchsorted(start_time, side='left')
    # Find the end index for the event (add one to include the last point in the event)
    end_index = time_series_data.index.searchsorted(end_time, side='right') - 1
    
    # Ensure indices are within bounds
    start_index = min(start_index, len(time_series_data) - 1)
    end_index = min(end_index, len(time_series_data) - 1)
    
    # Label the range between start and end indices
    if start_index <= end_index:
        labels[start_index:end_index + 1] = 1  # +1 to include the end index

# Combine into a DataFrame if needed
data_with_labels = pd.DataFrame({
    'value': time_series_data,
    'label': labels
})

# Function to extract features with tqdm progress bar
def extract_features(time_series, labels, window_size):
    features = []
    extracted_labels = []

    # Use tqdm to create a progress bar for the loop
    for i in tqdm(range(len(time_series) - window_size + 1), desc="Extracting features"):
        window = time_series[i:i + window_size]
        
        # Calculate features
        mean = window.mean()
        std = window.std()
        min_val = window.min()
        max_val = window.max()
        median = window.median()
        q25 = np.percentile(window, 25)
        q75 = np.percentile(window, 75)
        rms = np.sqrt(np.mean(window**2))  # Root Mean Square
        skewness = scipy.stats.skew(window)  # Skewness
        kurtosis = scipy.stats.kurtosis(window)  # Kurtosis
        
        # Append all features to the list
        features.append([mean, std, min_val, max_val, median, q25, q75, rms, skewness, kurtosis])
        
        # Access labels using .iloc to avoid the FutureWarning
        extracted_labels.append(labels[i + window_size - 1])  # Use the label for the last point in the window

    # Convert to numpy arrays
    features = np.array(features)
    extracted_labels = np.array(extracted_labels)

    return features, extracted_labels

# Define window size
window_size = 432  # Example: 3 days worth of data

# Extract features with a progress bar
X, y = extract_features(data_with_labels['value'], data_with_labels['label'], window_size)

# Optionally, standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Combine features and labels into a DataFrame
features_df = pd.DataFrame(X_scaled, columns=['mean', 'std', 'min', 'max', 'median', 'q25', 'q75', 'rms', 'skewness', 'kurtosis'])
features_df['label'] = y

# Display the first few rows of the features DataFrame
print(features_df.head())

# Step 2.4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
# Print the sizes of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Step 3.5: Train a Random Forest Model
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
print("Random Forest model trained successfully.")

# Step 3.6: Evaluate the Model's Performance
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}\n")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Optionally, you can also display the individual metrics
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]  # True Positives
TN = cm[0, 0]  # True Negatives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives

# Calculate Precision, Recall, and F1-Score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("\nCustom Metrics Calculation:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")


  extracted_labels.append(labels[i + window_size - 1])  # Use the label for the last point in the window
Extracting features: 100%|██████████| 714388/714388 [14:47<00:00, 805.05it/s]


       mean       std       min       max    median       q25       q75  \
0 -0.096964 -0.160655  0.159574 -0.182064  0.069596  0.219435 -0.152055   
1 -0.095925 -0.160655  0.159574 -0.182064  0.070364  0.219712 -0.152055   
2 -0.094396 -0.160723  0.159574 -0.182064  0.070919  0.219917 -0.152055   
3 -0.092425 -0.160891  0.159574 -0.182064  0.070919  0.220217 -0.152055   
4 -0.090097 -0.161166  0.159574 -0.182064  0.070919  0.220703 -0.152055   

        rms  skewness  kurtosis  label  
0 -0.170135 -1.682172  0.096769      0  
1 -0.170174 -1.690388  0.104590      0  
2 -0.170296 -1.704314  0.120952      0  
3 -0.170529 -1.722608  0.145668      0  
4 -0.170878 -1.743082  0.175753      0  
Training set size: 571510 samples
Testing set size: 142878 samples
Random Forest model trained successfully.
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    134771
           1       1.00      1.00      1.00   

In [12]:
# Get predicted probabilities
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Adjust the threshold
threshold = 0.7  # Change this value to your desired threshold
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Evaluate the adjusted predictions
accuracy_adjusted = accuracy_score(y_test, y_pred_adjusted)
precision_adjusted = precision_score(y_test, y_pred_adjusted)
recall_adjusted = recall_score(y_test, y_pred_adjusted)
f1_score_adjusted = f1_score(y_test, y_pred_adjusted)
confusion_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)

# Print the evaluation metrics
print(f"Adjusted Accuracy: {accuracy_adjusted:.2f}")
print(f"Adjusted Precision: {precision_adjusted:.2f}")
print(f"Adjusted Recall: {recall_adjusted:.2f}")
print(f"Adjusted F1 Score: {f1_score_adjusted:.2f}")
print("Adjusted Confusion Matrix:")
print(confusion_matrix_adjusted)



Adjusted Accuracy: 1.00
Adjusted Precision: 1.00
Adjusted Recall: 0.99
Adjusted F1 Score: 0.99
Adjusted Confusion Matrix:
[[134757     14]
 [    71   8036]]


In [10]:
import matplotlib.pyplot as plt
%matplotlib qt
plt.close('all')

# Assuming 'data_with_labels' is your DataFrame with 'value', 'label', and 'predictions'
data_with_labels['predictions'] = np.zeros(len(data_with_labels), dtype=int)  # Initialize with zeros
data_with_labels['predictions'][:len(y_pred)] = y_pred  # Assign predictions to the corresponding length

plt.figure(figsize=(15, 6))

# Plot the entire time series in blue
plt.plot(data_with_labels.index, data_with_labels['value'], color='blue', label='Creep Data', alpha=0.5)

# Highlight the actual event times in red
actual_event_times = data_with_labels.index[data_with_labels['label'] == 1]
plt.scatter(actual_event_times, 
            data_with_labels['value'][data_with_labels['label'] == 1],
            color='red', label='Actual Events', marker='o', s=50)  # Use markers for visibility

# Highlight the predicted event times in green
predicted_event_times = data_with_labels.index[data_with_labels['predictions'] == 1]
plt.scatter(predicted_event_times, 
            data_with_labels['value'][data_with_labels['predictions'] == 1],
            color='green', label='Predicted Events', marker='o', s=50,alpha=0.5)  # Use markers for visibility

# Add labels and title
plt.xlabel('Time')
plt.ylabel('Creep Value')
plt.title('Time Series with Highlighted Actual and Predicted Event Times')
plt.legend()
plt.grid()

# Show the plot
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_labels['predictions'][:len(y_pred)] = y_pred  # Assign predictions to the corresponding length


In [13]:
import pandas as pd
import numpy as np
import h5py
import creep_event_picker as cep
from sklearn.preprocessing import StandardScaler
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

# Load the HDF5 data
with h5py.File('../../Data/DATA_tidied/HDF5/CWN1.h5', 'r') as hdf:
    # Access a specific dataset
    dataset = hdf['CWN1_10.0mins']
    slip = dataset['Slip_CWN1_10.0mins'][:]
    time = dataset['Time_CWN1_10.0mins'][:]
    
    # Decode the time data
    decoded_time = [byte_str.decode('utf-8') for byte_str in time]
    decoded_time = pd.to_datetime(decoded_time)

    # Interpolate data
    tm_int, creep_int, upsampled = cep.interpolate(decoded_time, slip, 10)
    sos = scipy.signal.butter(4,[1/7200,1/120], 'band',output='sos',fs=0.10)  # bandpass filter for 2hrs and 5days
    creep_data  = scipy.signal.sosfiltfilt(sos, creep_int)  # filter the data
    time_series_data = pd.Series(creep_data, index=pd.to_datetime(tm_int))

# Load the event catalogue
event_catalogue = pd.read_csv('../../Data/all_creep_event_picks_new_qc_Oct_02_2024.csv', index_col=0)
event_catalogue.drop(event_catalogue[event_catalogue['Creepmeter_abbrv'] != 'CWN1'].index, inplace=True)
event_catalogue.drop(event_catalogue[event_catalogue['File_code'] != 'cwn_0'].index, inplace=True)
event_catalogue.reset_index(inplace=True, drop=True)
event_catalogue['start_time'] = pd.to_datetime(event_catalogue['ST'])
event_catalogue['end_time'] = pd.to_datetime(event_catalogue['ET'])

# Initialize a labels array with zeros
labels = np.zeros(len(time_series_data), dtype=int)

# Label the data based on event start and end times
for _, row in event_catalogue.iterrows():
    start_time = row['start_time']
    end_time = row['end_time']
    
    # Find the start index for the event
    start_index = time_series_data.index.searchsorted(start_time, side='left')
    # Find the end index for the event (add one to include the last point in the event)
    end_index = time_series_data.index.searchsorted(end_time, side='right') - 1
    
    # Ensure indices are within bounds
    start_index = min(start_index, len(time_series_data) - 1)
    end_index = min(end_index, len(time_series_data) - 1)
    
    # Label the range between start and end indices
    if start_index <= end_index:
        labels[start_index:end_index + 1] = 1  # +1 to include the end index

# Combine into a DataFrame if needed
data_with_labels = pd.DataFrame({
    'value': time_series_data,
    'label': labels
})

def extract_features(time_series, labels, window_size):
    features = []
    extracted_labels = []

    # Iterate through the time series data using a sliding window
    for i in range(len(time_series) - window_size + 1):
        window = time_series[i:i + window_size]
        
        # Calculate features for the current window
        mean = window.mean()
        std = window.std()
        min_val = window.min()
        max_val = window.max()
        median = window.median()
        q25 = np.percentile(window, 25)
        q75 = np.percentile(window, 75)
        
        features.append([mean, std, min_val, max_val, median, q25, q75])
        
        # Access labels using .iloc to avoid the FutureWarning
        extracted_labels.append(labels.iloc[i + window_size - 1])  # Use the label for the last point in the window

    # Convert to numpy arrays
    features = np.array(features)
    extracted_labels = np.array(extracted_labels)

    return features, extracted_labels

# Define window size
window_size = 432  # Example: 3 days worth of data

# Extract features
X, y = extract_features(data_with_labels['value'], data_with_labels['label'], window_size)

# Optionally, standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Combine features and labels into a DataFrame
features_df = pd.DataFrame(X_scaled, columns=['mean', 'std', 'min', 'max', 'median', 'q25', 'q75'])
features_df['label'] = y

# Step 2.4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3.5: Train a Random Forest Model
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get predicted probabilities
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Adjust the threshold
threshold = 0.7  # Change this value to your desired threshold
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Evaluate the adjusted predictions
accuracy_adjusted = accuracy_score(y_test, y_pred_adjusted)
precision_adjusted = precision_score(y_test, y_pred_adjusted)
recall_adjusted = recall_score(y_test, y_pred_adjusted)
f1_score_adjusted = f1_score(y_test, y_pred_adjusted)
confusion_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)

# Print the evaluation metrics
print(f"Adjusted Accuracy: {accuracy_adjusted:.2f}")
print(f"Adjusted Precision: {precision_adjusted:.2f}")
print(f"Adjusted Recall: {recall_adjusted:.2f}")
print(f"Adjusted F1 Score: {f1_score_adjusted:.2f}")
print("Adjusted Confusion Matrix:")
print(confusion_matrix_adjusted)

# Analyze the confusion matrix
TP = confusion_matrix_adjusted[1, 1]  # True Positives
TN = confusion_matrix_adjusted[0, 0]  # True Negatives
FP = confusion_matrix_adjusted[0, 1]  # False Positives
FN = confusion_matrix_adjusted[1, 0]  # False Negatives

print(f"True Positives: {TP}")
print(f"True Negatives: {TN}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")

# Visualize the confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_adjusted, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['No Event', 'Event'], yticklabels=['No Event', 'Event'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Adjusted)')
plt.show()


Adjusted Accuracy: 1.00
Adjusted Precision: 1.00
Adjusted Recall: 0.99
Adjusted F1 Score: 0.99
Adjusted Confusion Matrix:
[[278355     31]
 [   231  23849]]
True Positives: 23849
True Negatives: 278355
False Positives: 31
False Negatives: 231


qt.qpa.backingstore: Back buffer dpr of 2 doesn't match <NSViewBackingLayer: 0x7fa8ffc1ec20> contents scale of 1 - updating layer to match.
qt.qpa.backingstore: Back buffer dpr of 1 doesn't match <NSViewBackingLayer: 0x7fa8ffc1ec20> contents scale of 2 - updating layer to match.
