# Plan

1. Research earthquakes, research common earthquake classification methods
2. Get oriented with data, see if we can find any patterns
3. Decide on ML or non-ML approach

## ML approach plan options:

CNN (convolutional neural network)
- feed the model all the different times in each dataset, labeled as "arrival" or "not arrival"
- use the relationship between the velocities of different time steps to form a model classifying "arrival" or not.

In [101]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
from obspy import read
import math

In [3]:
dir_name = 'data/lunar/training'
catalog_name = 'apollo12_catalog_GradeA_final.csv'

In [5]:
# get catalog dataframe
catalog_df = pd.read_csv(os.path.join(dir_name + '/catalogs/', catalog_name))
catalog_df = catalog_df.drop(catalog_df.index[20]).reset_index(drop=True)
# print(catalog_df.head())
# get list of files
arrival_time_catalog = catalog_df['time_rel(sec)']
filename_catalog = catalog_df['filename']
# make_plot(filename_catalog, arrival_time_catalog)

In [4]:
# make the plot for a csv file
def make_plot(namelist, arrival_time):
  fig, ax = plt.subplots(5, 1, figsize=(10, 20))
  # go through the catalog
  for i in range(len(namelist.head())):
    # read the csv and put it into a dataframe
    df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA', namelist[i] + '.csv'))
    # put both the times and data columns into respective lists
    times = np.array(df['time_rel(sec)'].tolist())
    data = np.array(df['velocity(m/s)'].tolist())
    # make a plot of time vs velocity
    ax[i].plot(times, data)
    ax[i].set_xlabel('Time (s)')
    ax[i].set_ylabel('Velocity (m/s)')
    # add a red line for the correct arrival time
    ax[i].axvline(x = arrival_time[i], color='red',label='Rel. Arrival')
    arrival_line = ax[i].axvline(x=arrival_time[i], c='red', label='Abs. Arrival')
    ax[i].legend(handles=[arrival_line])

In [None]:
# make_plot(filename_catalog[2])

# Machine Learning Section - LR


pairs to be fixed:
7,8
21,22
35,36
47,48
66,67

# New Version

Version 3

In [81]:
# split a single csv into windows
def makeWindows(df, arrival_times, b_bound = 4, f_bound = 8, jump = 50):
  arrival_indices = []
  # go through each arrival time
  for arrival_time in arrival_times:
    # convert the relative times to a list
    time_list = df['time_rel(sec)'].tolist()
    # go through each time and check if it is close to our current arrival time
    for i in range(len(time_list)):
      if math.isclose(time_list[i], arrival_time, abs_tol=0.1):
        arrival_indices += [i]
        break
  
  velocities = df['velocity(m/s)'].tolist()

  result = []
  
  for offset in range(0, jump):
    f_pointer = (b_bound + f_bound) * jump + offset
    result = result + [[velocities[0: f_pointer:jump], False]]
    f_pointer += jump


    while f_pointer < len(velocities):
      target = f_pointer - (f_bound * jump)
      classification = False
      if target in arrival_indices:
        classification = True
      result.append([np.concatenate((np.array(result[-1][0][1:]), np.float64(velocities[f_pointer])), axis=None), classification])
      f_pointer += jump

  return result

In [82]:
def make_many_windows(count = catalog_df.shape[0], jumps = 50):
  windows_df = pd.DataFrame(columns=['window', 'classification'])
  # keep track of our duplicate csvs (multiple arrival times)
  duplicates = {7:8, 21:22, 35:36, 47:48, 66:67}
  # go through the amount of csv files we want to look at
  for i in range(count):
    if i in duplicates.values():
      continue

    arrival_times = [catalog_df.loc[i, 'time_rel(sec)']]
    if i in duplicates.keys():
      arrival_times.append(catalog_df.loc[duplicates[i], 'time_rel(sec)'])

    filename = catalog_df.loc[i, 'filename']
    csv_df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA/', filename + '.csv'))
    new_windows = pd.DataFrame(makeWindows(csv_df, arrival_times, jump = jumps), columns=['window', 'classification'])
    #print(new_windows)
    windows_df = pd.concat([windows_df, new_windows])
  return windows_df



In [84]:
# cProfile.run("make_one_window()")
windows_df = make_many_windows(19, jumps=200)

In [None]:
temp_df = windows_df.query('classification == True')
print(temp_df.head(10))
print(temp_df.iloc[0,0])

In [None]:
windows_df

# Model Training

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import RandomUnderSampler


In [None]:
df = windows_df
df

In [None]:
df = windows_df
df_zeros = df[df['classification'] == False]
df_zeros = df_zeros.sample(frac=0.0002, random_state=42)
df_ones = df[df['classification'] == True]
df_pruned = pd.concat([df_zeros, df_ones])
df_pruned
df = df_pruned
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)
print(df)


In [None]:
#smote = SMOTE(sampling_strategy='minority')
#X_resampled, y_resampled = smote.fit_resample(X_clean, y)
# undersampler = RandomUnderSampler(sampling_strategy='majority')
# X_resampled, y_resampled = undersampler.fit_resample(X_clean, y)

In [None]:
X_clean = pd.DataFrame(df['window'].tolist())
X_clean = X_clean.dropna(axis = 1, how = 'any')
y = df['classification'].astype(int)
X_clean

In [99]:
# Takes 5 seconds to run
class_weights = {0: 1, 1: 540000}
model = LogisticRegression(max_iter=100, class_weight='balanced')
model.fit(X_clean, y)

In [155]:
# Takes 4 minutes to run
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_clean, y)

# Test Data Processing

### Using actual test data

In [90]:
test_dir_name = 'data/lunar/test/S12_GradeB/'

### Using portion of training data

In [91]:
def make_test_windows(start = 30, count = catalog_df.shape[0], jumps = 50):
  test_windows_df = pd.DataFrame(columns=['window', 'classification'])

  duplicates = {7:8, 21:22, 35:36, 47:48, 66:67}
  for i in range(start, count):
    if i in duplicates.values():
      continue

    arrival_times = [catalog_df.loc[i, 'time_rel(sec)']]
    if i in duplicates.keys():
      arrival_times.append(catalog_df.loc[duplicates[i], 'time_rel(sec)'])

    filename = catalog_df.loc[i, 'filename']
    csv_df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA/', filename + '.csv'))
    new_windows = pd.DataFrame(makeWindows(csv_df, arrival_times, jump = jumps), columns=['window', 'classification'])
    test_windows_df = pd.concat([test_windows_df, new_windows])
  return test_windows_df

In [94]:
test_df = make_test_windows(start=30, count=catalog_df.shape[0], jumps=200)

In [102]:
test_df

Unnamed: 0,window,classification
0,"[1.406432122600214e-15, 1.8401397302403882e-17...",False
1,"[1.8401397302403882e-17, 1.5994157455475232e-1...",False
2,"[1.5994157455475232e-14, -8.791586538396387e-1...",False
3,"[-8.791586538396387e-13, -3.356882446658291e-1...",False
4,"[-3.356882446658291e-13, 9.112072540232316e-13...",False
...,...,...
570002,"[-1.5372765341245761e-12, 4.313554191251328e-1...",False
570003,"[4.313554191251328e-14, -1.417290681164798e-13...",False
570004,"[-1.417290681164798e-13, 1.3318165700826157e-1...",False
570005,"[1.3318165700826157e-12, -5.069384195598557e-1...",False


### Prepare it for the model

In [113]:
import gc
gc.collect()

6707

In [100]:
X_test_clean = pd.DataFrame(test_df['window'].tolist())
X_test_clean = X_test_clean.dropna(axis = 1, how = 'any')
y_test = test_df['classification'].astype(int)

Run the model on some data

In [101]:
predictions = model.predict(X_test_clean)
print(predictions)

accuracy = accuracy_score(y_test, predictions)
print(accuracy)

confusion = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions, zero_division=0)

print(confusion)
print(report)

[0 0 0 ... 0 0 0]
0.9999981020735227
[[23710041        0]
 [      45        0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00  23710041
           1       0.00      0.00      0.00        45

    accuracy                           1.00  23710086
   macro avg       0.50      0.50      0.50  23710086
weighted avg       1.00      1.00      1.00  23710086



In [118]:
def test_weight(weight):
  model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight = {0: 1, 1: weight})
  model.fit(X_clean, y)
  predictions = model.predict(X_test_clean)
  accuracy = accuracy_score(y_test, predictions)
  return accuracy

In [122]:
def search_weights():
    l = 1000
    r = 5402340
    while True:
      mid = (r-l) // 2 + l
      print(mid)
      accuracy = test_weight(mid)
      print(accuracy)
      if accuracy >= 0.9999983091356555:
          l = mid
      elif accuracy <= 1.690864344489938e-06:
          r = mid
      else:
        print(accuracy)
        return mid

In [None]:
search_weights()