# Plan

1. Research earthquakes, research common earthquake classification methods
2. Get oriented with data, see if we can find any patterns
3. Decide on ML or non-ML approach

## ML approach plan options:

CNN (convolutional neural network)
- feed the model all the different times in each dataset, labeled as "arrival" or "not arrival"
- use the relationship between the velocities of different time steps to form a model classifying "arrival" or not.

In [101]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
from obspy import read
import math

In [3]:
dir_name = 'data/lunar/training'
catalog_name = 'apollo12_catalog_GradeA_final.csv'

In [5]:
# get catalog dataframe
catalog_df = pd.read_csv(os.path.join(dir_name + '/catalogs/', catalog_name))
catalog_df = catalog_df.drop(catalog_df.index[20]).reset_index(drop=True)
# print(catalog_df.head())
# get list of files
arrival_time_catalog = catalog_df['time_rel(sec)']
filename_catalog = catalog_df['filename']
# make_plot(filename_catalog, arrival_time_catalog)

In [4]:
# make the plot for a csv file
def make_plot(namelist, arrival_time):
  fig, ax = plt.subplots(5, 1, figsize=(10, 20))
  # for i in range(len(namelist)):
  for i in range(len(namelist.head())):
    df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA', namelist[i] + '.csv'))
    times = np.array(df['time_rel(sec)'].tolist())
    data = np.array(df['velocity(m/s)'].tolist())
    ax[i].plot(times, data)
    ax[i].set_xlabel('Time (s)')
    ax[i].set_ylabel('Velocity (m/s)')
    ax[i].axvline(x = arrival_time[i], color='red',label='Rel. Arrival')
    arrival_line = ax[i].axvline(x=arrival_time[i], c='red', label='Abs. Arrival')
    ax[i].legend(handles=[arrival_line])

In [None]:
# make_plot(filename_catalog[2])

# Machine Learning Section - LSTM


pairs to be fixed:
7,8
21,22
35,36
47,48
66,67

# New Version

Version 3

In [25]:
# split a csv into windows
def makeWindows(df, arrival_times, b_bound = 4, f_bound = 8, jump = 50):
  arrival_indices = []
  for arrival_time in arrival_times:
    time_list = df['time_rel(sec)'].tolist()
    for i in range(len(time_list)):
      if math.isclose(time_list[i], arrival_time, abs_tol=0.1):
        arrival_indices += [i]
        break
  velocities = df['velocity(m/s)'].tolist()

  result = []
  for offset in range(0, jump):
    f_pointer = (b_bound + f_bound) * jump + offset
    result = result + [[np.array(velocities[0: f_pointer:jump], dtype=np.float64), False]]
    f_pointer += jump


    while f_pointer < len(velocities):
      target = f_pointer - (f_bound * jump)
      classification = False
      if target in arrival_indices:
        classification = True
      result.append([result[-1][0][1:] + [np.float64(velocities[f_pointer])], classification])
      f_pointer += jump

  return result

In [26]:
def make_many_windows(count = catalog_df.shape[0], jumps = 50):
  windows_df = pd.DataFrame(columns=['window', 'classification'])

  duplicates = {7:8, 21:22, 35:36, 47:48, 66:67}
  for i in range(count):
    if i in duplicates.values():
      continue

    arrival_times = [catalog_df.loc[i, 'time_rel(sec)']]
    if i in duplicates.keys():
      arrival_times.append(catalog_df.loc[duplicates[i], 'time_rel(sec)'])

    filename = catalog_df.loc[i, 'filename']
    csv_df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA/', filename + '.csv'))
    new_windows = pd.DataFrame(makeWindows(csv_df, arrival_times, jump = jumps), columns=['window', 'classification'])
    windows_df = pd.concat([windows_df, new_windows])
  return windows_df



In [27]:
# cProfile.run("make_one_window()")
windows_df = make_many_windows(1, jumps=50)

In [39]:
temp_df = windows_df.query('classification == True')
print(temp_df.head(10))
print(temp_df.iloc[0,0])

                                                   window classification
433031  [-3.723378905903512e-10, 1.1130096336027927e-0...           True
230494  [-1.9635943705120223e-10, -2.8390161739232336e...           True
89760   [-1.1627175358013935e-10, -2.5693141420134256e...           True
172201  [2.9728677899237624e-10, 1.6676200219985537e-1...           True
315797  [5.330479132649831e-12, 2.9728208438875485e-10...           True
9061    [1.0751961955724324e-10, 5.833471001562563e-10...           True
547248  [-1.6997540722168103e-10, -3.872939765018343e-...           True
402841  [3.1980663250146814e-11, -1.3775161716234722e-...           True
463205  [3.2863412166777265e-10, -2.9877942713563827e-...           True
89601   [1.2690703919482512e-12, 1.6445242940405202e-1...           True
[-3.723378905903512e-10, 1.1130096336027927e-09, 6.722989909479168e-10, 1.3185041413415143e-09, 6.365836694219063e-10, -4.742486246977076e-10, 3.1071585010038037e-11, -6.122707572106535e-11, 3.6683

In [83]:
windows_df

Unnamed: 0,window,classification
0,"[-6.153278962788711e-14, -8.336084867827097e-1...",False
1,"[-8.336084867827097e-11, -6.078634411602454e-1...",False
2,"[-6.078634411602454e-13, -4.9056350943573936e-...",False
3,"[-4.9056350943573936e-12, 6.734761378977762e-1...",False
4,"[6.734761378977762e-13, 5.1810661491704145e-11...",False
...,...,...
512413,"[-8.594020693386302e-13, -2.748352309672148e-1...",False
512414,"[-2.748352309672148e-11, -5.319778636683391e-1...",False
512415,"[-5.319778636683391e-12, -6.608560491712518e-1...",False
512416,"[-6.608560491712518e-11, -5.383807286282601e-1...",False


# Model Training

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


In [99]:
df = windows_df

In [151]:
df = windows_df
df_zeros = df[df['classification'] == False]
df_zeros = df_zeros.sample(frac=0.002, random_state=42)
df_ones = df[df['classification'] == True]
df_pruned = pd.concat([df_zeros, df_ones])
df_pruned
df = df_pruned
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)
print(df)


(28724, 2)
                                                  window classification
0      [-4.744280111644515e-10, -3.622176849036385e-1...          False
1      [-2.351533704162514e-11, -3.422723919234207e-1...          False
2      [3.760155650258351e-10, -8.692728037330011e-11...          False
3      [-2.8437865106914194e-10, -3.290181756305877e-...          False
4      [3.876476576408145e-10, 2.0676025307341463e-10...          False
...                                                  ...            ...
28719  [-1.4963825742215512e-10, 2.7464203149162263e-...          False
28720  [2.4251462371373766e-10, 3.4854379372387736e-1...          False
28721  [1.4984208021467282e-12, 6.937223234403887e-13...          False
28722  [-2.4729974449889085e-10, -1.0664655996138768e...          False
28723  [-2.8319474047653144e-17, -6.138709402767033e-...          False

[28724 rows x 2 columns]


In [88]:
smote = SMOTE(sampling_strategy='minority')
X_resampled, y_resampled = smote.fit_resample(X_clean, y)

MemoryError: Unable to allocate 1.28 GiB for an array with shape (14347547, 12) and data type float64

In [103]:
# undersampler = RandomUnderSampler(sampling_strategy='majority')
# X_resampled, y_resampled = undersampler.fit_resample(X_clean, y)

In [152]:
X_clean = pd.DataFrame(df['window'].tolist())
X_clean = X_clean.dropna(axis = 1, how = 'any')
y = df['classification'].astype(int)
X_clean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-4.74428e-10,-3.622177e-10,9.693906e-11,8.962054e-09,-3.849657e-09,1.330253e-09,-2.363845e-09,-1.635753e-09,-5.776363e-10,-1.908682e-09,-4.081535e-10,7.691693e-11
1,-2.351534e-11,-3.422724e-10,-5.024346e-10,-1.532303e-10,-2.671384e-10,-6.627942e-12,-1.044759e-11,-2.605059e-10,-6.384307e-11,-3.265215e-10,-2.430051e-11,-2.972461e-10
2,3.760156e-10,-8.692728e-11,-1.317773e-11,2.487387e-10,8.978889e-11,-7.862555e-11,3.366925e-10,3.891794e-11,7.157164e-12,-1.461998e-10,-4.295024e-10,-1.073232e-11
3,-2.843787e-10,-3.290182e-12,-8.022313e-11,5.23158e-12,-1.523784e-10,9.656059e-12,-1.696746e-13,-5.936455e-14,3.971989e-12,-1.845395e-11,-5.337881e-11,2.225791e-10
4,3.876477e-10,2.067603e-10,-1.001798e-10,-1.967059e-10,6.136541e-12,-8.86879e-11,-7.83037e-12,4.478087e-10,-4.097279e-10,1.817991e-10,-1.131352e-10,6.503438e-11


In [153]:
# Takes 5 seconds to run
# {0: 1, 1: 533879}
class_weights = {0: 1, 1: 540000}
model = LogisticRegression(max_iter=100, class_weight='balanced')
model.fit(X_clean, y)
#model.fit(X_resampled, y_resampled)

In [155]:
# Takes 4 minutes to run
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_clean, y)

# Test Data Processing

### Using actual test data

In [16]:
test_dir_name = 'data/lunar/test/S12_GradeB/'

### Using portion of training data

In [17]:
def make_test_windows(start = 30, count = catalog_df.shape[0], jumps = 50):
  test_windows_df = pd.DataFrame(columns=['window', 'classification'])

  duplicates = {7:8, 21:22, 35:36, 47:48, 66:67}
  for i in range(start, count):
    if i in duplicates.values():
      continue

    arrival_times = [catalog_df.loc[i, 'time_rel(sec)']]
    if i in duplicates.keys():
      arrival_times.append(catalog_df.loc[duplicates[i], 'time_rel(sec)'])

    filename = catalog_df.loc[i, 'filename']
    csv_df = pd.read_csv(os.path.join(dir_name + '/data/S12_GradeA/', filename + '.csv'))
    new_windows = pd.DataFrame(makeWindows(csv_df, arrival_times, jump = jumps), columns=['window', 'classification'])
    test_windows_df = pd.concat([test_windows_df, new_windows])
  return test_windows_df

In [78]:
test_df = make_test_windows(start=30, count=catalog_df.shape[0], jumps=5000)

### Prepare it for the model

In [113]:
import gc
gc.collect()

6707

In [115]:
X_test_clean = pd.DataFrame(test_df['window'].tolist())
X_test_clean = X_test_clean.dropna(axis = 1, how = 'any')
y_test = test_df['classification'].astype(int)

Run the model on some data

In [154]:
predictions = model.predict(X_test_clean)
print(predictions)
# for i in predictions:
#   if i == 1: 
#     print("found a one!!")
accuracy = accuracy_score(y_test, predictions)
print(accuracy)
confusion = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions, zero_division=0)

print(confusion)
print(report)

[0 0 0 ... 0 0 0]
0.9999983091356555
[[21290850        0]
 [      36        0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00  21290850
           1       0.00      0.00      0.00        36

    accuracy                           1.00  21290886
   macro avg       0.50      0.50      0.50  21290886
weighted avg       1.00      1.00      1.00  21290886



In [118]:
def test_weight(weight):
  model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight = {0: 1, 1: weight})
  model.fit(X_clean, y)
  predictions = model.predict(X_test_clean)
  accuracy = accuracy_score(y_test, predictions)
  return accuracy

In [122]:
def search_weights():
    l = 1000
    r = 5402340
    while True:
      mid = (r-l) // 2 + l
      print(mid)
      accuracy = test_weight(mid)
      print(accuracy)
      if accuracy >= 0.9999983091356555:
          l = mid
      elif accuracy <= 1.690864344489938e-06:
          r = mid
      else:
        print(accuracy)
        return mid

In [None]:
search_weights()