# In [None]:
import glob, joblib, json, os, time, torch
import pandas as pd
import numpy as np
import torch.nn as nn
from datetime import datetime, timedelta
from geopy.distance import geodesic
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Pick the compute device: CUDA when available, otherwise the MPS backend, otherwise the CPU.
if torch.cuda.is_available():
	DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
	DEVICE = torch.device("mps")
else:
	DEVICE = torch.device("cpu")

# Project layout: raw/processed data live under "data", checkpoints and metrics under "saves".
BASE_PATH = os.path.expanduser("~/Code/Quake")
DATA_PATH = os.path.join(BASE_PATH, "data")
SAVES_PATH = os.path.join(BASE_PATH, "saves")
DATABASE_FILE = os.path.join(DATA_PATH, "database.csv")
PROCESSED_DATABASE_FILE = os.path.join(DATA_PATH, "processed_database.npz")
METRICS_FILE = os.path.join(SAVES_PATH, "all.metrics")

# Labelling thresholds used by generate_labels:
# R is a distance in kilometres, I is a time window in seconds (30 days).
R = 100
I = 30 * 24 * 60 * 60

def check_repository():
	"""Prepare the on-disk layout the rest of the script relies on.

	Creates the data and saves folders when missing, verifies that the raw
	database CSV exists, and initialises the metrics file with an empty JSON
	list when it is absent or empty.

	Raises:
		FileNotFoundError: if DATABASE_FILE does not exist.
	"""
	os.makedirs(DATA_PATH, exist_ok=True)
	os.makedirs(SAVES_PATH, exist_ok=True)
	if not os.path.isfile(DATABASE_FILE):
		# Name the missing path so the failure is actionable; FileNotFoundError
		# is still an Exception, so broad handlers keep working.
		raise FileNotFoundError(f"Earthquake database not found: {DATABASE_FILE}")
	# The metrics file is later parsed with json.load, so it must always hold valid JSON.
	if not os.path.isfile(METRICS_FILE) or os.path.getsize(METRICS_FILE) == 0:
		with open(METRICS_FILE, "w") as f:
			f.write("[]")
	print("Checked repository")

class EarthquakeDataset(Dataset):
	"""Dataset pairing a feature array with a label array, both held as float32 tensors on DEVICE."""

	def __init__(self, X, y):
		# Convert once up front so batches are already on the target device.
		features = torch.tensor(X, dtype=torch.float32, device=DEVICE)
		labels = torch.tensor(y, dtype=torch.float32, device=DEVICE)
		self.X, self.y = features, labels

	def __len__(self):
		"""Return the number of samples (length of the feature tensor)."""
		return len(self.X)

	def __getitem__(self, i):
		"""Return the (features, label) pair stored at index `i`."""
		return self.X[i], self.y[i]

def generate_labels(df):
	"""Label each event 1.0 if a later event occurs within I seconds and R km of it, else 0.0.

	Args:
		df: frame with "Timestamp", "Latitude" and "Longitude" columns.

	Returns:
		A float32 NumPy array with one label per row of `df`.
	"""
	times = df["Timestamp"].values
	lats = df["Latitude"].values
	lons = df["Longitude"].values

	labels = np.zeros(len(times), dtype=np.float32)
	for idx, (t0, lat0, lon0) in enumerate(zip(times, lats, lons)):
		# Candidates: events strictly after this one and no later than I seconds afterwards.
		followers = np.flatnonzero((times > t0) & (times <= t0 + I))
		origin = (lat0, lon0)
		# any() stops at the first candidate that is close enough.
		if any(geodesic(origin, (lats[k], lons[k])).km <= R for k in followers):
			labels[idx] = 1
	return labels

def process_dataset():
	"""Return the feature matrix X and label vector y, building and caching them on first use.

	When PROCESSED_DATABASE_FILE exists its cached arrays are returned. Otherwise
	the raw CSV is read, restricted to rows whose "Type" is "Earthquake", rows with
	an unparseable date/time are dropped, the timestamp is converted to UNIX
	seconds, labels are generated, and the result is saved to the cache file.

	Returns:
		(X, y): X holds the columns Timestamp, Latitude, Longitude, Depth and
		Magnitude sorted by Timestamp; y holds the labels from generate_labels.
	"""
	if os.path.exists(PROCESSED_DATABASE_FILE):
		# np.load on an .npz returns a lazy archive that keeps the file open;
		# read it once and let the context manager close it.
		with np.load(PROCESSED_DATABASE_FILE) as cached:
			return cached["X"], cached["y"]

	df = pd.read_csv(DATABASE_FILE, sep=",", engine="python")

	# Select "Earthquake" rows
	earthquakes = df[df["Type"] == "Earthquake"]

	# Combine "Date" and "Time" columns into a datetime; non-standard values become NaT
	parsed = pd.to_datetime(
		earthquakes["Date"] + " " + earthquakes["Time"],
		format="%m/%d/%Y %H:%M:%S",
		errors="coerce"
	)

	# assign() returns a new frame, so no column is written into a slice of `df`.
	# Drop rows whose timestamp could not be parsed.
	earthquakes = earthquakes.assign(Timestamp=parsed).dropna(subset=["Timestamp"])

	# Convert to whole UNIX seconds independently of the datetime resolution
	# (Series.view is deprecated and "// 10**9" only holds for nanoseconds).
	unix_seconds = (earthquakes["Timestamp"] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
	earthquakes = earthquakes.assign(Timestamp=unix_seconds)

	# Select and sort "Timestamp", "Latitude", "Longitude", "Depth", and "Magnitude" columns
	earthquakes = earthquakes[["Timestamp", "Latitude", "Longitude", "Depth", "Magnitude"]].sort_values("Timestamp")

	X = earthquakes.values
	y = generate_labels(earthquakes)

	np.savez(PROCESSED_DATABASE_FILE, X=X, y=y)
	return X, y

class BaseModel(nn.Module):
	"""Common base for the models in this file: an nn.Module carrying a string identifier."""
	def __init__(self, id):
		# `id` shadows the builtin, but it is the public parameter name, so it is kept.
		# The value is also used as the checkpoint folder name under SAVES_PATH.
		super().__init__()
		self.id = id
	def forward(self, x):
		# Subclasses must provide the actual forward pass.
		raise NotImplementedError()

class MLPModel(BaseModel):
	"""Fully connected binary classifier.

	Stacks `layers` blocks of Linear -> (optional BatchNorm1d) -> ReLU -> Dropout
	and finishes with a single-output Linear layer.

	Args:
		input_dimensions: number of input features.
		hidden_dimensions: width of every hidden layer.
		layers: number of hidden blocks; 0 yields a single Linear(input_dimensions, 1).
		dropout_rate: dropout probability applied after each hidden block.
		is_batch_normalised: insert BatchNorm1d after each hidden Linear when true.
	"""
	def __init__(self, input_dimensions, hidden_dimensions, layers, dropout_rate, is_batch_normalised):
		super().__init__(f"MLP({input_dimensions}-{hidden_dimensions}-{layers}-{dropout_rate}-{is_batch_normalised})")

		layer_list = []
		# Track the width feeding the next Linear so the output layer still
		# matches the input when there are no hidden blocks (layers == 0).
		width = input_dimensions
		for _ in range(layers):
			layer_list.append(nn.Linear(width, hidden_dimensions))
			if is_batch_normalised:
				layer_list.append(nn.BatchNorm1d(hidden_dimensions))
			layer_list.append(nn.ReLU())
			layer_list.append(nn.Dropout(dropout_rate))
			width = hidden_dimensions
		layer_list.append(nn.Linear(width, 1))

		self.network = nn.Sequential(*layer_list)

	def forward(self, x):
		"""Return sigmoid outputs with the trailing singleton dimension removed."""
		return torch.sigmoid(self.network(x)).squeeze(-1)

class TranEncDecModel(BaseModel):
	"""Transformer encoder followed by a two-layer feed-forward head.

	The input is treated as a length-1 sequence whose embedding size is
	`input_dimensions`. `is_batch_normalised` only appears in the model id;
	no normalisation layer is created from it here.
	"""
	def __init__(self, input_dimensions, hidden_dimensions, layers, heads, dropout_rate, is_batch_normalised):
		model_id = f"TranEncDec({input_dimensions}-{hidden_dimensions}-{layers}-{heads}-{dropout_rate}-{is_batch_normalised})"
		super().__init__(model_id)

		# One encoder block, replicated `layers` times by TransformerEncoder.
		block = nn.TransformerEncoderLayer(
			d_model=input_dimensions,
			nhead=heads,
			dim_feedforward=hidden_dimensions,
			dropout=dropout_rate,
			batch_first=True
		)
		self.encoder = nn.TransformerEncoder(block, num_layers=layers)

		# Head mapping the encoded features to a single output value.
		head = [
			nn.Linear(input_dimensions, hidden_dimensions),
			nn.ReLU(),
			nn.Linear(hidden_dimensions, 1)
		]
		self.decoder = nn.Sequential(*head)

	def forward(self, x):
		"""Return sigmoid outputs with the trailing singleton dimension removed."""
		# Insert a sequence axis at position 1 for the encoder, then remove it again.
		as_sequence = torch.unsqueeze(x, 1)
		encoded = torch.squeeze(self.encoder(as_sequence), 1)
		logits = self.decoder(encoded)
		return logits.sigmoid().squeeze(-1)

class ConvTranEncDecModel(BaseModel):
	"""Two 1-D convolutions, a Transformer encoder, and a two-layer feed-forward head.

	The convolutions run over the input with a single input channel; their
	`conv_filters` output channels become the encoder's embedding size.
	`input_dimensions` and `is_batch_normalised` only appear in the model id.
	"""
	def __init__(self, input_dimensions, conv_filters, kernel_size, hidden_dimensions, layers, heads, dropout_rate, is_batch_normalised):
		model_id = f"ConvTranEncDec({input_dimensions}-{conv_filters}-{kernel_size}-{hidden_dimensions}-{layers}-{heads}-{dropout_rate}-{is_batch_normalised})"
		super().__init__(model_id)

		pad = kernel_size // 2
		conv_stack = [
			nn.Conv1d(in_channels=1, out_channels=conv_filters, kernel_size=kernel_size, padding=pad),
			nn.ReLU(),
			nn.Conv1d(in_channels=conv_filters, out_channels=conv_filters, kernel_size=kernel_size, padding=pad),
			nn.ReLU()
		]
		self.conv = nn.Sequential(*conv_stack)

		# One encoder block, replicated `layers` times by TransformerEncoder.
		block = nn.TransformerEncoderLayer(
			d_model=conv_filters,
			nhead=heads,
			dim_feedforward=hidden_dimensions,
			dropout=dropout_rate,
			batch_first=True
		)
		self.encoder = nn.TransformerEncoder(block, num_layers=layers)

		head = [
			nn.Linear(conv_filters, hidden_dimensions),
			nn.ReLU(),
			nn.Linear(hidden_dimensions, 1)
		]
		self.decoder = nn.Sequential(*head)

	def forward(self, x):
		"""Return sigmoid outputs with the trailing singleton dimension removed."""
		# Add the single channel axis expected by the first Conv1d.
		channelled = torch.unsqueeze(x, 1)
		# Swap axes 1 and 2 so the convolution channels sit last, as the encoder's embedding axis.
		tokens = torch.transpose(self.conv(channelled), 1, 2)
		# Average the encoder output over axis 1 before the head.
		pooled = self.encoder(tokens).mean(dim=1)
		logits = self.decoder(pooled)
		return logits.sigmoid().squeeze(-1)

# Registry mapping the "model_type" string of a model specification to its class.
MODEL_CLASSES = dict(
	MLP=MLPModel,
	TranEncDec=TranEncDecModel,
	ConvTranEncDec=ConvTranEncDecModel,
)

def instantiate_model(model_type, **kwargs):
	"""Build the model registered under `model_type` with `kwargs` and move it to DEVICE.

	Raises:
		KeyError: if `model_type` is not a key of MODEL_CLASSES.
	"""
	model_class = MODEL_CLASSES[model_type]
	model = model_class(**kwargs)
	return model.to(DEVICE)

def save_model_weights_and_losses(model, epoch, train_loss, val_loss):
	"""Write the model's state dict and its losses for `epoch` into the model's save folder.

	Produces "epoch-<epoch>.pth" (weights) and "epoch-<epoch>.losses" (JSON with
	"train" and "val" keys) under SAVES_PATH/<model.id>, creating the folder if needed.
	"""
	target_dir = os.path.join(SAVES_PATH, model.id)
	os.makedirs(target_dir, exist_ok=True)

	# Both files share the same stem and differ only in extension.
	stem = os.path.join(target_dir, f"epoch-{epoch}")

	torch.save(model.state_dict(), stem + ".pth")

	losses = dict(train=train_loss, val=val_loss)
	with open(stem + ".losses", "w") as f:
		json.dump(losses, f)

def load_model_weights(model, epoch):
	"""Load a checkpoint into `model` and return the epoch number to continue from.

	Args:
		model: model whose `id` names the checkpoint folder under SAVES_PATH.
		epoch: epoch whose checkpoint to load, or None for the most recently
			modified checkpoint file.

	Returns:
		1 when the folder contains no ".pth" files (nothing is loaded);
		otherwise the loaded epoch + 1, never less than 1.
	"""
	folder = os.path.join(SAVES_PATH, model.id)
	checkpoints = glob.glob(os.path.join(folder, "*.pth"))
	# Oldest first, so the last entry is the most recently written file.
	checkpoints.sort(key=os.path.getmtime)

	if len(checkpoints) == 0:
		return 1

	if epoch is None:
		chosen = checkpoints[-1]
		# File names look like "epoch-<n>.pth"; recover <n>.
		filename = os.path.basename(chosen)
		epoch = int(filename.split("-")[1].split(".")[0])
	else:
		chosen = os.path.join(folder, f"epoch-{epoch}.pth")

	state = torch.load(chosen, map_location=DEVICE)
	model.load_state_dict(state)

	return max(epoch + 1, 1)

def train_model(model, L_train, L_val, epoch, epochs):
	"""Train `model` for up to `epochs` epochs with Adam and binary cross-entropy loss.

	Training resumes from a saved checkpoint when load_model_weights finds one.
	After every epoch the weights and the per-sample average train/validation
	losses are saved. Training stops early once the validation loss has been
	above the best value seen for 5 consecutive epochs.

	Args:
		model: model to train; its outputs are passed to nn.BCELoss.
		L_train: DataLoader yielding (features, labels) training batches.
		L_val: DataLoader yielding (features, labels) validation batches.
		epoch: checkpoint epoch to resume from, or None for the latest checkpoint.
		epochs: maximum number of epochs to run in this call.
	"""
	# Try to load model weights
	initial_epoch = load_model_weights(model, epoch)
	# range() excludes its stop value, so this runs exactly `epochs` epochs.
	final_epoch = initial_epoch + epochs
	last_epoch = final_epoch - 1

	optimizer = torch.optim.Adam(model.parameters())
	criterion = nn.BCELoss()

	lowest_loss, patience_limit, patience_counter = float("inf"), 5, 0

	# A separate loop variable keeps the `epoch` argument from being overwritten.
	for current_epoch in range(initial_epoch, final_epoch):
		model.train()
		train_loss = 0
		for X_batch, y_batch in L_train:
			optimizer.zero_grad()

			y_predictions = model(X_batch)
			loss = criterion(y_predictions, y_batch)
			loss.backward()
			optimizer.step()
			# BCELoss returns the batch mean; weight by batch size so the final
			# division yields a per-sample average even with a short last batch.
			train_loss += loss.item() * X_batch.size(0)
		train_loss /= len(L_train.dataset)

		model.eval()
		val_loss = 0
		with torch.no_grad():
			for X_batch, y_batch in L_val:
				y_predictions = model(X_batch)
				loss = criterion(y_predictions, y_batch)
				val_loss += loss.item() * X_batch.size(0)
		val_loss /= len(L_val.dataset)

		# Save model weights and losses
		save_model_weights_and_losses(model, current_epoch, train_loss, val_loss)

		# Print model training progress
		print(f"{model.id} @ epoch {current_epoch} of {last_epoch}: train_loss = {train_loss:.7f}, val_loss = {val_loss:.7f}")

		# Try early stopping
		if val_loss > lowest_loss:
			patience_counter += 1
			if patience_counter >= patience_limit:
				print(f"Early stopping at epoch {current_epoch}")
				break
		else:
			patience_counter, lowest_loss = 0, val_loss

def evaluate_model(model, L_unseen):
	"""Return the fraction of samples in `L_unseen` that the model classifies correctly.

	Model outputs of at least 0.5 count as class 1, everything else as class 0.
	The model is switched to evaluation mode and gradients are disabled.
	"""
	model.eval()
	hits = seen = 0
	with torch.no_grad():
		for features, targets in L_unseen:
			# Threshold the outputs at 0.5 to get hard 0/1 predictions.
			predicted = model(features).ge(0.5).float()
			hits += predicted.eq(targets).sum().item()
			seen += targets.size(0)
	return hits / seen

def train_and_crossvalidate_model(model_specification, folds=1, epoch=None, epochs=1):
	"""Train a model from `model_specification`, score it on a held-out split, and log the score.

	20% of the data is held out as the unseen evaluation set. With folds > 1 a
	fresh model is trained per K-fold split of the remaining data and the
	accuracies on the unseen set are averaged; otherwise a single model is
	trained on all remaining data. The result is appended to METRICS_FILE.

	Args:
		model_specification: keyword arguments for instantiate_model, including "model_type".
		folds: number of cross-validation folds; values of 1 or less disable cross-validation.
		epoch: checkpoint epoch to resume from, or None for the latest checkpoint.
		epochs: number of epochs passed to train_model for each model.
	"""
	X, y = process_dataset()

	X_seen, X_unseen, y_seen, y_unseen = train_test_split(X, y, test_size=0.2, random_state=37)
	L_unseen = DataLoader(EarthquakeDataset(X_unseen, y_unseen), batch_size=32, shuffle=False)

	if folds > 1:
		kf = KFold(n_splits=folds, shuffle=True, random_state=37)
		score_list = []
		for fold, (i_train, i_val) in enumerate(kf.split(X_seen)):
			D_train = EarthquakeDataset(X_seen[i_train], y_seen[i_train])
			D_val = EarthquakeDataset(X_seen[i_val], y_seen[i_val])
			L_train = DataLoader(D_train, batch_size=32, shuffle=True)
			L_val = DataLoader(D_val, batch_size=32, shuffle=False)

			model = instantiate_model(**model_specification)
			model_id = model.id
			# Checkpoints are stored per model id. Give every fold its own sub-folder,
			# otherwise a later fold would resume from weights that an earlier fold
			# trained on this fold's validation rows.
			model.id = os.path.join(model_id, f"fold-{fold}")
			train_model(model, L_train, L_val, epoch, epochs)
			score_list.append(evaluate_model(model, L_unseen))
		score = sum(score_list) / len(score_list)
	else:
		# NOTE: the same data backs both loaders here, so the "validation" loss
		# that drives early stopping is measured on the training data.
		D_seen = EarthquakeDataset(X_seen, y_seen)
		L_train = DataLoader(D_seen, batch_size=32, shuffle=True)
		L_val = DataLoader(D_seen, batch_size=32, shuffle=False)

		model = instantiate_model(**model_specification)
		model_id = model.id
		train_model(model, L_train, L_val, epoch, epochs)
		score = evaluate_model(model, L_unseen)

	# Report under the base model id (without any fold suffix).
	print(f"id: {model_id}, score: {score}")
	with open(METRICS_FILE, "r+") as f:
		metrics = json.load(f)
		metrics.append({
			"id": model_id,
			"score": score,
			"folds": folds,
			"time": time.time()
		})
		# Rewrite the file in place: rewind, dump, then cut off any leftover tail.
		f.seek(0)
		json.dump(metrics, f, indent=2)
		f.truncate()

# Make sure the folders, the raw database and the metrics file are in place before training.
check_repository()

# Each specification holds the keyword arguments forwarded to instantiate_model.
model_specification_1 = dict(
	model_type="MLP",
	input_dimensions=5,
	hidden_dimensions=128,
	layers=16,
	dropout_rate=0.1,
	is_batch_normalised=True,
)
model_specification_2 = dict(
	model_type="TranEncDec",
	input_dimensions=5,
	hidden_dimensions=128,
	layers=8,
	heads=5,
	dropout_rate=0.05,
	is_batch_normalised=True,
)
model_specification_3 = dict(
	model_type="ConvTranEncDec",
	input_dimensions=5,
	conv_filters=12,  # Ensure "conv_filters" is divisible by "heads"
	kernel_size=3,
	hidden_dimensions=128,
	layers=6,
	heads=4,
	dropout_rate=0.05,
	is_batch_normalised=True,
)

# Train the MLP with 2-fold cross-validation, up to 50 epochs per fold.
train_and_crossvalidate_model(model_specification_1, folds=2, epochs=50)