# Load libraries

In [None]:
import numpy as np
import pandas as pd
import primesieve as ps
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, matthews_corrcoef

from dataloader import read_dataset
from file_ops import get_LMFDB_all_file_names

# Create test curves dataset

In [None]:
# Load all LMFDB curves. Only their conductors are used here: they are
# collected into a set so that matching curves can be dropped from the
# custom dataset.
# NOTE(review): the second argument (10) sits where the prime bound is passed
# for the custom dataset; presumably it is kept small because the ap-s are
# not needed for this step - confirm against read_dataset.

data_all_LMFDB, _ = read_dataset(
    get_LMFDB_all_file_names(),
    10,
    load_reduced_metadata=True,
)
conductors_LMFDB = {curve['conductor'] for curve in data_all_LMFDB}


In [None]:
# Load the custom dataset (curve metadata plus the ap-s array) and keep only
# the curves whose conductor is at most 10^8 and is not an LMFDB conductor.

num_exp = 1000  # primes p < num_exp
data_all, data_qexp = read_dataset(
    ["od_Matije/curves_r01triv", "od_Matije/curves_svi"],
    num_exp,
)
conductors = [curve['conductor'] for curve in data_all]

# Positions (into data_all / rows of data_qexp) of the curves to keep.
idxs = [
    position
    for position, conductor in enumerate(conductors)
    if conductor <= 100_000_000 and conductor not in conductors_LMFDB
]

# Select the matching rows of the ap-s array first, while idxs is still an
# ordered list; afterwards turn it into a set for fast membership tests.
data_qexp = data_qexp[idxs]
idxs = set(idxs)
data_all = [
    curve
    for position, curve in enumerate(tqdm(data_all, leave=False))
    if position in idxs
]

# Recompute the conductors for the filtered curves and make sure the metadata
# and the ap-s array still describe the same number of curves.
conductors = [curve['conductor'] for curve in data_all]
assert len(conductors) == data_qexp.shape[0]
len(conductors)

In [None]:
# Write the conductors and the ap-s of the last N selected curves to disk:
# one conductor per line in conductors.txt, one comma-separated row of
# integers per curve in aps.txt.

N = 10000  # number of curves to test on

file_content = "\n".join(map(str, conductors[-N:]))
with open('conductors.txt', 'w') as out_file:
    out_file.write(file_content)

np.savetxt('aps.txt', data_qexp[-N:], fmt='%d', delimiter=',')

In [None]:
# Optional sanity check (this cell can be skipped): reload both files and
# verify that they match the in-memory conductors and ap-s exactly.

loaded_conductors = pd.read_csv('conductors.txt', names=['conductor'])
assert loaded_conductors['conductor'].tolist() == conductors[-N:]

# One column per prime returned by ps.primes(num_exp), named p2, p3, p5, ...
prime_columns = [f'p{prime}' for prime in ps.primes(num_exp)]
loaded_data_qexp = pd.read_csv('aps.txt', names=prime_columns).to_numpy()
assert np.array_equal(loaded_data_qexp, data_qexp[-N:])


# Run deepellrank

In [None]:
# Run deepellrank from the command line (outside this notebook).
# The cells below read its predictions from 'predicted_ranks.txt'.

# After running deepellrank - plot confusion matrix

In [None]:
# True ranks of the last N selected curves, taken from the 'rank' field of
# the custom dataset metadata (same slice as the saved conductors and ap-s).

true_ranks = [curve['rank'] for curve in data_all[-N:]]

In [None]:
# Read the predicted ranks (one value per line) into a plain list and check
# that there is exactly one prediction per test curve.

predicted_ranks = list(
    pd.read_csv('predicted_ranks.txt', names=['rank'])['rank']
)
assert len(predicted_ranks) == len(true_ranks)

In [None]:
# Matthews correlation coefficient between the true and the predicted ranks.
matthews_corrcoef(y_true=true_ranks, y_pred=predicted_ranks)

In [None]:
# Build the confusion matrix and express every cell as a percentage of all
# test curves. Rows correspond to true ranks, columns to predicted ranks.

# Every rank value that occurs among the true or the predicted ranks, sorted.
# The labels are computed once and passed to confusion_matrix explicitly so
# that the DataFrame axes are guaranteed to match the matrix rows/columns,
# instead of relying on the function's default label ordering.
rank_labels = sorted(set(true_ranks) | set(predicted_ranks))

cf_matrix = confusion_matrix(true_ranks, predicted_ranks, labels=rank_labels)
print(cf_matrix.shape)
df_cm = pd.DataFrame(
    cf_matrix / np.sum(cf_matrix) * 100,
    index=rank_labels,
    columns=rank_labels,
)

df_cm