In [5]:
# Read the following CSVs with pandas. They contain the same primary key in their first column; join them on this column and display the resulting dataframe.
import pandas as pd
# Suffixes appended to a run prefix to get its two output CSVs:
# index 0 -> per-sequence confidences, index 1 -> per-class evaluation metrics.
file_endings = ["_confidences.csv", "_evaluation_metrics_per_class.csv"]

# Folder that holds one sub-folder per run; the files inside a run folder use
# the folder's own name as their prefix, hence "<root>\<run>\<run>".
_retry_root = r"C:\Users\Dan\Desktop\CDC\Projects\dives\data\preproc\retry_071124"

d_full_m_full, d_full_m_combined, d_full_m_pol_only = (
    f"{_retry_root}\\{run_name}\\{run_name}"
    for run_name in (
        "LANL_minus_4293_curated_4293_model_curated_full",
        "LANL_minus_4293_curated_new_combined_modelnew_combined_model",
        "LANL_minus_4293_curated_new_pol_only_modelnew_pol_only_model",
    )
)

# each directory has files with the same name as the directory + the file endings


In [29]:
# Load both output CSVs (confidences, per-class evaluation metrics) for each of
# the three `d_full_*` runs. A file path is simply run prefix + file ending.
conf_ending, eval_ending = file_endings[0], file_endings[1]

# full model
d_full_m_full_conf = pd.read_csv(f"{d_full_m_full}{conf_ending}")
d_full_m_full_eval = pd.read_csv(f"{d_full_m_full}{eval_ending}")

# pol-only model
d_full_m_pol_conf = pd.read_csv(f"{d_full_m_pol_only}{conf_ending}")
d_full_m_pol_eval = pd.read_csv(f"{d_full_m_pol_only}{eval_ending}")

# combined model
d_full_m_combined_conf = pd.read_csv(f"{d_full_m_combined}{conf_ending}")
d_full_m_combined_eval = pd.read_csv(f"{d_full_m_combined}{eval_ending}")


In [30]:
# Goal: check whether the pol and full models, used together as two separate
# models, do better than a single combined model.
#
# Each model's confidences file has the columns
#   id, true_class, predicted_class, true_class_confidence,
#   predicted_class_confidence, top_3_confidences, position_of_true_class,
#   length, match?
# We build one row per sequence with the three models' predictions and
# "predicted_class_confidence" values side by side, with each column renamed
# after the model it came from (pol, full or combined).

# Keep only the columns we need. true_class is taken from the full-model frame
# only; the other two frames do not need to carry it.
# NOTE: these assignments overwrite the loaded frames, so re-running this cell
# without re-running the load cell raises KeyError ("predicted_class" is gone
# after the rename below).
d_full_m_full_conf = d_full_m_full_conf[["id", "true_class", "predicted_class", "predicted_class_confidence"]]
d_full_m_combined_conf = d_full_m_combined_conf[["id", "predicted_class", "predicted_class_confidence"]]
d_full_m_pol_conf = d_full_m_pol_conf[["id", "predicted_class", "predicted_class_confidence"]]

# Rename the columns to reflect the model they came from.
d_full_m_full_conf = d_full_m_full_conf.rename(columns={"predicted_class": "full_prediction", "predicted_class_confidence": "full_confidence"})
d_full_m_combined_conf = d_full_m_combined_conf.rename(columns={"predicted_class": "combined_prediction", "predicted_class_confidence": "combined_confidence"})
d_full_m_pol_conf = d_full_m_pol_conf.rename(columns={"predicted_class": "pol_prediction", "predicted_class_confidence": "pol_confidence"})

# Merge the three frames on "id". These are inner joins (the merge default), so
# an id missing from any one frame is dropped from the result.
full_combined_conf = d_full_m_full_conf.merge(d_full_m_combined_conf, on="id")
full_combined_conf = full_combined_conf.merge(d_full_m_pol_conf, on="id")

# Reorder the columns. The explicit .copy() makes this an independent frame, so
# adding columns below does not trigger SettingWithCopyWarning.
full_combined_conf = full_combined_conf[["id", "true_class", "pol_prediction", "full_prediction", "combined_prediction", "pol_confidence", "full_confidence", "combined_confidence"]].copy()

# True only when all three models predicted the same class (vectorized
# equivalent of the chained comparison pol == full == combined).
full_combined_conf["all agree?"] = (
    (full_combined_conf["pol_prediction"] == full_combined_conf["full_prediction"])
    & (full_combined_conf["full_prediction"] == full_combined_conf["combined_prediction"])
)

# Suppose we trust whichever of the full / pol models is more confident.
# The comparison is strict, so ties (and missing confidences) go to the pol model.
full_more_confident = full_combined_conf["full_confidence"] > full_combined_conf["pol_confidence"]
full_combined_conf["full_pol_seperate_max_guess"] = full_combined_conf["full_prediction"].where(
    full_more_confident, full_combined_conf["pol_prediction"]
)
full_combined_conf["max_guesser"] = full_more_confident.map({True: "full", False: "pol"})

# Title for the table shown below. The earlier `.style.set_caption(...)` call
# built a Styler and threw it away, so the caption was never displayed.
print("Dataset: 4,293 Full Genomes Curated.")
full_combined_conf

Unnamed: 0,id,true_class,pol_prediction,full_prediction,combined_prediction,pol_confidence,full_confidence,combined_confidence,all agree?,full_pol_seperate_max_guess,max_guesser
0,B.IIIB_LAI,B,83_cpx,B,B,0.085354,0.998493,0.999670,False,B,full
1,B.LAI-J19,B,03_A6B,B,B,0.072677,0.998496,0.999668,False,B,full
2,C.93IN101,C,87_cpx,C,C,0.147904,0.994804,0.999081,False,C,full
3,01_AE.95TNIH022,01_AE,58_01B,01_AE,01_AE,0.196625,0.776646,0.998898,False,01_AE,full
4,01_AE.93JP_NH1,01_AE,58_01B,01_AE,01_AE,0.136432,0.779268,0.998884,False,01_AE,full
...,...,...,...,...,...,...,...,...,...,...,...
15930,B.RL42,B,83_cpx,B,B,0.116997,0.998498,0.999670,False,B,full
15931,D.84ZR085,D,B,D,D,0.135010,0.936094,0.995936,False,D,full
15932,G.92NG083_JV10832,G,06_cpx,G,G,0.059785,0.819189,0.917711,False,G,full
15933,B.REHTLV3_LAI_IIIB,B,130_A1B,B,B,0.102420,0.998498,0.999693,False,B,full


In [31]:
# Output folder for the comparison tables (also used by later export cells).
outpath = r"C:\Users\Dan\Desktop\CDC\Projects\dives\data\preproc\retry_071124\_compare_tg_vs_sep\validation"

# Write the full-genome comparison table without the positional index column.
full_genomes_csv = f"{outpath}\\Data_FullGenomes_confidence_comparison_2.csv"
full_combined_conf.to_csv(path_or_buf=full_genomes_csv, index=False)

In [25]:
# Count how often all three models agree. The column is created as
# "all agree?" (not "agree?"), so the old key raises KeyError on a fresh run.
full_combined_conf["all agree?"].value_counts()

agree?
False    15431
True       504
Name: count, dtype: int64

In [32]:
# How often each of the two separate models (full vs pol) was the more confident one.
full_combined_conf.loc[:, "max_guesser"].value_counts()

max_guesser
full    15932
pol         3
Name: count, dtype: int64

In [33]:
# Now do the same for the pol test dataset.
# NOTE(review): the full-model run folder is named differently from the other
# two (no "new_..._validation" prefix). Confirm that all three runs scored the
# same sequences, because the inner merges below silently drop unmatched ids.
d_pol_m_full = r'C:\Users\Dan\Desktop\CDC\Projects\dives\data\preproc\retry_071124\NXTSA_4293_model_curated_full\NXTSA_4293_model_curated_full'
d_pol_m_combined =r'C:\Users\Dan\Desktop\CDC\Projects\dives\data\preproc\retry_071124\new_NXTSA_validation_new_combined_model\new_NXTSA_validation_new_combined_model'
d_pol_m_pol_only = r'C:\Users\Dan\Desktop\CDC\Projects\dives\data\preproc\retry_071124\new_NXTSA_validation_new_pol_only_model\new_NXTSA_validation_new_pol_only_model'

# Load the confidences and per-class evaluation metrics for each model run.
d_pol_m_full_conf = pd.read_csv(d_pol_m_full + file_endings[0])
d_pol_m_full_eval = pd.read_csv(d_pol_m_full + file_endings[1])
d_pol_m_pol_conf = pd.read_csv(d_pol_m_pol_only + file_endings[0])
d_pol_m_pol_eval = pd.read_csv(d_pol_m_pol_only + file_endings[1])
d_pol_m_combined_conf = pd.read_csv(d_pol_m_combined + file_endings[0])
d_pol_m_combined_eval = pd.read_csv(d_pol_m_combined + file_endings[1])

# Keep only the columns we need. true_class is taken from the full-model frame only.
d_pol_m_full_conf = d_pol_m_full_conf[["id", "true_class", "predicted_class", "predicted_class_confidence"]]
d_pol_m_combined_conf = d_pol_m_combined_conf[["id", "predicted_class", "predicted_class_confidence"]]
d_pol_m_pol_conf = d_pol_m_pol_conf[["id", "predicted_class", "predicted_class_confidence"]]

# Rename the columns to reflect the model they came from.
d_pol_m_full_conf = d_pol_m_full_conf.rename(columns={"predicted_class": "full_prediction", "predicted_class_confidence": "full_confidence"})
d_pol_m_combined_conf = d_pol_m_combined_conf.rename(columns={"predicted_class": "combined_prediction", "predicted_class_confidence": "combined_confidence"})
d_pol_m_pol_conf = d_pol_m_pol_conf.rename(columns={"predicted_class": "pol_prediction", "predicted_class_confidence": "pol_confidence"})

# Inner joins on "id" (the merge default): ids missing from any frame are dropped.
pol_combined_conf = d_pol_m_full_conf.merge(d_pol_m_combined_conf, on="id")
pol_combined_conf = pol_combined_conf.merge(d_pol_m_pol_conf, on="id")

# Reorder the columns. The explicit .copy() makes this an independent frame, so
# adding columns below does not trigger SettingWithCopyWarning.
pol_combined_conf = pol_combined_conf[["id", "true_class", "pol_prediction", "full_prediction", "combined_prediction", "pol_confidence", "full_confidence", "combined_confidence"]].copy()

# True only when all three models predicted the same class.
pol_combined_conf["all agree?"] = (
    (pol_combined_conf["pol_prediction"] == pol_combined_conf["full_prediction"])
    & (pol_combined_conf["full_prediction"] == pol_combined_conf["combined_prediction"])
)

# Take the guess of whichever of the full / pol models is more confident.
# The comparison is strict, so ties (and missing confidences) go to the pol model.
pol_data_full_more_confident = pol_combined_conf["full_confidence"] > pol_combined_conf["pol_confidence"]
pol_combined_conf["full_pol_seperate_max_guess"] = pol_combined_conf["full_prediction"].where(
    pol_data_full_more_confident, pol_combined_conf["pol_prediction"]
)
pol_combined_conf["max_guesser"] = pol_data_full_more_confident.map({True: "full", False: "pol"})

# Title for the table shown below. The earlier `.style.set_caption(...)` call
# built a Styler and threw it away, so the caption was never displayed.
print("Dataset: NXTSA Pol Genomes.")
pol_combined_conf


Unnamed: 0,id,true_class,pol_prediction,full_prediction,combined_prediction,pol_confidence,full_confidence,combined_confidence,all agree?,full_pol_seperate_max_guess,max_guesser
0,DQ314732,01_AE,01_AE,26_A5U,01_AE,0.936703,0.042985,0.999020,False,01_AE,pol
1,JX112838,01_AE,01_AE,26_A5U,01_AE,0.942507,0.042624,0.999076,False,01_AE,pol
2,JX446645,01_AE,01_AE,26_A5U,01_AE,0.932038,0.042499,0.999110,False,01_AE,pol
3,JX446720,01_AE,01_AE,26_A5U,01_AE,0.892447,0.042637,0.999054,False,01_AE,pol
4,JX446756,01_AE,01_AE,26_A5U,01_AE,0.940217,0.042709,0.999052,False,01_AE,pol
...,...,...,...,...,...,...,...,...,...,...,...
2247,MH710523,C,131_A1B,138_cpx,82_cpx,0.063378,0.057470,0.976070,False,131_A1B,pol
2248,MH710524,C,C,138_cpx,77_cpx,0.907251,0.072994,0.637479,False,C,pol
2249,MH710525,C,C,138_cpx,77_cpx,0.264659,0.062792,0.848724,False,C,pol
2250,MH710526,C,C,138_cpx,77_cpx,0.815311,0.060187,0.952149,False,C,pol


In [34]:
# Write the pol-dataset comparison table next to the full-genome one,
# without the positional index column.
pol_genomes_csv = f"{outpath}\\Data_PolGenomes_confidence_comparison.csv"
pol_combined_conf.to_csv(path_or_buf=pol_genomes_csv, index=False)