In [1]:
import pandas as pd
import os

In [60]:
# Input file names (resolved relative to the notebook's working directory).
fin_checkv = "quality_summary.tsv"
fin_genomad_virus = "isophage_ragtag_nonRef_virus_summary.tsv"
fin_genomad_plasmid = "isophage_ragtag_nonRef_plasmid_summary.tsv"
fin_iphop = "Host_prediction_to_genome_m90.csv"
fin_vs2 = "final-viral-score.tsv"
fin_vs2_cat = "vs2_category.csv"
fin_clst = "isophage_p95_s95_c95_cluster_ani_raw.tsv"

# Tab-separated tables that carry their own header row.
df_checkv = pd.read_table(fin_checkv)
df_genomad_virus = pd.read_table(fin_genomad_virus)
df_genomad_plasmid = pd.read_table(fin_genomad_plasmid)
df_vs2 = pd.read_table(fin_vs2)

# Comma-separated tables (comma is the read_csv default separator).
df_iphop = pd.read_csv(fin_iphop)
df_vs2_cat = pd.read_csv(fin_vs2_cat)

# The cluster file has no header row; name its two columns explicitly.
df_clst = pd.read_table(fin_clst, header=None, names=["cluster", "seq_name"])

# Stack the virus and plasmid summaries row-wise (original row indices are kept).
df_genomad = pd.concat([df_genomad_virus, df_genomad_plasmid])

# One row per "Virus": order by "Confidence score" from highest to lowest,
# then keep only the first row seen for each "Virus".
df_iphop_one = (
    df_iphop
    .sort_values(by="Confidence score", ascending=False)
    .drop_duplicates(subset="Virus", keep="first")
)

In [81]:
# Build the combined table, starting from df_genomad_virus and joining on the
# sequence identifier:
#   - df_checkv    : inner join (seq_name == contig_id); rows without a match are dropped
#   - df_vs2       : left join  (seq_name == seqname)
#   - df_iphop_one : left join  (seq_name == Virus)
df = (
    df_genomad_virus
    .merge(df_checkv, how="inner", left_on="seq_name", right_on="contig_id")
    .merge(df_vs2, how="left", left_on="seq_name", right_on="seqname")
    .merge(df_iphop_one, how="left", left_on="seq_name", right_on="Virus")
)

# Optional filter (currently disabled): keep only rows whose checkv_quality is
# "Complete", "High-quality" or "Medium-quality".
# df = df[df["checkv_quality"].isin(["Complete", "High-quality", "Medium-quality"])]

In [82]:
# Strip the leading "isophage_" tag from every merged sequence name so the
# names can be matched against the "cluster" column of df_clst.
# Only the leading prefix is removed: str.replace() would also delete any later
# occurrence of "isophage_" inside a name, silently changing the identifier and
# making it miss the cluster lookup.
seq_prefix = "isophage_"
vcontigs = [
    name[len(seq_prefix):] if name.startswith(seq_prefix) else name
    for name in df["seq_name"].to_list()
]
vcontigs

['DMPmix_3_length=59943_depth=185.62x_circular=true',
 'DMPmix_7_length=9409_depth=0.14x',
 'D45mix_1_length=165254_depth=1.00x_circular=true',
 'D45s11_17_length=40759_depth=57.48x_circular=true',
 'DMPmix_8_length=5416_depth=0.29x',
 'DMPs09_46_length=34615_depth=41.72x',
 'DMPmix_4_length=42642_depth=9.47x_circular=true',
 'D45mix_2_length=42942_depth=0.26x',
 'D45mix_3_length=36732_depth=0.40x',
 'DMPmix_6_length=30335_depth=0.49x',
 'DMPmix_13_length=1117_depth=0.29x']

In [83]:
# Keep the cluster rows whose "cluster" value is one of the names in vcontigs,
# then list the "seq_name" entry of each retained row.
in_vcontigs = df_clst["cluster"].isin(vcontigs)
df_clst2 = df_clst[in_vcontigs]
df_clst2["seq_name"].tolist()

['D45mix_1_length=165254_depth=1.00x_circular=true',
 'DMPmix_3_length=59943_depth=185.62x_circular=true',
 'D45mix_2_length=42942_depth=0.26x,DMPmix_10_length=4151_depth=0.15x',
 'DMPmix_4_length=42642_depth=9.47x_circular=true,DMPmix_12_length=1240_depth=0.22x,DMPmix_14_length=1090_depth=0.32x',
 'D45s11_17_length=40759_depth=57.48x_circular=true',
 'D45mix_3_length=36732_depth=0.40x,DMPmix_5_length=35162_depth=4.37x',
 'DMPs09_46_length=34615_depth=41.72x',
 'DMPmix_6_length=30335_depth=0.49x',
 'DMPmix_7_length=9409_depth=0.14x',
 'DMPmix_8_length=5416_depth=0.29x',
 'DMPmix_13_length=1117_depth=0.29x']

In [84]:
# Display the merged table built from df_genomad_virus, df_checkv, df_vs2 and df_iphop_one.
df

Unnamed: 0,seq_name,length_x,topology,coordinates,n_genes,genetic_code,virus_score,fdr,n_hallmarks,marker_enrichment,...,length_y,hallmark,viral,cellular,Virus,Host genome,Host taxonomy,Main method,Confidence score,Additional methods
0,isophage_DMPmix_3_length=59943_depth=185.62x_c...,59943,Linear,,76,11,0.9809,,9,94.109,...,59943,17,86.8,1.3,,,,,,
1,isophage_DMPmix_7_length=9409_depth=0.14x,9409,Linear,,13,11,0.9794,,6,17.245,...,9409,6,92.3,0.0,isophage_DMPmix_7_length=9409_depth=0.14x,RS_GCF_001676055.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,95.4,iPHoP-RF;74.30
2,isophage_D45mix_1_length=165254_depth=1.00x_ci...,165254,Linear,,267,11,0.9791,,53,313.4203,...,165254,43,68.5,0.7,isophage_D45mix_1_length=165254_depth=1.00x_ci...,RS_GCF_006386485.1,d__Bacteria;p__Actinobacteriota;c__Actinomycet...,blast,94.2,iPHoP-RF;74.80
3,isophage_D45s11_17_length=40759_depth=57.48x_c...,40759,Linear,,57,11,0.9771,,10,53.0881,...,40759,12,71.4,3.2,isophage_D45s11_17_length=40759_depth=57.48x_c...,RS_GCF_002902365.1,d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphy...,iPHoP-RF,97.1,blast;92.10
4,isophage_DMPmix_8_length=5416_depth=0.29x,5416,Linear,,5,11,0.9766,,2,5.6681,...,5416,4,100.0,0.0,isophage_DMPmix_8_length=5416_depth=0.29x,GB_GCA_900635025.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,93.6,iPHoP-RF;69.30
5,isophage_DMPs09_46_length=34615_depth=41.72x,34615,Linear,,51,11,0.9758,,16,54.5464,...,34615,10,64.7,2.0,isophage_DMPs09_46_length=34615_depth=41.72x,RS_GCF_004000745.1,d__Bacteria;p__Firmicutes;c__Bacilli;o__Paenib...,iPHoP-RF,96.4,
6,isophage_DMPmix_4_length=42642_depth=9.47x_cir...,42642,Linear,,67,11,0.9686,,8,70.853,...,42642,15,59.1,7.6,isophage_DMPmix_4_length=42642_depth=9.47x_cir...,RS_GCF_001676055.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,96.6,iPHoP-RF;71.30
7,isophage_D45mix_2_length=42942_depth=0.26x,42942,Linear,,67,11,0.9677,,19,65.9378,...,42942,13,44.6,12.3,isophage_D45mix_2_length=42942_depth=0.26x,GB_GCA_900635025.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,96.6,iPHoP-RF;93.10
8,isophage_D45mix_3_length=36732_depth=0.40x,36732,Linear,,52,11,0.9666,,18,54.2128,...,36732,13,53.8,11.5,isophage_D45mix_3_length=36732_depth=0.40x,RS_GCF_001676055.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,95.8,iPHoP-RF;86.30
9,isophage_DMPmix_6_length=30335_depth=0.49x,30335,Linear,,38,11,0.9607,,11,35.9155,...,30335,9,50.0,10.5,isophage_DMPmix_6_length=30335_depth=0.49x,RS_GCF_001676055.1,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,blast,96.7,iPHoP-RF;73.50


In [86]:
# Sequence names of the merged table as a plain Python list, one entry per row.
df["seq_name"].tolist()

['isophage_DMPmix_3_length=59943_depth=185.62x_circular=true',
 'isophage_DMPmix_7_length=9409_depth=0.14x',
 'isophage_D45mix_1_length=165254_depth=1.00x_circular=true',
 'isophage_D45s11_17_length=40759_depth=57.48x_circular=true',
 'isophage_DMPmix_8_length=5416_depth=0.29x',
 'isophage_DMPs09_46_length=34615_depth=41.72x',
 'isophage_DMPmix_4_length=42642_depth=9.47x_circular=true',
 'isophage_D45mix_2_length=42942_depth=0.26x',
 'isophage_D45mix_3_length=36732_depth=0.40x',
 'isophage_DMPmix_6_length=30335_depth=0.49x',
 'isophage_DMPmix_13_length=1117_depth=0.29x']