# Interpro analysis of external rings

In [1]:
# Load libraries
library(readr)
library(data.table)
library(dplyr)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [1]:
# Set the working directory to the folder that contains plots_data/. If you generated new results, point setwd() at the folder where those files are located instead.
# Example:
# setwd("~/microbiota_sialylation/genomes_download/")

In [3]:
# Machine-specific absolute path: adjust it to your own checkout before running.
# The relative "plots_data/..." paths used by the following cells are resolved
# against this directory.
setwd("/mnt/l/Sia_git/genomes_download/")

In [17]:
# Load the InterPro result table for the KpsM candidates.
# col_names = FALSE: the first line is read as data, so the columns carry
# default names until they are relabelled below.
matched_KpsM_ID_interpro <- read_delim(
  "plots_data/Interpro_results/KpsM_ID_final.tsv_retrieved_final_results.tsv",
  delim = "\t",
  col_names = FALSE,
  trim_ws = TRUE,
  escape_double = FALSE
)

# Column labels applied to every InterPro result table in this notebook.
# NOTE(review): in the standard InterProScan TSV layout, column 3 is the
# sequence length and columns 14/15 are GO and pathway annotations; confirm
# before renaming the "unkown*" labels, since later code may rely on them.
column_names_interpro <- c(
  "Sequence", "md5", "unkown", "Bank", "signature",
  "annotation", "align_begin", "align_end", "e-value", "Hit",
  "date", "Interpro_code", "annotation_two", "unkown2", "unkown3"
)

# Relabel the columns in place (data.table::setnames works by reference)
data.table::setnames(matched_KpsM_ID_interpro, column_names_interpro)

# Preview the first rows and report the number of hits loaded
matched_KpsM_ID_interpro %>% head()
matched_KpsM_ID_interpro %>% nrow()

# List every distinct value found in the annotation column
unique_annot_KpsM <- unique(matched_KpsM_ID_interpro[["annotation"]])
unique_annot_KpsM

[1mRows: [22m[34m11689[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): X1, X2, X4, X5, X6, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (3): X3, X7, X8
[33mlgl[39m  (1): X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sequence,md5,unkown,Bank,signature,annotation,align_begin,align_end,e-value,Hit,date,Interpro_code,annotation_two,unkown2,unkown3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,Pfam,PF01061,ABC-2 type transporter,5,218,9.2e-08,True,24-01-2026,IPR013525,"ABC-2 type transporter, transmembrane domain",-,-
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,PANTHER,PTHR30413,INNER MEMBRANE TRANSPORT PERMEASE,3,248,1.7e-15,True,24-01-2026,-,-,-,-
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,PRINTS,PR00164,ABC-2 type transport system membrane protein signature,25,46,6.2e-08,True,24-01-2026,IPR000412,ABC-2 transporter,-,-
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,PRINTS,PR00164,ABC-2 type transport system membrane protein signature,138,162,6.2e-08,True,24-01-2026,IPR000412,ABC-2 transporter,-,-
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,PRINTS,PR00164,ABC-2 type transport system membrane protein signature,175,194,6.2e-08,True,24-01-2026,IPR000412,ABC-2 transporter,-,-
GCF_012224765.1_protein.faa_WP_167844504.1,05c8429d6d974e9df66e2c3ffaae9c11,260,PRINTS,PR00164,ABC-2 type transport system membrane protein signature,195,214,6.2e-08,True,24-01-2026,IPR000412,ABC-2 transporter,-,-


In [8]:
# Member-database signature accessions used to select KpsM hits
patterns_KpsM <- c("PTHR30413", "PS51012", "PIRSF006648", "PF01061", "PR00164")

# Keep only the rows whose signature accession is in the list above
filtered_KpsM_interpro <- dplyr::filter(
  matched_KpsM_ID_interpro,
  signature %in% patterns_KpsM
)

# One row per protein (first hit kept), plus a GCF_ID column holding the first
# two underscore-separated fields of the sequence name (the NCBI assembly ID)
KpsM_interpro_distinct <- filtered_KpsM_interpro %>%
  distinct(Sequence, .keep_all = TRUE) %>%
  mutate(GCF_ID = str_extract(Sequence, "^[^_]+_[^_]+"))

# One row per assembly ID; only the GCF_ID column is retained
KpsM_interpro_distinct_unique <- distinct(KpsM_interpro_distinct, GCF_ID)

# Number of retained hits, then number of distinct assemblies
filtered_KpsM_interpro %>% nrow()
KpsM_interpro_distinct_unique %>% nrow()

In [16]:
# Load the InterPro result table for the KpsT candidates.
# col_names = FALSE: the first line is read as data, so the columns carry
# default names until they are relabelled below.
matched_KpsT_ID_interpro <- read_delim(
  "plots_data/Interpro_results/KpsT_ID_final.tsv_retrieved_final_results.tsv",
  delim = "\t",
  col_names = FALSE,
  trim_ws = TRUE,
  escape_double = FALSE
)

# Relabel the columns in place with the shared InterPro column labels
data.table::setnames(matched_KpsT_ID_interpro, column_names_interpro)

# Preview the first rows and report the number of hits loaded
matched_KpsT_ID_interpro %>% head()
matched_KpsT_ID_interpro %>% nrow()

# List every distinct value found in the annotation column
unique_annot_KpsT <- unique(matched_KpsT_ID_interpro[["annotation"]])
unique_annot_KpsT

[1mRows: [22m[34m11513[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): X1, X2, X4, X5, X6, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (3): X3, X7, X8
[33mlgl[39m  (1): X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sequence,md5,unkown,Bank,signature,annotation,align_begin,align_end,e-value,Hit,date,Interpro_code,annotation_two,unkown2,unkown3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,CDD,cd03220,ABC_KpsT_Wzt,19,249,4.01601e-94,True,24-01-2026,IPR015860,"ABC transporter, teichoic acids export TagH-like",-,-
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,SUPERFAMILY,SSF52540,P-loop containing nucleoside triphosphate hydrolases,53,258,5e-42,True,24-01-2026,IPR027417,P-loop containing nucleoside triphosphate hydrolase,-,-
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,PANTHER,PTHR46743,TEICHOIC ACIDS EXPORT ATP-BINDING PROTEIN TAGH,19,262,2e-89,True,24-01-2026,IPR050683,Bacterial Polysaccharide Export ATP-binding,-,-
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,Gene3D,G3DSA:3.40.50.300,-,49,263,4.6e-43,True,24-01-2026,IPR027417,P-loop containing nucleoside triphosphate hydrolase,-,-
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,Pfam,PF00005,ABC transporter,62,195,9.6e-16,True,24-01-2026,IPR003439,"ABC transporter-like, ATP-binding domain",-,-
GCF_020410765.1_protein.faa_WP_086157215.1,b14d48c2285a6f69ba2a4bfb8a9d5d62,264,SMART,SM00382,AAA_5,70,244,1.3e-15,True,24-01-2026,IPR003593,AAA+ ATPase domain,-,-


In [11]:
# Member-database signature accessions used to select KpsT hits
patterns_KpsT <- c(
  "G3DSA:3.40.50.300", "PF00005", "SM00382", "PTHR46743",
  "SSF52540", "PS50893", "PS00211", "cd03220"
)

# Keep only the rows whose signature accession is in the list above
filtered_KpsT_interpro <- dplyr::filter(
  matched_KpsT_ID_interpro,
  signature %in% patterns_KpsT
)

# One row per protein (first hit kept), plus a GCF_ID column holding the first
# two underscore-separated fields of the sequence name (the NCBI assembly ID)
KpsT_interpro_distinct <- filtered_KpsT_interpro %>%
  distinct(Sequence, .keep_all = TRUE) %>%
  mutate(GCF_ID = str_extract(Sequence, "^[^_]+_[^_]+"))

# One row per assembly ID; only the GCF_ID column is retained
KpsT_interpro_distinct_unique <- distinct(KpsT_interpro_distinct, GCF_ID)

# Number of retained hits, then number of distinct assemblies
filtered_KpsT_interpro %>% nrow()
KpsT_interpro_distinct_unique %>% nrow()

In [15]:
# KpsD

# Load the InterPro result table for the KpsD candidates.
# col_names = FALSE: the first line is read as data, so the columns carry
# default names until they are relabelled below.
matched_KpsD_ID_interpro <- read_delim(
  "plots_data/Interpro_results/KpsD_ID_final.tsv_retrieved_final_results.tsv",
  delim = "\t",
  col_names = FALSE,
  trim_ws = TRUE,
  escape_double = FALSE
)

# Relabel the columns in place with the shared InterPro column labels
data.table::setnames(matched_KpsD_ID_interpro, column_names_interpro)

# Preview the first rows and report the number of hits loaded
matched_KpsD_ID_interpro %>% head()
matched_KpsD_ID_interpro %>% nrow()

# List every distinct value found in the annotation column
unique_annot_KpsD <- unique(matched_KpsD_ID_interpro[["annotation"]])
unique_annot_KpsD

[1mRows: [22m[34m6851[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): X1, X2, X4, X5, X6, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (3): X3, X7, X8
[33mlgl[39m  (1): X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sequence,md5,unkown,Bank,signature,annotation,align_begin,align_end,e-value,Hit,date,Interpro_code,annotation_two,unkown2,unkown3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,Gene3D,G3DSA:3.30.1950.10,wza like domain,88,167,1.6e-108,True,24-01-2026,-,-,-,-
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,Pfam,PF18412,Outer-membrane lipoprotein Wza C-terminal domain,352,381,5.8e-11,True,24-01-2026,IPR040716,"Outer-membrane lipoprotein Wza, C-terminal domain",-,-
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,Pfam,PF02563,Polysaccharide biosynthesis/export protein,80,163,3.6000000000000003e-22,True,24-01-2026,IPR003715,"Polysaccharide export protein, N-terminal domain",-,-
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,Gene3D,G3DSA:3.10.560.10,Outer membrane lipoprotein wza domain like,173,250,1.6e-108,True,24-01-2026,-,-,-,-
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,Gene3D,G3DSA:3.10.560.10,Outer membrane lipoprotein wza domain like,253,349,1.6e-108,True,24-01-2026,-,-,-,-
GCF_013357745.1_protein.faa_WP_000039770.1,c74d39c7bb81034ca45922fd9de0ca03,385,PANTHER,PTHR33619,POLYSACCHARIDE EXPORT PROTEIN GFCE-RELATED,12,371,2.8999999999999996e-68,True,24-01-2026,IPR049712,Polysaccharide export protein,-,-


In [14]:
# Member-database signature accessions used to select KpsD hits
patterns_KpsD <- c("G3DSA:3.10.560.10", "PTHR33619", "PF10531", "PF02563")

# Keep only the rows whose signature accession is in the list above
filtered_KpsD_interpro <- dplyr::filter(
  matched_KpsD_ID_interpro,
  signature %in% patterns_KpsD
)

# One row per protein (first hit kept), plus a GCF_ID column holding the first
# two underscore-separated fields of the sequence name (the NCBI assembly ID)
KpsD_interpro_distinct <- filtered_KpsD_interpro %>%
  distinct(Sequence, .keep_all = TRUE) %>%
  mutate(GCF_ID = str_extract(Sequence, "^[^_]+_[^_]+"))

# One row per assembly ID; only the GCF_ID column is retained
KpsD_interpro_distinct_unique <- distinct(KpsD_interpro_distinct, GCF_ID)

# Number of retained hits, then number of distinct assemblies
filtered_KpsD_interpro %>% nrow()
KpsD_interpro_distinct_unique %>% nrow()

In [18]:
# neuO

# Load the InterPro result table for the neuO candidates.
# col_names = FALSE: the first line is read as data, so the columns carry
# default names until they are relabelled below.
matched_neuO_ID_interpro <- read_delim(
  "plots_data/Interpro_results/neuO_ID_final.tsv_retrieved_final_results.tsv",
  delim = "\t",
  col_names = FALSE,
  trim_ws = TRUE,
  escape_double = FALSE
)

# Relabel the columns in place with the shared InterPro column labels
data.table::setnames(matched_neuO_ID_interpro, column_names_interpro)

# Preview the first rows and report the number of hits loaded
matched_neuO_ID_interpro %>% head()
matched_neuO_ID_interpro %>% nrow()

# List every distinct value found in the annotation column
unique_annot_neuO <- unique(matched_neuO_ID_interpro[["annotation"]])
unique_annot_neuO

[1mRows: [22m[34m9276[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): X1, X2, X4, X5, X6, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (3): X3, X7, X8
[33mlgl[39m  (1): X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sequence,md5,unkown,Bank,signature,annotation,align_begin,align_end,e-value,Hit,date,Interpro_code,annotation_two,unkown2,unkown3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,CDD,cd04647,LbH_MAT_like,95,203,4.80528E-33,True,24-01-2026,-,-,-,-
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,MobiDBLite,mobidb-lite,consensus disorder prediction,209,228,-,True,24-01-2026,-,-,-,-
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,Pfam,PF00132,Bacterial transferase hexapeptide (six repeats),153,187,4.1E-7,True,24-01-2026,IPR001451,Hexapeptide repeat,-,-
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,Gene3D,G3DSA:2.160.10.10,Hexapeptide repeat proteins,10,211,3.6E-36,True,24-01-2026,-,-,-,-
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,PANTHER,PTHR23416,SIALIC ACID SYNTHASE-RELATED,54,212,2.8E-37,True,24-01-2026,IPR051159,Hexapeptide-repeat containing acetyltransferases,-,-
GCF_008704715.1_protein.faa_WP_150504405.1,007e2eb5e2f839a632be266dfe33268a,272,SUPERFAMILY,SSF51161,Trimeric LpxA-like enzymes,54,210,3.06E-36,True,24-01-2026,IPR011004,Trimeric LpxA-like superfamily,-,-


In [19]:
# Filter by NeuO code 
patterns_neuO <- c("SSF51161","G3DSA:2.160.10.10")

filtered_neuO_interpro <- matched_neuO_ID_interpro %>%
  filter(signature %in% patterns_neuO)

# Get unique IDs
neuO_interpro_distinct <- filtered_neuO_interpro%>% distinct(Sequence, .keep_all = TRUE)
neuO_interpro_distinct$GCF_ID <- str_extract(neuO_interpro_distinct$Sequence, "^[^_]+_[^_]+") #extract GCF NCBI ID
neuO_interpro_distinct_unique <- neuO_interpro_distinct%>% distinct(GCF_ID, .keep_all = FALSE) #get unique GCF NCBI ID

nrow(filtered_neuO_interpro)
nrow(neuO_interpro_distinct_unique)

In [20]:
# neuD

# Load the InterPro result table for the neuD candidates.
# col_names = FALSE: the first line is read as data, so the columns carry
# default names until they are relabelled below.
matched_neuD_ID_interpro <- read_delim(
  "plots_data/Interpro_results/neuD_ID_final.tsv_retrieved_final_results.tsv",
  delim = "\t",
  col_names = FALSE,
  trim_ws = TRUE,
  escape_double = FALSE
)

# Relabel the columns in place with the shared InterPro column labels
data.table::setnames(matched_neuD_ID_interpro, column_names_interpro)

# Preview the first rows and report the number of hits loaded
matched_neuD_ID_interpro %>% head()
matched_neuD_ID_interpro %>% nrow()

# List every distinct value found in the annotation column
unique_annot_neuD <- unique(matched_neuD_ID_interpro[["annotation"]])
unique_annot_neuD

[1mRows: [22m[34m9335[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): X1, X2, X4, X5, X6, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (3): X3, X7, X8
[33mlgl[39m  (1): X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Sequence,md5,unkown,Bank,signature,annotation,align_begin,align_end,e-value,Hit,date,Interpro_code,annotation_two,unkown2,unkown3
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,PANTHER,PTHR43300,ACETYLTRANSFERASE,89,211,1.1e-22,True,24-01-2026,IPR050179,Transferase hexapeptide repeat,-,-
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,NCBIfam,TIGR03570,NeuD/PglB/VioB family sugar acetyltransferase,4,206,3.8e-53,True,24-01-2026,IPR020019,Acyltransferase PglD-like,-,-
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,CDD,cd03360,LbH_AT_putative,6,204,4.95844e-59,True,24-01-2026,IPR020019,Acyltransferase PglD-like,-,-
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,Pfam,PF17836,PglD N-terminal domain,4,82,5.7e-06,True,24-01-2026,IPR041561,"PglD, N-terminal",-,-
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,SUPERFAMILY,SSF51161,Trimeric LpxA-like enzymes,2,213,1.52e-36,True,24-01-2026,IPR011004,Trimeric LpxA-like superfamily,-,-
GCF_036345175.1_protein.faa_WP_330292806.1,9c2f7dc1e8019dfe45188ae6cae9adef,229,Gene3D,G3DSA:2.160.10.10,Hexapeptide repeat proteins,89,221,2.4e-27,True,24-01-2026,-,-,-,-


In [21]:
# Member-database signature accessions used to select NeuD hits
patterns_neuD <- c(
  "SSF51161", "TIGR03570", "cd03360", "PF17836",
  "PTHR43300", "G3DSA:3.40.50.20", "G3DSA:2.160.10.10", "PS00101"
)

# Keep only the rows whose signature accession is in the list above
filtered_neuD_interpro <- dplyr::filter(
  matched_neuD_ID_interpro,
  signature %in% patterns_neuD
)

# One row per protein (first hit kept), plus a GCF_ID column holding the first
# two underscore-separated fields of the sequence name (the NCBI assembly ID)
neuD_interpro_distinct <- filtered_neuD_interpro %>%
  distinct(Sequence, .keep_all = TRUE) %>%
  mutate(GCF_ID = str_extract(Sequence, "^[^_]+_[^_]+"))

# One row per assembly ID; only the GCF_ID column is retained
neuD_interpro_distinct_unique <- distinct(neuD_interpro_distinct, GCF_ID)

# Number of retained hits, then number of distinct assemblies
filtered_neuD_interpro %>% nrow()
neuD_interpro_distinct_unique %>% nrow()

In [24]:
# Save, for each gene, the table of distinct NCBI assembly IDs (GCF_ID column).
# The results are already available at the paths below.
#
# write_tsv() does not create missing folders, so make sure the output
# directory exists first. This matters when the notebook is re-run from a
# fresh results folder; the call is a silent no-op if the folder is present.
dir.create("plots_data/itol", recursive = TRUE, showWarnings = FALSE)

write_tsv(neuO_interpro_distinct_unique, "plots_data/itol/neuO_interpro_distinct_unique_phylogeny.tsv")
write_tsv(KpsT_interpro_distinct_unique, "plots_data/itol/KpsT_interpro_distinct_unique_phylogeny.tsv")
write_tsv(KpsM_interpro_distinct_unique, "plots_data/itol/KpsM_interpro_distinct_unique_phylogeny.tsv")
write_tsv(neuD_interpro_distinct_unique, "plots_data/itol/neuD_interpro_distinct_unique_phylogeny.tsv")
write_tsv(KpsD_interpro_distinct_unique, "plots_data/itol/KpsD_interpro_distinct_unique_phylogeny.tsv")