# Domain plots with positive selected sites

## Map sites back to human protein sequence

In [1]:
# packages
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
from pathlib import Path

In [9]:
# --- Directory ---
data_dir = Path("/home/emma/Amplicons/Workspaces/emma/downloaded_data")

# ------ INPUT FILES ------
# variables 
family = "CENPVL"
cluster = "CENPVL3"
paml_positive_sites = [69, 72, 75, 81, 86, 89, 110]
human_seq_name = "CENPVL1_isoform_NP_001342206.1;protein_id=NP_001342206.1_HomSap"

alignment_with_gaps = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/blastdb/cluster_alignments/{family}_cluster_{cluster}_NT.fa"  
original_fasta_all_species = f"{data_dir}/sequences_x_updated/{family}_selected_isoform/all_species_{family}.fa"  

# ------ LOAD SEQUENCES ------
original_records = list(SeqIO.parse(original_fasta_all_species, "fasta"))
original_seq = None
for record in original_records:
    if record.id == human_seq_name:
        original_seq = str(record.seq)
        break

alignment_records = list(SeqIO.parse(alignment_with_gaps, "fasta"))
human_with_gaps = None
for record in alignment_records:
    if record.id == human_seq_name:
        human_with_gaps = str(record.seq)
        break

print(f"Found sequence: {human_seq_name}")
print(f"Original sequence length: {len(original_seq)} bp ({len(original_seq)//3} codons)")
print(f"Alignment length (with gaps): {len(human_with_gaps)} bp ({len(human_with_gaps)//3} positions)")

# ------ CHECK WHICH COLUMNS WOULD BE REMOVED BY COMPLETE DELETION ------
# A column is removed if ANY species has a gap there
alignment_length = len(human_with_gaps)
columns_to_keep = []

# Check each alignment position across ALL species
for pos in range(0, alignment_length, 3):
    codon_has_gap = False
    for record in alignment_records:
        codon = str(record.seq)[pos:pos+3]
        if '-' in codon:
            codon_has_gap = True
            break
    
    if not codon_has_gap:
        columns_to_keep.append(pos // 3)  # Store as codon position

print(f"Columns kept after complete deletion: {len(columns_to_keep)} codons")
print(f"Columns removed: {(alignment_length // 3) - len(columns_to_keep)} codons")

# ------ BUILD MAPPING ------
trimmed_to_positions = []
for trimmed_idx, alignment_codon_idx in enumerate(columns_to_keep):
    trimmed_codon_num = trimmed_idx + 1  # 1-based PAML position
    alignment_codon_pos = alignment_codon_idx + 1  # 1-based alignment position
    
    # Get the codon from human sequence in alignment
    codon_start = alignment_codon_idx * 3
    codon = human_with_gaps[codon_start:codon_start+3]
    aa = str(Seq(codon).translate()) if '-' not in codon else 'X'
    
    # Get position in original human sequence (without gaps)
    human_codon_count = 0
    for i in range(0, codon_start + 3, 3):
        test_codon = human_with_gaps[i:i+3]
        if '-' not in test_codon:
            human_codon_count += 1
    
    trimmed_to_positions.append({
        'trimmed_codon': trimmed_codon_num,           # PAML position
        'alignment_position': alignment_codon_pos,     # Position in full alignment
        'original_position': human_codon_count,        # Position in human original
        'codon': codon,
        'aa': aa
    })

# ------ MAP PAML POSITIONS ------
results = []
original_positions = []  # Collect positions for easy copying

for paml_pos in paml_positive_sites:
    if paml_pos <= len(trimmed_to_positions):
        mapping = trimmed_to_positions[paml_pos - 1]
        results.append({
            'PAML_codon_position': paml_pos,
            'Full_alignment_position': mapping['alignment_position'],
            'Original_human_position': mapping['original_position'],
            'Codon': mapping['codon'],
            'Amino_acid': mapping['aa']
        })
        original_positions.append(mapping['original_position'])

df = pd.DataFrame(results)
print("\n=== Mapping PAML positions ===")
print(df.to_string(index=False))

# Print as Python list (easy to copy-paste)
print("\n=== Original human positions (Python list format) ===")
print(f"paml_positive_sites = {original_positions}")

# Print as comma-separated string (for other uses)
print("\n=== Original human positions (comma-separated) ===")
print(", ".join(map(str, original_positions)))

Found sequence: CENPVL1_isoform_NP_001342206.1;protein_id=NP_001342206.1_HomSap
Original sequence length: 819 bp (273 codons)
Alignment length (with gaps): 870 bp (290 positions)
Columns kept after complete deletion: 273 codons
Columns removed: 17 codons

=== Mapping PAML positions ===
 PAML_codon_position  Full_alignment_position  Original_human_position Codon Amino_acid
                  69                       86                       69   AAG          K
                  72                       89                       72   CCC          P
                  75                       92                       75   CTG          L
                  81                       98                       81   ACC          T
                  86                      103                       86   GGG          G
                  89                      106                       89   CCG          P
                 110                      127                      110   CGG          R

=== Orig

## Use the InterProScan TSV file of the domains found to create a domain plot

In [2]:
%load_ext rpy2.ipython
#%reload_ext rpy2.ipython

In [4]:
%%R

library("dplyr")
library(ggplot2)
library(tidyr)
library(readr)
library(purrr)
library(tibble)
library(stringr)
library(forcats)

### OPNLW1

In [None]:
%%R
#### OPN1LW ####

# ------ INPUT FILES ------
interpro_file <- "OPN1LW_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(116, 153, 171, 180, 233, 274, 275, 277, 279, 298, 309)  


# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select the LWS opsin domain, retinal binding site, and Rhodopsin-like GPCR signatures
df_lws <- df %>%
  filter(signature_accession == "cd15081") %>%  # LWS opsin
  arrange(start)

df_retinal <- df %>%
  filter(str_detect(signature_description, "retinal binding site")) %>%
  arrange(start)

df_rhodopsin <- df %>%
  filter(
    signature_accession == "PR00237" &  # Rhodopsin-like GPCR superfamily signature
      analysis == "PRINTS"
  ) %>%
  arrange(start)

domain_colors <- list(
  'LWS' = '#E74C3C',
  'retinal' = '#F39C12',
  'rhodopsin' = '#3498DB'
)

# Print domains 
cat("\nDomains to plot:\n")
cat("LWS opsin:\n")
print(df_lws %>% select(signature_description, start, stop))
cat("\nRetinal binding site:\n")
print(df_retinal %>% select(signature_description, start, stop))
cat("\nRhodopsin-like GPCR signatures:\n")
print(df_rhodopsin %>% select(signature_description, start, stop))

# ------  PLOT ------
png('OPN1LW_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'OPN1LW Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

x_ticks <- seq(0, ceiling(protein_length/10)*10, by = 10)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
sub_height <- 0.025  
triangle_width <- 0.8
triangle_height <- 0.08

# ------  POSITIVE SELECTION SITES  ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check LWS domain
    if (nrow(df_lws) > 0) {
      if (df_lws$start[1] <= site && site <= df_lws$stop[1]) {
        in_any_domain <- TRUE
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------  BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------  DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------  RETINAL BINDING SITE ------
if (nrow(df_retinal) > 0) {
  start <- df_retinal$start[1]
  stop <- df_retinal$stop[1]
  name <- 'Retinal'
  color <- domain_colors[['retinal']]
  width <- stop - start
  
  # Draw above the LWS domain 
  retinal_y <- backbone_y + domain_height/2 + sub_height*2 + 0.08
  rect(start, retinal_y - sub_height/2, stop, retinal_y + sub_height/2,
       col = color, border = 'black', lwd = 1.5)
  
  text(start + width/2, retinal_y, name, 
       cex = 0.6, font = 2, col = 'white')
  
  coord_y <- retinal_y + sub_height/2 + 0.02
  text(start, coord_y, as.character(start), cex = 0.5, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
}

# ------  RHODOPSIN-LIKE GPCR SIGNATURES  ------
if (nrow(df_rhodopsin) > 0) {
  rhodopsin_y <- backbone_y + domain_height/2 + sub_height + 0.02
  
  for (i in 1:nrow(df_rhodopsin)) {
    start <- df_rhodopsin$start[i]
    stop <- df_rhodopsin$stop[i]
    width <- stop - start
    
    rect(start, rhodopsin_y - sub_height/2, stop, rhodopsin_y + sub_height/2,
         col = domain_colors[['rhodopsin']], border = 'black', lwd = 1.5)
    
    # "GPCR" label inside the box
    text(start + width/2, rhodopsin_y, "GPCR", 
         cex = 0.5, font = 2, col = 'white')
    
    # coordinates below each block
    coord_y <- rhodopsin_y - sub_height/2 - 0.015
    text(start, coord_y, as.character(start), cex = 0.5, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
  }
}

# ------ LWS OPSIN DOMAIN ------
if (nrow(df_lws) > 0) {
  start <- df_lws$start[1]
  stop <- df_lws$stop[1]
  name <- 'LWS opsin'
  color <- domain_colors[['LWS']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.8, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.7, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'OPN1LW_domains_selection.png'\n")


Protein length: 364 amino acids

Domains to plot:
LWS opsin:
# A tibble: 1 × 3
  signature_description start  stop
  <chr>                 <dbl> <dbl>
1 7tmA_LWS_opsin           42   333

Retinal binding site:
# A tibble: 1 × 3
  signature_description                          start  stop
  <chr>                                          <dbl> <dbl>
1 Visual pigments (opsins) retinal binding site.   306   322

Rhodopsin-like GPCR signatures:
# A tibble: 7 × 3
  signature_description                     start  stop
  <chr>                                     <dbl> <dbl>
1 Rhodopsin-like GPCR superfamily signature    55    79
2 Rhodopsin-like GPCR superfamily signature    88   109
3 Rhodopsin-like GPCR superfamily signature   133   155
4 Rhodopsin-like GPCR superfamily signature   168   189
5 Rhodopsin-like GPCR superfamily signature   220   243
6 Rhodopsin-like GPCR superfamily signature   266   290
7 Rhodopsin-like GPCR superfamily signature   304   330

Plot saved as 'OPN1LW_domains_sel

### MAGEA2

In [None]:
%%R
##### MAGEA2 #### 

# ------ INPUT FILES ------
interpro_file <- "MAGEA2_HomSap.tsv"
#  POSITIVE SELECTION SITES :
paml_positive_sites <- c(37, 60, 97, 104, 140, 145, 156, 290, 294, 296)  

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select ONLY the specific domains we want
df_domains <- df %>%
  filter(
    signature_accession == "SM01392" |  # MAGE_N_2
      str_detect(signature_description, "MAGE homology domain, winged helix WH1 motif") |
      str_detect(signature_description, "MAGE homology domain, winged helix WH2 motif")
  ) %>%
  arrange(start)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'WH1' = '#E74C3C',
  'WH2' = '#9B59B6',
  'N_terminal' = '#3498DB'
)

get_domain_color <- function(name) {
  if (str_detect(name, "WH1")) return(domain_colors[['WH1']])
  if (str_detect(name, "WH2")) return(domain_colors[['WH2']])
  if (str_detect(name, "MAGE_N_2")) return(domain_colors[['N_terminal']])
  return('#A5A5A5')
}

shorten_name <- function(name) {
  if (str_detect(name, "WH1")) return('MAGE WH1')
  if (str_detect(name, "WH2")) return('MAGE WH2')
  if (str_detect(name, "MAGE_N_2")) return('MAGE N-terminal')
  return('Unknown')
}

df_domains <- df_domains %>%
  mutate(
    short_name = sapply(signature_description, shorten_name),
    color = sapply(signature_description, get_domain_color)
  )

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
print(df_domains %>% select(signature_description, short_name, start, stop, color))

# ------ CREATE PLOT ------
png('MAGEA2_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.2, 0.8),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'MAGEA2 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with more ticks
x_ticks <- seq(0, ceiling(protein_length/10)*10, by = 10)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08  # Make disordered regions taller
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check structured domains
    if (nrow(df_domains) > 0) {
      for (i in 1:nrow(df_domains)) {
        if (df_domains$start[i] <= site && site <= df_domains$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation - all at the same height
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS (TALLER AND BEHIND DOMAINS) ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    width <- stop - start
    
    # Make disordered regions taller so they're visible behind domains
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW STRUCTURED DOMAINS ------
if (nrow(df_domains) > 0) {
  for (i in 1:nrow(df_domains)) {
    start <- df_domains$start[i]
    stop <- df_domains$stop[i]
    name <- df_domains$short_name[i]
    color <- df_domains$color[i]
    width <- stop - start
    
    # Draw rectangle
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add domain label
    text(start + width/2, backbone_y, name, 
         cex = 0.8, font = 2, col = 'white')
    
    # Add coordinates
    coord_y <- backbone_y + domain_height/2 + 0.05
    text(start, coord_y, as.character(start), cex = 0.7, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'MAGEA2_domains_selection.png'\n")


Protein length: 314 amino acids

Domains to plot:
# A tibble: 3 × 5
  signature_description                        short_name      start  stop color
  <chr>                                        <chr>           <dbl> <dbl> <chr>
1 MAGE_N_2                                     MAGE N-terminal     3    96 #349…
2 MAGE homology domain, winged helix WH1 motif MAGE WH1           80   190 #E74…
3 MAGE homology domain, winged helix WH2 motif MAGE WH2          193   314 #9B5…

Plot saved as 'MAGEA2_domains_selection.png'


### MAGEB6

In [16]:
%%R
#### MAGEB6 ####

# ------ INPUT FILES ------
interpro_file <- "MAGEB6_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(46, 49, 82, 87, 160, 180, 182, 185, 188, 189, 190, 191, 192, 200, 228, 247, 298, 313, 379)  # Replace with your sites, e.g., c(10, 25, 50, 100, ...)

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select ONLY the specific domains we want - including BOTH MAGE_N_2 domains
df_domains <- df %>%
  filter(
    signature_accession == "SM01392" |  # MAGE_N_2 (there are 2!)
      str_detect(signature_description, "MAGE homology domain, winged helix WH1 motif") |
      str_detect(signature_description, "MAGE homology domain, winged helix WH2 motif")
  ) %>%
  arrange(start)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'WH1' = '#E74C3C',
  'WH2' = '#9B59B6',
  'N_terminal_1' = '#3498DB',
  'N_terminal_2' = '#2980B9'  # Slightly darker blue for the second N-terminal
)

get_domain_color <- function(name, start_pos) {
  if (str_detect(name, "WH1")) return(domain_colors[['WH1']])
  if (str_detect(name, "WH2")) return(domain_colors[['WH2']])
  if (str_detect(name, "MAGE_N_2")) {
    # Distinguish the two N-terminal domains by position
    if (start_pos < 100) {
      return(domain_colors[['N_terminal_1']])
    } else {
      return(domain_colors[['N_terminal_2']])
    }
  }
  return('#A5A5A5')
}

shorten_name <- function(name, start_pos) {
  if (str_detect(name, "WH1")) return('MAGE WH1')
  if (str_detect(name, "WH2")) return('MAGE WH2')
  if (str_detect(name, "MAGE_N_2")) {
    # Label them as N-terminal 1 and 2
    if (start_pos < 100) {
      return('MAGE N-terminal 1')
    } else {
      return('MAGE N-terminal 2')
    }
  }
  return('Unknown')
}

df_domains <- df_domains %>%
  mutate(
    short_name = mapply(shorten_name, signature_description, start),
    color = mapply(get_domain_color, signature_description, start)
  )

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
print(df_domains %>% select(signature_description, short_name, start, stop, color))

# ------ CREATE PLOT ------
png('MAGEB6_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.2, 0.8),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'MAGEB6 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with more ticks
x_ticks <- seq(0, ceiling(protein_length/10)*10, by = 10)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08  # Make disordered regions taller
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check structured domains
    if (nrow(df_domains) > 0) {
      for (i in 1:nrow(df_domains)) {
        if (df_domains$start[i] <= site && site <= df_domains$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation - all at the same height
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS (TALLER AND BEHIND DOMAINS) ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    width <- stop - start
    
    # Make disordered regions taller so they're visible behind domains
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW STRUCTURED DOMAINS ------
if (nrow(df_domains) > 0) {
  for (i in 1:nrow(df_domains)) {
    start <- df_domains$start[i]
    stop <- df_domains$stop[i]
    name <- df_domains$short_name[i]
    color <- df_domains$color[i]
    width <- stop - start
    
    # Draw rectangle
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add domain label
    text(start + width/2, backbone_y, name, 
         cex = 0.8, font = 2, col = 'white')
    
    # Add coordinates
    coord_y <- backbone_y + domain_height/2 + 0.05
    text(start, coord_y, as.character(start), cex = 0.7, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'MAGEB6_domains_selection.png'\n")


Protein length: 407 amino acids

Domains to plot:
# A tibble: 4 × 5
  signature_description                        short_name      start  stop color
  <chr>                                        <chr>           <dbl> <dbl> <chr>
1 MAGE_N_2                                     MAGE N-termina…     3    95 #349…
2 MAGE_N_2                                     MAGE N-termina…   102   179 #298…
3 MAGE homology domain, winged helix WH1 motif MAGE WH1          166   276 #E74…
4 MAGE homology domain, winged helix WH2 motif MAGE WH2          280   401 #9B5…

Plot saved as 'MAGEB6_domains_selection.png'


### MAGEA1

In [17]:
%%R
#### MAGEA1 ####

# ------ INPUT FILES ------
interpro_file <- "MAGEA1_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(3, 16, 20, 32, 48, 92, 94, 111, 138, 170, 189, 201, 208, 222, 256, 303)

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select ONLY the specific domains we want
df_domains <- df %>%
  filter(
    signature_accession == "SM01392" |  # MAGE_N_2
      str_detect(signature_description, "MAGE homology domain, winged helix WH1 motif") |
      str_detect(signature_description, "MAGE homology domain, winged helix WH2 motif")
  ) %>%
  arrange(start)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'WH1' = '#E74C3C',
  'WH2' = '#9B59B6',
  'N_terminal' = '#3498DB'
)

get_domain_color <- function(name) {
  if (str_detect(name, "WH1")) return(domain_colors[['WH1']])
  if (str_detect(name, "WH2")) return(domain_colors[['WH2']])
  if (str_detect(name, "MAGE_N_2")) return(domain_colors[['N_terminal']])
  return('#A5A5A5')
}

shorten_name <- function(name) {
  if (str_detect(name, "WH1")) return('MAGE WH1')
  if (str_detect(name, "WH2")) return('MAGE WH2')
  if (str_detect(name, "MAGE_N_2")) return('MAGE N-terminal')
  return('Unknown')
}

df_domains <- df_domains %>%
  mutate(
    short_name = sapply(signature_description, shorten_name),
    color = sapply(signature_description, get_domain_color)
  )

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
print(df_domains %>% select(signature_description, short_name, start, stop, color))

# ------ CREATE PLOT ------
png('MAGEA1_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.2, 0.8),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'MAGEA1 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with more ticks
x_ticks <- seq(0, ceiling(protein_length/10)*10, by = 10)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check structured domains
    if (nrow(df_domains) > 0) {
      for (i in 1:nrow(df_domains)) {
        if (df_domains$start[i] <= site && site <= df_domains$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation - all at the same height
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    width <- stop - start
    
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW STRUCTURED DOMAINS ------
if (nrow(df_domains) > 0) {
  for (i in 1:nrow(df_domains)) {
    start <- df_domains$start[i]
    stop <- df_domains$stop[i]
    name <- df_domains$short_name[i]
    color <- df_domains$color[i]
    width <- stop - start
    
    # Draw rectangle
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add domain label
    text(start + width/2, backbone_y, name, 
         cex = 0.8, font = 2, col = 'white')
    
    # Add coordinates
    coord_y <- backbone_y + domain_height/2 + 0.05
    text(start, coord_y, as.character(start), cex = 0.7, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'MAGEA1_domains_selection.png'\n")


Protein length: 309 amino acids

Domains to plot:
# A tibble: 3 × 5
  signature_description                        short_name      start  stop color
  <chr>                                        <chr>           <dbl> <dbl> <chr>
1 MAGE_N_2                                     MAGE N-terminal     3    89 #349…
2 MAGE homology domain, winged helix WH1 motif MAGE WH1           72   183 #E74…
3 MAGE homology domain, winged helix WH2 motif MAGE WH2          186   309 #9B5…

Plot saved as 'MAGEA1_domains_selection.png'


### SSX

In [21]:
%%R


# ------ INPUT FILES ------
interpro_file <- "SSX1_HomSap.tsv"
paml_positive_sites <- c(12, 13, 21, 22, 35, 42, 47, 53, 54, 72, 77, 88, 108, 125, 153, 154)

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select structured domains
exclude_terms <- c('unintegrated', 'Coiled coil', 'Signal peptide', 'Low complexity', 'disorder', 'Disorder')
df_domains <- df

for (term in exclude_terms) {
  df_domains <- df_domains %>%
    filter(!str_detect(signature_description, regex(term, ignore_case = TRUE)))
}

domain_types_to_plot <- c('Pfam', 'SMART', 'SUPERFAMILY')
df_domains <- df_domains %>%
  filter(analysis %in% domain_types_to_plot) %>%
  arrange(start)

# ------ REMOVE OVERLAPS ------
calculate_overlap <- function(start1, stop1, start2, stop2) {
  overlap_start <- max(start1, start2)
  overlap_end <- min(stop1, stop2)
  overlap <- max(0, overlap_end - overlap_start)
  return(overlap)
}

remove_overlapping_domains <- function(df_domains) {
  if (nrow(df_domains) == 0) return(df_domains)
  
  df_domains$keep <- TRUE
  
  for (i in 1:nrow(df_domains)) {
    if (!df_domains$keep[i]) next
    
    if (i < nrow(df_domains)) {
      for (j in (i+1):nrow(df_domains)) {
        if (!df_domains$keep[j]) next
        
        overlap <- calculate_overlap(df_domains$start[i], df_domains$stop[i], 
                                     df_domains$start[j], df_domains$stop[j])
        length1 <- df_domains$stop[i] - df_domains$start[i]
        length2 <- df_domains$stop[j] - df_domains$start[j]
        
        if (overlap > 0.5 * min(length1, length2)) {
          if (df_domains$analysis[i] == 'SUPERFAMILY' && df_domains$analysis[j] == 'SMART') {
            df_domains$keep[j] <- FALSE
          } else if (df_domains$analysis[i] == 'SMART' && df_domains$analysis[j] == 'SUPERFAMILY') {
            df_domains$keep[i] <- FALSE
            break
          } else if (length1 >= length2) {
            df_domains$keep[j] <- FALSE
          } else {
            df_domains$keep[i] <- FALSE
            break
          }
        }
      }
    }
  }
  
  df_domains %>% filter(keep) %>% select(-keep)
}

df_domains <- remove_overlapping_domains(df_domains)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'KRAB' = '#5B9BD5',
  'krabfinus' = '#70AD47',
  'SSXRD' = '#FFC000',
  'DUF' = '#C55A11'
)

get_domain_color <- function(name) {
  for (key in names(domain_colors)) {
    if (str_detect(tolower(name), tolower(key))) {
      return(domain_colors[[key]])
    }
  }
  return('#A5A5A5')
}

shorten_name <- function(name) {
  name <- str_split(name, ',')[[1]][1]
  if (str_detect(name, 'KRAB')) return('KRAB')
  if (str_detect(tolower(name), 'krabfinus')) return('krabfinus')
  if (str_detect(name, 'SSXRD')) return('SSXRD')
  return(substr(str_split(name, ' ')[[1]][1], 1, 15))
}

df_domains <- df_domains %>%
  mutate(
    short_name = sapply(signature_description, shorten_name),
    color = sapply(signature_description, get_domain_color)
  )

# ------ CREATE PLOT ------
png('SSX_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.2, 0.8),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'SSX Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with more ticks
x_ticks <- c(seq(0, floor(protein_length/10)*10, by = 25), protein_length)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
triangle_width <- 0.8
triangle_height <- 0.05

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
for (site in paml_positive_sites) {
  in_any_domain <- FALSE
  
  # Check structured domains
  if (nrow(df_domains) > 0) {
    for (i in 1:nrow(df_domains)) {
      if (df_domains$start[i] <= site && site <= df_domains$stop[i]) {
        in_any_domain <- TRUE
        break
      }
    }
  }
  
  # Check disordered regions
  if (!in_any_domain && nrow(df_disordered) > 0) {
    for (i in 1:nrow(df_disordered)) {
      if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
        in_any_domain <- TRUE
        break
      }
    }
  }
  
  # Triangle positions
  if (in_any_domain) {
    triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
    triangle_top_y <- backbone_y - domain_height/2
  } else {
    triangle_bottom_y <- backbone_y - triangle_height
    triangle_top_y <- backbone_y
  }
  
  # Draw triangle
  polygon(x = c(site - triangle_width, site + triangle_width, site),
          y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
          col = 'red', border = 'darkred', lwd = 1)
  
  # Add site number annotation above the triangle
  text(site, triangle_bottom_y - 0.02, as.character(site), 
       cex = 0.6, font = 1, col = 'black', srt = 90, adj = 1)
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    width <- stop - start
    
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW STRUCTURED DOMAINS ------
if (nrow(df_domains) > 0) {
  for (i in 1:nrow(df_domains)) {
    start <- df_domains$start[i]
    stop <- df_domains$stop[i]
    name <- df_domains$short_name[i]
    color <- df_domains$color[i]
    width <- stop - start
    
    # Draw rectangle
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add domain label
    text(start + width/2, backbone_y, name, 
         cex = 0.8, font = 2, col = 'white')
    
    # Add coordinates
    coord_y <- backbone_y + domain_height/2 + 0.05
    text(start, coord_y, as.character(start), cex = 0.7, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'SSX_domains_selection.png'\n")

Protein length: 188 amino acids

Plot saved as 'SSX_domains_selection.png'


### SPANXN2 

In [None]:
%%R

# ------ INPUT FILES ------
interpro_file <- "SPANXN2_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(47, 53, 60, 92, 97)  # Replace with your sites

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select SPAN-X domain
df_domains <- df %>%
  filter(signature_accession == "PF07458") %>%  # SPAN-X family
  arrange(start)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'SPANX' = '#E74C3C'
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
print(df_domains %>% select(signature_description, start, stop))

# ------ CREATE PLOT ------
png('SPANXN2_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.2, 0.8),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'SPANXN2 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with more ticks
x_ticks <- seq(0, ceiling(protein_length/10)*10, by = 25)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check structured domains
    if (nrow(df_domains) > 0) {
      for (i in 1:nrow(df_domains)) {
        if (df_domains$start[i] <= site && site <= df_domains$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS (TALLER AND BEHIND DOMAINS) ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    width <- stop - start
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW SPAN-X DOMAIN ------
if (nrow(df_domains) > 0) {
  for (i in 1:nrow(df_domains)) {
    start <- df_domains$start[i]
    stop <- df_domains$stop[i]
    name <- 'SPAN-X'
    color <- domain_colors[['SPANX']]
    width <- stop - start
    
    # Draw rectangle
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add domain label
    text(start + width/2, backbone_y, name, 
         cex = 0.8, font = 2, col = 'white')
    
    # Add coordinates
    coord_y <- backbone_y + domain_height/2 + 0.05
    text(start, coord_y, as.character(start), cex = 0.7, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'SPANXN2_domains_selection.png'\n")

### VCX 

In [10]:
%%R

# ------ INPUT FILES ------
interpro_file <- "VCX_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(128, 129, 130, 131, 132)  # Replace with your sites


# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read_tsv(interpro_file, col_names = columns, show_col_types = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df %>%
  filter(str_detect(signature_description, regex("disorder", ignore_case = TRUE)))

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions %>% arrange(start)
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  bind_rows(merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select domains - VCX/VCY family
df_vcx <- df %>%
  filter(signature_accession == "PF15231") %>%  # Variable charge X/Y family
  arrange(start)

# Select testis-specific domain
df_testis <- df %>%
  filter(signature_accession == "PTHR15251") %>%
  arrange(start)

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'VCX' = '#E74C3C',
  'TESTIS' = '#3498DB'
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
cat("VCX/VCY domain:\n")
print(df_vcx %>% select(signature_description, start, stop))
cat("\nTestis-specific domain:\n")
print(df_testis %>% select(signature_description, start, stop))

# ------ CREATE PLOT ------
png('VCX_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'VCX Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with ticks every 20 amino acids
x_ticks <- seq(0, ceiling(protein_length/20)*20, by = 20)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
sub_height <- 0.025  # Smaller height for testis-specific domain
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check VCX domain
    if (nrow(df_vcx) > 0) {
      if (df_vcx$start[1] <= site && site <= df_vcx$stop[1]) {
        in_any_domain <- TRUE
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW TESTIS-SPECIFIC DOMAIN (above VCX domain) ------
if (nrow(df_testis) > 0) {
  start <- df_testis$start[1]
  stop <- df_testis$stop[1]
  name <- 'Testis-specific basic protein Y 1-related'
  color <- domain_colors[['TESTIS']]
  width <- stop - start
  
  # Draw above the VCX domain
  testis_y <- backbone_y + domain_height/2 + sub_height + 0.05
  rect(start, testis_y - sub_height/2, stop, testis_y + sub_height/2,
       col = color, border = 'black', lwd = 1.5)
  
  text(start + width/2, testis_y, name, 
       cex = 0.6, font = 2, col = 'white')
  
  coord_y <- testis_y + sub_height/2 + 0.02
  text(start, coord_y, as.character(start), cex = 0.5, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
}

# ------ DRAW VCX/VCY DOMAIN (main domain) ------
if (nrow(df_vcx) > 0) {
  start <- df_vcx$start[1]
  stop <- df_vcx$stop[1]
  name <- 'VCX/VCY'
  color <- domain_colors[['VCX']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.8, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.7, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'VCX_domains_selection.png'\n")

Protein length: 186 amino acids

Domains to plot:
VCX/VCY domain:
# A tibble: 1 × 3
  signature_description      start  stop
  <chr>                      <dbl> <dbl>
1 Variable charge X/Y family     1   119

Testis-specific domain:
# A tibble: 1 × 3
  signature_description                     start  stop
  <chr>                                     <dbl> <dbl>
1 TESTIS-SPECIFIC BASIC PROTEIN Y 1-RELATED    56   182

Plot saved as 'VCX_domains_selection.png'


### ZXD

In [13]:
%%R
# ------ INPUT FILES ------
interpro_file <- "ZXD_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(17, 119, 126, 135, 273, 799)  # Replace with your sites


# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read.delim(interpro_file, header = FALSE, col.names = columns, stringsAsFactors = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df[grepl("disorder", df$signature_description, ignore.case = TRUE), ]

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions[order(df_regions$start), ]
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  do.call(rbind, merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select C2H2 zinc finger domains - use ProSiteProfiles for clean domain boundaries
df_zf <- df[df$signature_accession == "PS50157", ]  # ProSiteProfiles Zinc finger C2H2 type
df_zf <- df_zf[order(df_zf$start), ]

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'ZF' = '#3498DB'  # Blue for zinc fingers
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
cat("Zinc finger C2H2 domains:\n")
print(df_zf[, c('signature_description', 'start', 'stop')])
cat("\nDisordered regions:\n")
print(df_disordered)

# ------ CREATE PLOT ------
png('ZXD_domains_selection.png', width = 18, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'ZXD Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with ticks every 50 amino acids
x_ticks <- seq(0, ceiling(protein_length/50)*50, by = 50)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.06
disorder_height <- 0.08
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check zinc finger domains
    if (nrow(df_zf) > 0) {
      for (i in 1:nrow(df_zf)) {
        if (df_zf$start[i] <= site && site <= df_zf$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW C2H2 ZINC FINGER DOMAINS (individual domains) ------
if (nrow(df_zf) > 0) {
  color <- domain_colors[['ZF']]
  
  for (i in 1:nrow(df_zf)) {
    start <- df_zf$start[i]
    stop <- df_zf$stop[i]
    width <- stop - start
    
    rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
         col = color, border = 'black', lwd = 2)
    
    # Add ZF label and number for each domain
    text(start + width/2, backbone_y, paste0('ZF', i), 
         cex = 0.6, font = 2, col = 'white')
    
    # Add coordinates below each domain
    coord_y <- backbone_y - domain_height/2 - 0.04
    text(start, coord_y, as.character(start), cex = 0.5, font = 2)
    text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
  }
}

# ------ LEGEND ------
legend('topright', 
       legend = c('C2H2 zinc finger', 'Positive selection', 'Disordered region'),
       fill = c(domain_colors[['ZF']], 'red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('black', 'darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'ZXD_domains_selection.png'\n")
cat(sprintf("\nSummary: %d zinc finger domains plotted\n", nrow(df_zf)))

Protein length: 799 amino acids

Domains to plot:
Zinc finger C2H2 domains:
                   signature_description start stop
 Zinc finger C2H2 type domain profile.   267  296
 Zinc finger C2H2 type domain profile.   300  329
 Zinc finger C2H2 type domain profile.   330  359
 Zinc finger C2H2 type domain profile.   360  387
 Zinc finger C2H2 type domain profile.   389  418
 Zinc finger C2H2 type domain profile.   420  449
 Zinc finger C2H2 type domain profile.   450  479
 Zinc finger C2H2 type domain profile.   480  509
 Zinc finger C2H2 type domain profile.   510  539

Disordered regions:
  start stop
     1   90
   118  140

Plot saved as 'ZXD_domains_selection.png'

Summary: 9 zinc finger domains plotted


### CENPVL 

In [15]:
%%R 
# ------ INPUT FILES ------
interpro_file <- "CENPVL1_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(69, 72, 75, 81, 86, 89, 110)  # Replace with your sites

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read.delim(interpro_file, header = FALSE, col.names = columns, stringsAsFactors = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df[grepl("disorder", df$signature_description, ignore.case = TRUE), ]

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions[order(df_regions$start), ]
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  do.call(rbind, merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select CENP-V/GFA domain
df_cenpv <- df[df$signature_accession == "PS51891", ]  # CENP-V/GFA domain
df_cenpv <- df_cenpv[order(df_cenpv$start), ]

# Select signal peptide
df_signal <- df[df$signature_accession == "SIGNAL_PEPTIDE", ]
df_signal <- df_signal[order(df_signal$start), ]

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'CENPV' = '#E74C3C',     # Red for CENP-V domain
  'SIGNAL' = '#F39C12'     # Orange for signal peptide
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
cat("CENP-V/GFA domain:\n")
print(df_cenpv[, c('signature_description', 'start', 'stop')])
cat("\nSignal peptide:\n")
print(df_signal[, c('signature_description', 'start', 'stop')])
cat("\nDisordered regions:\n")
print(df_disordered)

# ------ CREATE PLOT ------
png('CENPVL1_domains_selection.png', width = 16, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'CENPVL1 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with ticks every 20 amino acids
x_ticks <- seq(0, ceiling(protein_length/20)*20, by = 20)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
signal_height <- 0.03  # Smaller height for signal peptide
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check CENP-V domain
    if (nrow(df_cenpv) > 0) {
      if (df_cenpv$start[1] <= site && site <= df_cenpv$stop[1]) {
        in_any_domain <- TRUE
      }
    }
    
    # Check signal peptide
    if (!in_any_domain && nrow(df_signal) > 0) {
      if (df_signal$start[1] <= site && site <= df_signal$stop[1]) {
        in_any_domain <- TRUE
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW SIGNAL PEPTIDE (above CENP-V domain) ------
if (nrow(df_signal) > 0) {
  start <- df_signal$start[1]
  stop <- df_signal$stop[1]
  name <- 'Signal peptide'
  color <- domain_colors[['SIGNAL']]
  width <- stop - start
  
  # Draw above the main domain
  signal_y <- backbone_y + domain_height/2 + signal_height + 0.05
  rect(start, signal_y - signal_height/2, stop, signal_y + signal_height/2,
       col = color, border = 'black', lwd = 1.5)
  
  text(start + width/2, signal_y, name, 
       cex = 0.6, font = 2, col = 'white')
  
  coord_y <- signal_y + signal_height/2 + 0.02
  text(start, coord_y, as.character(start), cex = 0.5, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
}

# ------ DRAW CENP-V/GFA DOMAIN (main domain) ------
if (nrow(df_cenpv) > 0) {
  start <- df_cenpv$start[1]
  stop <- df_cenpv$stop[1]
  name <- 'CENP-V/GFA'
  color <- domain_colors[['CENPV']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.8, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.7, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'CENPVL1_domains_selection.png'\n")

Protein length: 272 amino acids

Domains to plot:
CENP-V/GFA domain:
        signature_description start stop
 CENP-V/GFA domain profile.   133  246

Signal peptide:
  signature_description start stop
 Signal peptide region     1   35

Disordered regions:
  start stop
     1   23
    65   95
   240  272

Plot saved as 'CENPVL1_domains_selection.png'


### CSF2RA

In [18]:
%%R

# ------ INPUT FILES ------
interpro_file <- "CSF2RA_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(377, 393, 394, 399, 401, 406, 407, 411, 413)  # Replace with your sites

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read.delim(interpro_file, header = FALSE, col.names = columns, stringsAsFactors = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df[grepl("disorder", df$signature_description, ignore.case = TRUE), ]

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions[order(df_regions$start), ]
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  do.call(rbind, merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select IL-3 receptor N-terminal domain
df_il3 <- df[df$signature_accession == "PF18611", ]
df_il3 <- df_il3[order(df_il3$start), ]

# Select IL-6 receptor binding domain
df_il6 <- df[df$signature_accession == "PF09240", ]
df_il6 <- df_il6[order(df_il6$start), ]

# Select Fibronectin type-III domain
df_fn3 <- df[df$signature_accession == "PS50853", ]
df_fn3 <- df_fn3[order(df_fn3$start), ]

# Select signal peptide
df_signal <- df[df$signature_accession == "SIGNAL_PEPTIDE", ]
df_signal <- df_signal[order(df_signal$start), ]

# Select transmembrane region
df_tm <- df[df$signature_accession == "TRANSMEMBRANE", ]
df_tm <- df_tm[order(df_tm$start), ]

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'IL3' = '#E74C3C',      # Red for IL-3 receptor domain
  'IL6' = '#3498DB',      # Blue for IL-6 receptor domain
  'FN3' = '#2ECC71',      # Green for Fibronectin type-III
  'SIGNAL' = '#F39C12',   # Orange for signal peptide
  'TM' = '#9B59B6'        # Purple for transmembrane
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
cat("IL-3 receptor N-terminal domain:\n")
print(df_il3[, c('signature_description', 'start', 'stop')])
cat("\nIL-6 receptor binding domain:\n")
print(df_il6[, c('signature_description', 'start', 'stop')])
cat("\nFibronectin type-III domain:\n")
print(df_fn3[, c('signature_description', 'start', 'stop')])
cat("\nSignal peptide:\n")
print(df_signal[, c('signature_description', 'start', 'stop')])
cat("\nTransmembrane region:\n")
print(df_tm[, c('signature_description', 'start', 'stop')])
cat("\nDisordered regions:\n")
print(df_disordered)

# ------ CREATE PLOT ------
png('CSF2RA_domains_selection.png', width = 18, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'CSF2RA Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with ticks every 50 amino acids
x_ticks <- seq(0, ceiling(protein_length/50)*50, by = 50)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
signal_height <- 0.03  # Smaller height for signal peptide
tm_height <- 0.04      # Smaller height for transmembrane
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check all domains
    domains_list <- list(df_il3, df_il6, df_fn3, df_signal, df_tm)
    for (domain_df in domains_list) {
      if (nrow(domain_df) > 0) {
        for (i in 1:nrow(domain_df)) {
          if (domain_df$start[i] <= site && site <= domain_df$stop[i]) {
            in_any_domain <- TRUE
            break
          }
        }
      }
      if (in_any_domain) break
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW SIGNAL PEPTIDE (above main domains) ------
if (nrow(df_signal) > 0) {
  start <- df_signal$start[1]
  stop <- df_signal$stop[1]
  name <- 'Signal'
  color <- domain_colors[['SIGNAL']]
  width <- stop - start
  
  signal_y <- backbone_y + domain_height/2 + signal_height + 0.05
  rect(start, signal_y - signal_height/2, stop, signal_y + signal_height/2,
       col = color, border = 'black', lwd = 1.5)
  
  text(start + width/2, signal_y, name, 
       cex = 0.6, font = 2, col = 'white')
  
  coord_y <- signal_y + signal_height/2 + 0.02
  text(start, coord_y, as.character(start), cex = 0.5, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
}

# ------ DRAW TRANSMEMBRANE REGION (above main domains, to the right) ------
if (nrow(df_tm) > 0) {
  start <- df_tm$start[1]
  stop <- df_tm$stop[1]
  name <- 'TM'
  color <- domain_colors[['TM']]
  width <- stop - start
  
  tm_y <- backbone_y + domain_height/2 + tm_height + 0.05
  rect(start, tm_y - tm_height/2, stop, tm_y + tm_height/2,
       col = color, border = 'black', lwd = 1.5)
  
  text(start + width/2, tm_y, name, 
       cex = 0.6, font = 2, col = 'white')
  
  coord_y <- tm_y + tm_height/2 + 0.02
  text(start, coord_y, as.character(start), cex = 0.5, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.5, font = 2)
}

# ------ DRAW IL-3 RECEPTOR N-TERMINAL DOMAIN ------
if (nrow(df_il3) > 0) {
  start <- df_il3$start[1]
  stop <- df_il3$stop[1]
  name <- 'IL-3 receptor'
  color <- domain_colors[['IL3']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.7, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.6, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.6, font = 2)
}

# ------ DRAW IL-6 RECEPTOR BINDING DOMAIN ------
if (nrow(df_il6) > 0) {
  start <- df_il6$start[1]
  stop <- df_il6$stop[1]
  name <- 'IL-6 binding'
  color <- domain_colors[['IL6']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.7, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.6, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.6, font = 2)
}

# ------ DRAW FIBRONECTIN TYPE-III DOMAIN ------
if (nrow(df_fn3) > 0) {
  start <- df_fn3$start[1]
  stop <- df_fn3$stop[1]
  name <- 'Fibronectin III'
  color <- domain_colors[['FN3']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.7, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.6, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.6, font = 2)
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'CSF2RA_domains_selection.png'\n")

Protein length: 428 amino acids

Domains to plot:
IL-3 receptor N-terminal domain:
                         signature_description start stop
 IL-3 receptor alpha chain N-terminal domain    35  112

IL-6 receptor binding domain:
                         signature_description start stop
 Interleukin-6 receptor alpha chain, binding   123  215

Fibronectin type-III domain:
                 signature_description start stop
 Fibronectin type-III domain profile.   220  320

Signal peptide:
   signature_description start stop
 Signal peptide region     1   19

Transmembrane region:
                                                          signature_description
 Region of a membrane-bound protein predicted to be embedded in the membrane.
   start stop
   325  346

Disordered regions:
 [1] protein_accession     seq_md5               seq_length           
 [4] analysis              signature_accession   signature_description
 [7] start                 stop                  score                
[

### CXorf49

In [20]:
%%R

# ------ INPUT FILES ------
interpro_file <- "CXorf49_HomSap.tsv"
# ADD YOUR POSITIVE SELECTION SITES HERE:
paml_positive_sites <- c(33, 45, 48, 135, 200, 222, 236, 283, 313, 320, 489, 499)  # Replace with your sites

# ------ LOAD INTERPRO DATA ------
columns <- c('protein_accession', 'seq_md5', 'seq_length', 'analysis', 'signature_accession', 
             'signature_description', 'start', 'stop', 'score', 'status', 'date', 
             'interpro_accession', 'interpro_description', 'go_annotations', 'pathways')

df <- read.delim(interpro_file, header = FALSE, col.names = columns, stringsAsFactors = FALSE)

protein_length <- df$seq_length[1]
cat(sprintf("Protein length: %d amino acids\n", protein_length))

# ------ SELECT DOMAINS ------
# Separate disordered regions
df_disordered <- df[grepl("disorder", df$signature_description, ignore.case = TRUE), ]

# Merge overlapping disordered regions
merge_overlapping_regions <- function(df_regions) {
  if (nrow(df_regions) == 0) {
    return(df_regions)
  }
  
  df_sorted <- df_regions[order(df_regions$start), ]
  merged <- list()
  current_start <- df_sorted$start[1]
  current_stop <- df_sorted$stop[1]
  
  if (nrow(df_sorted) > 1) {
    for (i in 2:nrow(df_sorted)) {
      if (df_sorted$start[i] <= current_stop + 5) {
        current_stop <- max(current_stop, df_sorted$stop[i])
      } else {
        merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
        current_start <- df_sorted$start[i]
        current_stop <- df_sorted$stop[i]
      }
    }
  }
  
  merged[[length(merged) + 1]] <- data.frame(start = current_start, stop = current_stop)
  do.call(rbind, merged)
}

df_disordered <- merge_overlapping_regions(df_disordered)

# Select DUF4641 domain
df_duf <- df[df$signature_accession == "PF15483", ]
df_duf <- df_duf[order(df_duf$start), ]

# ------ ASSIGN COLORS ------
domain_colors <- list(
  'DUF' = '#E74C3C'  # Red for DUF4641 domain
)

# Print domains to see what we're plotting
cat("\nDomains to plot:\n")
cat("DUF4641 domain:\n")
print(df_duf[, c('signature_description', 'start', 'stop')])
cat("\nDisordered regions:\n")
print(df_disordered)

# ------ CREATE PLOT ------
png('CXorf49_domains_selection.png', width = 18, height = 4, units = 'in', res = 300)

# Set up plot with no border
par(mar = c(4, 1, 3, 1), xpd = FALSE)
plot(NULL, xlim = c(-10, protein_length + 10), ylim = c(0.15, 0.85),
     xlab = 'Amino acid position', ylab = '', yaxt = 'n',
     main = 'CXorf49 Protein Domains and Positive Selection Sites',
     cex.lab = 1.2, cex.main = 1.4, font.lab = 2, font.main = 2,
     axes = FALSE)

# Add custom x-axis with ticks every 50 amino acids
x_ticks <- seq(0, ceiling(protein_length/50)*50, by = 50)
axis(1, at = x_ticks, labels = x_ticks, cex.axis = 0.9)

backbone_y <- 0.5
domain_height <- 0.05
disorder_height <- 0.08
triangle_width <- 0.8
triangle_height <- 0.08

# ------ PLOT POSITIVE SELECTION SITES AS TRIANGLES ------
if (length(paml_positive_sites) > 0) {
  for (site in paml_positive_sites) {
    in_any_domain <- FALSE
    
    # Check DUF4641 domain
    if (nrow(df_duf) > 0) {
      if (df_duf$start[1] <= site && site <= df_duf$stop[1]) {
        in_any_domain <- TRUE
      }
    }
    
    # Check disordered regions
    if (!in_any_domain && nrow(df_disordered) > 0) {
      for (i in 1:nrow(df_disordered)) {
        if (df_disordered$start[i] <= site && site <= df_disordered$stop[i]) {
          in_any_domain <- TRUE
          break
        }
      }
    }
    
    # Triangle positions
    if (in_any_domain) {
      triangle_bottom_y <- backbone_y - domain_height/2 - triangle_height
      triangle_top_y <- backbone_y - domain_height/2
    } else {
      triangle_bottom_y <- backbone_y - triangle_height
      triangle_top_y <- backbone_y
    }
    
    # Draw triangle
    polygon(x = c(site - triangle_width, site + triangle_width, site),
            y = c(triangle_bottom_y, triangle_bottom_y, triangle_top_y),
            col = 'red', border = 'darkred', lwd = 1)
    
    # Add site number annotation
    text(site, triangle_bottom_y - 0.02, as.character(site), 
         cex = 0.7, font = 1, col = 'black', srt = 90, adj = 1)
  }
}

# ------ DRAW BACKBONE ------
lines(c(0, protein_length), c(backbone_y, backbone_y), lwd = 3)

# ------ DRAW DISORDERED REGIONS ------
if (nrow(df_disordered) > 0) {
  for (i in 1:nrow(df_disordered)) {
    start <- df_disordered$start[i]
    stop <- df_disordered$stop[i]
    
    rect(start, backbone_y - disorder_height/2, stop, backbone_y + disorder_height/2,
         col = rgb(0.75, 0.75, 0.75, 0.5), border = 'gray50', lwd = 1)
  }
}

# ------ DRAW DUF4641 DOMAIN (main domain) ------
if (nrow(df_duf) > 0) {
  start <- df_duf$start[1]
  stop <- df_duf$stop[1]
  name <- 'Domain of Unknown function'
  color <- domain_colors[['DUF']]
  width <- stop - start
  
  rect(start, backbone_y - domain_height/2, stop, backbone_y + domain_height/2,
       col = color, border = 'black', lwd = 2)
  
  text(start + width/2, backbone_y, name, 
       cex = 0.8, font = 2, col = 'white')
  
  coord_y <- backbone_y + domain_height/2 + 0.05
  text(start, coord_y, as.character(start), cex = 0.7, font = 2)
  text(stop, coord_y, as.character(stop), cex = 0.7, font = 2)
}

# ------ LEGEND ------
legend('topright', 
       legend = c('Positive selection', 'Disordered region'),
       fill = c('red', rgb(0.75, 0.75, 0.75, 0.5)),
       border = c('darkred', 'gray50'),
       cex = 0.9, box.lwd = 1)

dev.off()

cat("\nPlot saved as 'CXorf49_domains_selection.png'\n")

Protein length: 514 amino acids

Domains to plot:
DUF4641 domain:
                 signature_description start stop
 Domain of unknown function (DUF4641)    81  505

Disordered regions:
  start stop
     1   68
   109  244
   272  484

Plot saved as 'CXorf49_domains_selection.png'
