# HR, HK, and PolyX Analysis

This notebook processes extracted data from `.tsv` files and generates visualizations.

In [ ]:
# Load necessary libraries
library(tidyverse)
library(readxl)
library(dplyr)
library(conflicted)
conflict_prefer("filter", "dplyr")

## Define Organisms to Process
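Define the lookup table that maps each proteome UP number to its organism name. The organisms actually analysed are selected in the next section.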

In [ ]:
# Lookup table pairing each proteome UP number with its organism name.
# Written row-wise so every identifier sits next to the name it maps to.
up_to_organism <- tribble(
  ~UP_number,    ~Organism,
  "UP000000625", "Escherichia coli",
  "UP000002311", "Saccharomyces cerevisiae",
  "UP000006548", "Arabidopsis thaliana",
  "UP000001940", "Caenorhabditis elegans",
  "UP000000803", "Drosophila melanogaster",
  "UP000000589", "Mus musculus",
  "UP000005640", "Homo sapiens"
)

# Render the lookup table in the notebook output
knitr::kable(
  up_to_organism,
  caption = "Mapping of UP Numbers to Organisms"
)

## Select Organisms to Include in Analysis

In [ ]:
# UP numbers of the proteomes to keep in the downstream analysis.
# Modify as needed; names follow the UP-number-to-organism mapping above.
selected_organisms <- c(
  "UP000005640", # Homo sapiens
  "UP000000589"  # Mus musculus
)

## Load Data
Reading `.tsv` files into R and combining them into a single dataframe.

In [ ]:
# Locate the per-organism .tsv files. `pattern` is a regular expression, not a
# shell glob, so the extension is matched with an escaped dot anchored at the
# end of the file name (the previous "*.tsv" could also match e.g. "x.tsv.bak").
file_paths <- list.files(
  "~/Master/proteomes_hrs_hk",
  pattern = "\\.tsv$",
  full.names = TRUE
)

# Fail early with a clear message instead of a confusing "object not found"
# error further down when the directory is missing or empty.
if (length(file_paths) == 0) {
  stop("No .tsv files found in ~/Master/proteomes_hrs_hk", call. = FALSE)
}

# Read every file and stack the results row-wise into a single data frame.
organism_data <- file_paths %>%
  map_dfr(function(path) {
    read_tsv(path, show_col_types = FALSE) %>%
      mutate(
        # Force a common character type so rows from different files can be
        # combined (presumably the guessed types differ per file -- confirm).
        Polyx_lengths = as.character(Polyx_lengths),
        Count_grouped = as.character(Count_grouped),
        # Label each row with its source: the file name without extension
        # and without the "_hrs_hk" part.
        Organism = str_remove(
          tools::file_path_sans_ext(basename(path)),
          "_hrs_hk"
        )
      )
  })

# Preview the first rows of the combined data
head(organism_data)

## Data Processing
Filtering the data to the selected organisms and relabelling the `Hk` column (`0` becomes "not Hk", `1` becomes "Hk").

In [ ]:
# Keep only the rows of the selected organisms, then replace the numeric
# Hk codes with readable labels (1 -> "Hk", 0 -> "not Hk").
HkPolyx <- mutate(
  filter(organism_data, Organism %in% selected_organisms),
  Hk = recode(Hk, `1` = "Hk", `0` = "not Hk")
)


## Visualization: HK Protein Length
Generating a boxplot of protein length grouped by HK status.

In [ ]:
# Boxplot of protein Length for each Hk label, filled by the same label.
# The aesthetics are declared on the boxplot layer, the only layer in the plot.
ggplot(data = HkPolyx) +
  geom_boxplot(mapping = aes(x = Hk, y = Length, fill = Hk)) +
  theme_minimal()

## Statistical Analysis
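Wilcoxon rank-sum test, run separately for each organism, to check whether protein length differs significantly between Hk and non-Hk proteins.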

In [ ]:
# Per-organism Wilcoxon test of Length between the two Hk labels.
# Returns one row per Organism with the test's p-value.
summarise(
  group_by(HkPolyx, Organism),
  p_value = wilcox.test(Length ~ Hk)[["p.value"]]
)