<a href="https://colab.research.google.com/github/gilsonauerswald/Bioinformatic_Projects/blob/main/Phylogenetic_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --------------------------------------
# Install Required Packages (Only If Missing)
# --------------------------------------
# Every install is guarded by requireNamespace() so that re-running this cell
# does not download and rebuild packages that are already available.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# msa is installed through BiocManager rather than install.packages().
if (!requireNamespace("msa", quietly = TRUE)) {
  BiocManager::install("msa")
}

# Remaining packages: install only the ones that cannot be loaded yet.
cran_packages <- c("ape", "phangorn", "ade4")
missing_packages <- cran_packages[
  !vapply(cran_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# --------------------------------------
# Load Required Libraries
# --------------------------------------
library(ape)         # For phylogenetic tree analysis
library(phangorn)    # For maximum likelihood estimation
library(msa)         # For multiple sequence alignment
library(ade4)        # For heatmap visualization

In [None]:
# --------------------------------------
# Step 1: Read the Input Sequences
# --------------------------------------
# The FASTA file is fetched directly from its raw GitHub URL.
fasta_url <- "https://raw.githubusercontent.com/Omicslogic-git/Phylogenetic_data/refs/heads/main/COVID_Sequences.fasta"
sequences <- readDNAStringSet(filepath = fasta_url, format = "fasta")

# --------------------------------------
# Step 2: Multiple Sequence Alignment (ClustalW)
# --------------------------------------
alignment <- msa(inputSeqs = sequences, method = "ClustalW")

# ape's distance and tree functions work on DNAbin objects, so the alignment
# is converted before any downstream analysis.
aligned_sequences <- as.DNAbin(x = alignment)

# --------------------------------------
# Step 3: Pairwise Distance Matrix (Jukes-Cantor Model)
# --------------------------------------
dist_matrix <- dist.dna(x = aligned_sequences, model = "JC69")

# --------------------------------------
# Step 4: Heatmap Visualization - Basic Greyscale
# --------------------------------------
# The "dist" object is expanded to a full square matrix and then converted to
# a data frame, which is the input type passed to table.paint().
dist_df <- as.data.frame(as.matrix(dist_matrix))

# `clegend` is written out in full; the abbreviated `cleg` only worked through
# R's partial argument matching, which breaks silently if the function ever
# gains another argument with the same prefix.
table.paint(dist_df, clegend = 0, clabel.row = 0.5, clabel.col = 0.5)

# --------------------------------------
# Step 5: Enhanced Colored Heatmap
# --------------------------------------
dist_mat <- as.matrix(dist_matrix)

# seq_len() stays correct (empty) for a zero-dimension matrix, unlike 1:n.
row_idx <- seq_len(nrow(dist_mat))
col_idx <- seq_len(ncol(dist_mat))

# par() returns the previous settings; keep them so the custom margins can be
# undone afterwards instead of leaking into every later plot.
old_par <- par(mar = c(0.05, 4, 3.2, 0.05))
# NOTE(review): with a 0.05-line bottom margin the "Samples" x-axis title is
# most likely clipped - confirm whether it is meant to be visible.

# image() maps rows of z to the x axis and columns of z to the y axis.
# rev(heat.colors(100)) reverses the default palette order.
image(x = row_idx, y = col_idx, z = dist_mat,
      col = rev(heat.colors(100)), xaxt = "n", yaxt = "n",
      xlab = "Samples", ylab = "Samples")

# Left axis (y) is labelled from the columns, top axis (x) from the rows, to
# match the mapping above. The distance matrix is symmetric, so the labels are
# the same as before.
axis(side = 2, at = col_idx, labels = colnames(dist_mat),
     las = 2, cex.axis = 0.6)
axis(side = 3, at = row_idx, labels = rownames(dist_mat),
     las = 3, cex.axis = 0.6)

# Restore the margins that were active before this step.
par(old_par)

# --------------------------------------
# Step 6: Neighbor-Joining (NJ) Tree
# --------------------------------------
# Build the tree from the pairwise distances computed in Step 3, then draw it
# with the default layout.
nj_title <- "Neighbor-Joining Phylogenetic Tree"
nj_tree <- nj(X = dist_matrix)
plot(x = nj_tree, main = nj_title)

# Optional alternative (disabled): circular "fan" layout with custom colors.
# Uncomment the call below to use it.
#plot(nj_tree, type = "fan", cex = 0.7,
#     main = "Circular NJ Phylogenetic Tree",
#     tip.color = "blue", edge.color = "darkgreen", edge.width = 1.5,
#     no.margin = FALSE, label.offset = 0.05,
#     x.lim = c(-0.7, 0.7), y.lim = c(-0.7, 0.7))

# --------------------------------------
# Step 7: Maximum Likelihood (ML) Tree Construction
# --------------------------------------
# phangorn works on phyDat objects, so the aligned sequences are converted.
dna_phyDat <- phyDat(aligned_sequences, type = "DNA")

# An NJ tree serves as the starting point for the likelihood optimisation.
ml_start_tree <- nj(dist_matrix)
ml_model <- pml(ml_start_tree, data = dna_phyDat)

# optNni = TRUE turns on nearest-neighbour-interchange rearrangements.
# With optim.pml()'s default (optNni = FALSE) only branch lengths are
# optimised, so the result would keep the NJ starting topology unchanged.
ml_optimized <- optim.pml(ml_model, model = "JC", optNni = TRUE)

# Plot ML Tree
plot(ml_optimized$tree, main = "Maximum Likelihood Phylogenetic Tree")

# --------------------------------------
# Step 8: Optional - Hierarchical Clustering Dendrogram
# --------------------------------------
# Cluster the samples on the same pairwise distances. "complete" is hclust()'s
# default linkage, written out here so the choice is visible.
hclust_tree <- hclust(d = dist_matrix, method = "complete")

plot(
  hclust_tree,
  labels = NULL,
  hang = 0.1,
  cex = 0.6,
  ylab = "Height"
)