**GENE SCOPE**

In [84]:
# ==============================================================================
# Importing Required Packages and Tools
# ==============================================================================

# Notebook-only setup: the lines starting with "!" are IPython shell escapes,
# so this cell must be run inside a notebook (e.g. Google Colab), not as a
# plain Python script.
print("⏳ Installing required packages and tools...")
!apt-get update -qq
# System packages for the `clustalw` and `RNAfold` command-line tools that the
# functions below invoke through subprocess.
!apt-get install -y -qq clustalw vienna-rna
!pip install -q biopython pandas
print("✅ Installation complete!")

# ==============================================================================
# Importing Required Libraries
# ==============================================================================
import os
import subprocess
import re
from collections import Counter
from Bio import SeqIO, AlignIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight
from Bio.Blast import NCBIWWW, NCBIXML
from google.colab import files
import pandas as pd
from IPython.display import display, HTML

# ==============================================================================
# Part 1: DNA -> mRNA -> Protein Analysis Functions
# ==============================================================================

def read_fasta(file_path):
    """Read the first record of a FASTA file and return its sequence.

    Only the first record is used; any further records in the file are
    ignored. On Google Colab, upload the file to the Files panel first so its
    path can be copied easily.

    Args:
        file_path: Path to the FASTA file.

    Returns:
        The first record's sequence as a ``str``, or ``None`` when the file
        cannot be read or contains no FASTA records (a message is printed in
        both cases).
    """
    try:
        # next() with a default avoids StopIteration on an empty file, which
        # would otherwise be reported as an error with an empty message.
        record = next(SeqIO.parse(file_path, "fasta"), None)
        if record is None:
            print(f"Error reading FASTA file: no FASTA records found in {file_path!r}")
            return None
        return str(record.seq)
    except Exception as e:  # best-effort for notebook use: report, return None
        print(f"Error reading FASTA file: {e}")
        return None

def dna_analysis(seq):
    """Summarise the base composition of a DNA sequence.

    Counting is case-insensitive. GC content is the share of G and C among
    all characters of ``seq`` (0 for an empty sequence).

    Returns:
        A dict with the counts of 'A', 'T', 'C' and 'G', the formatted
        'GC Content', the sequence 'Length' and a short 'Interpretation'.
    """
    base_tally = Counter(seq.upper())
    length = len(seq)

    gc_percent = 0
    if length > 0:
        gc_percent = 100 * (base_tally['G'] + base_tally['C']) / length

    # Classify into the three GC bands: > 60 high, 40..60 medium, < 40 low.
    if gc_percent > 60:
        verdict = "High GC Content. Suggests high thermal stability or gene-rich regions."
    elif gc_percent >= 40:
        verdict = "Medium GC Content. Typical for many organisms, including mammals."
    else:
        verdict = "Low GC Content. Suggests lower thermal stability (AT-rich)."

    summary = {base: base_tally[base] for base in ('A', 'T', 'C', 'G')}
    summary['GC Content'] = f"{gc_percent:.2f}%"
    summary['Length'] = length
    summary['Interpretation'] = verdict
    return summary

def transcribe_dna(seq):
    """Return the mRNA for a coding-strand DNA sequence (upper-cased, T -> U)."""
    coding_strand = seq.upper()
    return coding_strand.translate(str.maketrans('T', 'U'))

# Note: Transcription looks trivial here because a DNA sequence in a FASTA file
# (as found on NCBI, Ensembl, etc.) usually represents the coding strand, also
# called the sense strand or non-template strand. The coding strand is almost
# identical to the mRNA (it has T's where the mRNA has U's), so replacing T with
# U is sufficient. This is the strand that is typically saved in a FASTA file
# and annotated in databases.


def _print_positions_in_rows(title, positions_list, row_size=15):
    """Print *title*, then the positions in indented rows of ``row_size``."""
    print(f"\n▶ {title}:")
    if not positions_list:
        print("  []")
        return

    positions_str = [str(p) for p in positions_list]
    for i in range(0, len(positions_str), row_size):
        print("  " + " ".join(positions_str[i:i + row_size]))


def _predict_structure_with_rnafold(mrna_seq):
    """Run ``RNAfold --noPS`` on *mrna_seq* and return ``(structure, mfe_str)``.

    The prediction is best-effort: if RNAfold is missing or fails, the
    structure is replaced by an error message and the MFE is ``"N/A"``.
    """
    try:
        process = subprocess.run(
            ['RNAfold', '--noPS'],
            input=mrna_seq, text=True, capture_output=True, check=True,
        )
    except Exception as e:  # deliberate: the report should still be printed
        return f"Prediction failed. Error: {e}", "N/A"

    # RNAfold echoes the sequence on line 1 and prints
    # "<dot-bracket structure> (<energy>)" on line 2.
    lines = process.stdout.strip().split('\n')
    if len(lines) < 2:
        return "Error: Could not parse RNAfold output.", "N/A"

    structure, separator, energy = lines[1].rpartition(' (')
    if not separator:
        return lines[1], "N/A"
    # The energy may be padded inside the parentheses, e.g. "( -1.20)";
    # strip the padding so the value prints cleanly.
    return structure, energy.rstrip(')').strip()


def mrna_analysis(mrna_seq):
    """Print a formatted analysis report for an mRNA sequence.

    The report has three parts:
      * codon usage (count and frequency of each codon),
      * positions of start (AUG) and stop (UAA, UAG, UGA) codons,
      * a secondary-structure prediction and MFE obtained from RNAfold.

    Codons are read in reading frame 0 only (from the first base, in steps of
    three); positions are 0-based codon indices in that frame. A trailing
    partial codon is ignored.

    Args:
        mrna_seq: The mRNA sequence as a string.

    Returns:
        None. All results are printed.
    """
    codons = [mrna_seq[i:i + 3] for i in range(0, len(mrna_seq) - 2, 3)]
    codon_counts = Counter(codons)
    start_codon = 'AUG'
    stop_codons = {'UAA', 'UAG', 'UGA'}
    # enumerate() yields ascending indices, so both lists are already sorted.
    start_positions = [i for i, c in enumerate(codons) if c == start_codon]
    stop_positions = [i for i, c in enumerate(codons) if c in stop_codons]

    # With no codons the comprehension is empty, so len(codons) is never a
    # zero divisor here.
    codon_df = pd.DataFrame([
        {
            'Codon': codon,
            'Count': count,
            'Frequency (%)': f"{(count / len(codons)) * 100:.2f}",
        }
        for codon, count in codon_counts.most_common()
    ])

    structure, mfe_str = _predict_structure_with_rnafold(mrna_seq)

    print("\n" + "="*50)
    print("           mRNA Analysis Results")
    print("="*50)

    print("\n--- Codon Usage Analysis ---")
    if not codon_df.empty:
        print(codon_df.to_string(index=False))
    else:
        print("No codons found.")

    print("\n\n--- Transcript Features ---")
    _print_positions_in_rows("Start Codon Positions (codon #)", start_positions)
    _print_positions_in_rows("Stop Codon Positions (codon #)", stop_positions)
    print("\n▶ Interpretation:")
    print("  - Start Codons (AUG) mark where protein synthesis begins.")
    print("  - Stop Codons (UAA, UAG, UGA) signal the end of translation.")
    print("  - Multiple start/stop sites can indicate alternative reading frames or complex regulation.")

    print("\n\n--- Secondary Structure Prediction (via RNAfold) ---")

    # Wrap the long structure string into indented lines of fixed width.
    print("  Structure (2D):")
    row_size = 70  # characters per printed line
    for i in range(0, len(structure), row_size):
        print("    " + structure[i:i + row_size])

    print(f"  MFE: {mfe_str} kcal/mol")
    print("\n▶ Interpretation:")
    print("  - This is a prediction of how the mRNA molecule folds on itself. A lower (more negative) MFE")
    print("    suggests a more stable and more likely structure.")
    print("  - The structure can affect translation efficiency and mRNA stability.")
    print("\n" + "="*50)

    return None

def translate_mrna(mrna_seq):
    """Translate an mRNA sequence starting at its first AUG.

    The first occurrence of 'AUG' (in any reading frame) sets the frame.
    Translation runs from there to the first in-frame stop codon, or to the
    end of the sequence when no in-frame stop codon exists.

    Args:
        mrna_seq: The mRNA sequence as a string (upper-case, with U not T).

    Returns:
        The protein sequence as a ``str``. NOTE: when no start codon is found
        the function returns a human-readable message instead of a protein, so
        callers that pass the result on should check for that case.
    """
    start_pos = mrna_seq.find('AUG')

    # Without a start codon no protein can be made.
    if start_pos == -1:
        return "No start codon (AUG) was found in the mRNA sequence, so no protein could be translated."

    reading_frame = mrna_seq[start_pos:]
    # Drop a trailing partial codon (1-2 bases). It is ignored by translation
    # anyway, and trimming explicitly avoids Biopython's "Partial codon"
    # warning, which is documented as possibly becoming an error.
    whole_codons_len = len(reading_frame) - len(reading_frame) % 3
    translatable_mrna = Seq(reading_frame[:whole_codons_len])

    # to_stop=True ends translation at the first in-frame stop codon and
    # leaves the stop symbol out of the result.
    protein_seq = translatable_mrna.translate(to_stop=True)

    # Defensive check only: the frame starts with AUG, which translates to M,
    # so an empty result is not expected here.
    if not protein_seq:
        return "A start codon was found, but it was immediately followed by a stop codon. No protein was formed."

    return str(protein_seq)

def protein_analysis(protein):
    """Report the molecular weight and length of a protein sequence.

    Any '*' (stop) symbols are removed before the weight and length are
    computed.

    Returns:
        A dict with the formatted 'Molecular Weight' (in Da), the 'Length' in
        residues and a short 'Interpretation'.
    """
    residues = protein.replace('*', '')
    mass_da = molecular_weight(residues, seq_type='protein')

    report = {}
    report['Molecular Weight'] = f"{mass_da:.2f} Da"
    report['Length'] = len(residues)
    report['Interpretation'] = "This is the mass of one molecule of the protein in Daltons (Da). It's a key property used in lab techniques like SDS-PAGE."
    return report

# ==============================================================================
# Part 2: Multiple Sequence Alignment (MSA) & BLAST Functions
# ==============================================================================

def concatenate_fastas(file_paths, output_path="combined.fasta"):
    """Merge the records of several FASTA files into one multi-FASTA file.

    Files that cannot be parsed are reported with a warning and skipped. In
    the notebook the inputs are files uploaded to Google Colab, e.g.
    "IFT140_HUMAN_NCBI.fasta", "IFT140_Zebrafish_NCBI.fasta" and
    "IFT140_House_Mouse_NCBI.fasta".

    Args:
        file_paths: Iterable of paths to the input FASTA files.
        output_path: Where the combined FASTA file is written.

    Returns:
        ``output_path`` on success, or ``None`` when no record could be read
        from any input (nothing is written in that case).
    """
    collected = []
    for fasta_path in file_paths:
        try:
            # Materialise the parser's iterator first so that a file failing
            # part-way through contributes no records at all.
            parsed = list(SeqIO.parse(fasta_path, "fasta"))
        except Exception as err:
            print(f"Warning: Could not parse file {fasta_path}. Error: {err}")
        else:
            collected.extend(parsed)

    if collected:
        # SeqIO.write takes care of the FASTA formatting of every record.
        with open(output_path, "w") as handle:
            SeqIO.write(collected, handle, "fasta")
        print(f"Combined {len(collected)} sequences into {output_path}")
        return output_path

    print("Error: No valid sequences could be read from the uploaded files.")
    return None

def run_msa_with_clustalw(fasta_file):
    """Align the sequences of a multi-FASTA file with ClustalW.

    ClustalW is run as an external command in protein mode and writes its
    result to "temp_msa.aln" in the working directory, which is then read
    back with AlignIO.

    Args:
        fasta_file: Path to the multi-sequence FASTA file.

    Returns:
        The alignment object read from the ClustalW output, or ``None`` when
        the tool is missing, fails, or its output cannot be read (a message is
        printed in each case).
    """
    print("⏳ Running ClustalW alignment...")
    aln_path = "temp_msa.aln"
    clustalw_args = [
        'clustalw',
        f'-INFILE={fasta_file}',
        '-ALIGN',
        '-TYPE=PROTEIN',
        f'-OUTFILE={aln_path}',
    ]

    result = None
    try:
        # A list of arguments (no shell) keeps the file name from being
        # interpreted by a shell.
        subprocess.run(clustalw_args, check=True, capture_output=True, text=True)
        print("✅ ClustalW alignment complete.")
        result = AlignIO.read(aln_path, "clustal")
    except FileNotFoundError:
        print("❌ ERROR: 'clustalw' command not found. Make sure ClustalW is installed and in your system's PATH.")
    except subprocess.CalledProcessError as err:
        print("❌ ClustalW failed. This is often due to an improperly formatted input file.")
        print(f"   Error details: {err.stderr}")
    except Exception as err:
        print(f"❌ An unexpected error occurred: {err}")
    return result

def display_highlighted_msa(alignment):
    """Report, display and interpret the conserved columns of an alignment.

    The function
      1. finds the fully conserved columns (one identical residue in every
         sequence and no gap),
      2. prints their 0-based indices in aligned rows,
      3. renders the alignment as HTML with conserved residues highlighted
         in a colour per amino acid, and
      4. prints an interpretation including the overall percent identity,
         computed as conserved columns / alignment length.

    Args:
        alignment: The multiple sequence alignment, as returned by
            ``run_msa_with_clustalw``.

    Returns:
        The list of conserved column indices (0-based). An empty list is
        returned, after printing a message, when ``alignment`` is empty or
        ``None``.
    """
    if not alignment:
        print("Cannot display alignment: Invalid alignment object.")
        return []

    num_sequences = len(alignment)
    alignment_length = alignment.get_alignment_length()

    # A column is conserved when it holds one distinct residue and no gap.
    conserved_indices = []
    for i in range(alignment_length):
        column = alignment[:, i]
        if len(set(column)) == 1 and '-' not in column:
            conserved_indices.append(i)
    # Set copy for O(1) membership tests while rendering every residue.
    conserved_lookup = set(conserved_indices)

    # Print the conserved positions in aligned rows.
    print("\n--- Conserved Positions ---")
    if not conserved_indices:
        print("  No conserved positions found.")
    else:
        # Pad every number to the width of the largest so columns line up.
        max_len = len(str(max(conserved_indices)))

        row_size = 20  # numbers per printed row
        for i in range(0, len(conserved_indices), row_size):
            row_of_numbers = [f"{p:<{max_len}}" for p in conserved_indices[i:i+row_size]]
            print("  " + "  ".join(row_of_numbers))

    # A soft pastel colour palette for the 20 standard amino acids.
    color_palette = {
        'A': '#FFB3BA',  # Light Pink
        'V': '#FFDFBA',  # Light Apricot
        'I': '#FFFFBA',  # Light Yellow
        'L': '#BAFFC9',  # Light Mint
        'M': '#BAE1FF',  # Light Blue
        'F': '#E0BBE4',  # Light Lavender
        'Y': '#FFC8DD',  # Light Carnation Pink
        'W': '#D9D9D9',  # Light Grey
        'S': '#C9FFE5',  # Light Seafoam Green
        'T': '#D4F0F0',  # Light Powder Blue
        'C': '#FFE8D6',  # Light Peach
        'P': '#F2C6DE',  # Light Lilac
        'N': '#D1E7DD',  # Light Sage
        'Q': '#A9DEF9',  # Light Sky Blue
        'K': '#F4C2C2',  # Baby Pink
        'R': '#E6E6FA',  # Lavender
        'H': '#B5EAD7',  # Light Pastel Green
        'G': '#FDFFB6',  # Light Cream
        'D': '#FFD6A5',  # Light Tangerine
        'E': '#EAEAEA',  # Light Concrete
    }

    print("\n")
    print(f"Found {len(conserved_indices)} conserved positions. Highlighting each amino acid type with a unique color.")
    print("\n")

    # Collect the HTML fragments in a list and join once at the end; repeated
    # string concatenation is quadratic for long alignments.
    html_parts = ["<pre style='font-family: monospace; line-height: 1.4; font-size: 14px;'>"]

    for record in alignment:
        # Sequence ID, padded so the residues of all rows start in one column.
        html_parts.append(f"<b>{record.id.ljust(15)}</b>")

        for i, aa in enumerate(str(record.seq)):
            if i in conserved_lookup:
                # Unknown symbols fall back to grey.
                color = color_palette.get(aa.upper(), '#D9D9D9')
                html_parts.append(
                    f"<span style='background-color: {color}; color: black; font-weight: bold;'>{aa}</span>"
                )
            else:
                html_parts.append(f"<span>{aa}</span>")

        html_parts.append("<br>")

    html_parts.append("</pre>")

    # Render the final HTML in the notebook.
    display(HTML("".join(html_parts)))

    num_conserved = len(conserved_indices)

    # Overall percent identity: share of fully conserved columns.
    percent_identity = (num_conserved / alignment_length) * 100 if alignment_length > 0 else 0

    # Conclusion text depends on the degree of conservation.
    if percent_identity > 70:
        conclusion = "This is a very high degree of conservation, strongly suggesting a close evolutionary relationship. These proteins likely share a critical, fundamental function and structure."
    elif percent_identity > 40:
        conclusion = "This is a moderate degree of conservation, suggesting a shared ancestry and the presence of common functional or structural domains, though significant divergence has also occurred."
    else:
        conclusion = "This is a low degree of conservation, suggesting a more distant evolutionary relationship. While the conserved regions may point to a shared ancient function, the proteins have diverged considerably over time."

    print("\n" + "="*50)
    print("      Multiple Sequence Alignment Interpretation")
    print("="*50)

    print(f"\n▶ Alignment Overview:")
    print(f"  - Number of Sequences: {num_sequences}")
    print(f"  - Total Alignment Length: {alignment_length} residues")

    print(f"\n▶ Conservation Analysis:")
    print(f"  - Number of Conserved Positions: {num_conserved}")
    print(f"  - Overall Percent Identity: {percent_identity:.2f}%")

    print(f"\n▶ Biological Significance:")
    print("  Conserved regions are powerful indicators of biological importance. Because these")
    print("  amino acids have resisted change over evolutionary time, they often correspond to:")
    print("    - The active site of an enzyme.")
    print("    - Key structural components required for the protein to fold correctly.")
    print("    - Sites for binding to other molecules (like DNA or other proteins).")

    print(f"\n▶ Conclusion:")
    print(f"  {conclusion}")
    print("="*50)

    # Return the indices as documented, so callers can analyse them further.
    return conserved_indices


def _print_blast_interpretation_guide():
    """Print the explanation of the columns of the BLAST results table."""
    print("\n" + "="*60)
    print("          How to Interpret Your BLAST Results")
    print("="*60)
    print("BLAST has compared your sequence against a massive database of known proteins.")
    print("Here's what the columns in the table below mean:\n")

    print("  ▶ Max Score:")
    print("    - A measure of the quality of the alignment. Higher scores are better.")

    print("\n  ▶ E-value (Expect Value):")
    print("    - The most important metric for significance. It's the number of hits")
    print("      you would expect to see by pure chance. The closer to zero, the")
    print("      more significant the match is. (e.g., e-100 is better than e-10).")

    print("\n  ▶ Query Cover (%):")
    print("    - What percentage of your original sequence length is covered by the")
    print("      alignment with the database sequence.")

    print("\n  ▶ Per. Ident (%):")
    print("    - The percentage of amino acids that are identical between your sequence")
    print("      and the database sequence in the aligned region.")

    print("\n  ▶ Accession:")
    print("    - The unique ID for the matching sequence in the NCBI database. You can")
    print("      use this to look up more information about that protein.")
    print("="*60 + "\n")


def blast_and_parse_sequence(sequence_str, top_n=10):
    """Run an online blastp search and return the best hits as a DataFrame.

    The query is sent to NCBI (program "blastp", database "nr"). Every HSP of
    every hit becomes one row; rows are sorted by score, highest first, and
    the first ``top_n`` are kept. A guide explaining the columns is printed.

    Args:
        sequence_str: The protein query sequence as a string.
        top_n: Maximum number of rows to return.

    Returns:
        A pandas DataFrame with the columns 'Description', 'Max Score',
        'E-value', 'Query Cover (%)', 'Per. Ident (%)' and 'Accession', or
        ``None`` when the query is empty or the search fails (a message is
        printed).
    """
    # read_fasta() returns None on failure; do not send an empty query to NCBI.
    if not sequence_str:
        print("BLAST search failed: 'no query sequence was provided'")
        return None

    print("⏳ Running BLAST search... this may take a moment.")
    try:
        # blastp for protein sequences against the non-redundant protein database.
        result_handle = NCBIWWW.qblast("blastp", "nr", sequence_str)

        results_data = []
        try:
            # The parser reads lazily from the handle, so the handle must stay
            # open until the loop has finished.
            for blast_record in NCBIXML.parse(result_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        percent_identity = (hsp.identities / hsp.align_length) * 100
                        query_cover = ((hsp.query_end - hsp.query_start + 1) / blast_record.query_length) * 100

                        # NOTE(review): hsp.score appears to be the raw score,
                        # while NCBI's web "Max Score" column looks like the
                        # bit score (hsp.bits) -- confirm which is intended.
                        results_data.append({
                            'Description': alignment.title,
                            'Max Score': hsp.score,
                            'E-value': hsp.expect,
                            'Query Cover (%)': f"{query_cover:.2f}",
                            'Per. Ident (%)': f"{percent_identity:.2f}",
                            'Accession': alignment.accession
                        })
        finally:
            # Release the result stream even if parsing fails.
            result_handle.close()

        sorted_results = sorted(results_data, key=lambda x: x['Max Score'], reverse=True)
        df = pd.DataFrame(sorted_results[:top_n])

        print("✅ BLAST search complete.")
        _print_blast_interpretation_guide()

        return df

    except Exception as e:  # network/parse problems: report and return None
        print(f"BLAST search failed: '{e}'")
        return None





⏳ Installing required packages and tools...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
✅ Installation complete!


In [85]:
# Note: User must upload the file named "dna_sequence_NCBI.fasta" to the files
# on Google Colab to copy its path easily.
seq = read_fasta("/content/dna_sequence_NCBI.fasta")
seq

'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCGCCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGTTGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGACACATGCTAGCGCGTCGGGGTGGAGGCGTGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGACACATGCTACCGCGTCCAGGGGTGGAGGCGTGGCG'

In [86]:
dna_analysis(seq)

{'A': 224,
 'T': 107,
 'C': 398,
 'G': 180,
 'GC Content': '63.59%',
 'Length': 909,
 'Interpretation': 'High GC Content. Suggests high thermal stability or gene-rich regions.'}

In [87]:
mrna = transcribe_dna(seq)
mrna

'UAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCCUAACCCUAACCCUAACCCUAACCCUAACCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCCUAACCCUAACCCUAAACCCUAAACCCUAACCCUAACCCUAACCCUAACCCUAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCUAACCCCUAACCCUAACCCUAACCCUACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCCUAACCCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCUAACCCCUAACCCUAACCCUAACCCUAACCCUCGCGGUACCCUCAGCCGGCCCGCCCGCCCGGGUCUGACCUGAGGAGAACUGUGCUCCGCCUUCAGAGUACCACCGAAAUCUGUGCAGAGGACAACGCAGCUCCGCCCUCGCGGUGCUCUCCGGGUCUGUGCUGAGGAGAACGCAACUCCGCCGUUGCAAAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGACACAUGCUAGCGCGUCGGGGUGGAGGCGUGGCGCAGGCGCAGAGAGGCGCGCCGCGCCGGCGCAGGCGCAGAGACACAUGCUACCGCGUCCAGGGGUGGAGGCGUGGCG'

In [88]:
mrna_analysis(mrna)


           mRNA Analysis Results

--- Codon Usage Analysis ---
Codon  Count Frequency (%)
  AAC     40         13.20
  CCC     39         12.87
  CCU     33         10.89
  UAA     30          9.90
  CGC     21          6.93
  GCG     14          4.62
  ACC     11          3.63
  GGC      9          2.97
  CUA      8          2.64
  CCG      8          2.64
  GAG      8          2.64
  GCC      7          2.31
  AGG      7          2.31
  GCA      7          2.31
  AGA      6          1.98
  CAG      6          1.98
  GGU      4          1.32
  CGG      4          1.32
  CUC      3          0.99
  GUG      3          0.99
  ACA      3          0.99
  GCU      3          0.99
  CUG      2          0.66
  GGA      2          0.66
  UGC      2          0.66
  UCC      2          0.66
  GUC      2          0.66
  UAC      1          0.33
  AGC      1          0.33
  UGA      1          0.33
  GAA      1          0.33
  UUC      1          0.33
  GUA      1          0.33
  CCA      1      

In [89]:
protein = translate_mrna(mrna)
protein



'MLARRGGGVAQAQRGAPRRRRRRDTCYRVQGWRRG'

In [90]:
protein_analysis(protein)

{'Molecular Weight': '4111.70 Da',
 'Length': 35,
 'Interpretation': "This is the mass of one molecule of the protein in Daltons (Da). It's a key property used in lab techniques like SDS-PAGE."}

In [91]:
# User must upload these 3 files to Google Colab: "IFT140_HUMAN_NCBI.fasta",
# "IFT140_Zebrafish_NCBI.fasta", "IFT140_House_Mouse_NCBI.fasta"
file_paths= ["/content/IFT140_HUMAN_NCBI.fasta","/content/IFT140_House_Mouse_NCBI.fasta","/content/IFT140_Zebrafish_NCBI.fasta"]
multi_fasta_file = concatenate_fastas(file_paths, output_path="combined.fasta")

Combined 3 sequences into combined.fasta


In [92]:
alignment=run_msa_with_clustalw(multi_fasta_file)
alignment

⏳ Running ClustalW alignment...
✅ ClustalW alignment complete.


<<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 1481) at 7d908de2e450>

In [93]:
print(alignment)

Alignment with 3 rows and 1481 columns
MALYYDHQIEAPDAAGSPSFISWHPVHPFLAVAYISTTSTGSVD...DDP NP_055529.2
MALYFDHRIKAPDTPSSPSHITWHPTHPFLAVASISPSSGGNVD...NDP NP_001397248.1
MAVYFDHRAEAPDSSGVPVLISWHSSVCVLAVGSVNPSTGGCVD...H-- XP_021329488.2


In [94]:
conserved_indices = display_highlighted_msa(alignment)
conserved_indices


--- Conserved Positions ---
  0     1     3     5     6     10    11    12    17    20    22    23    29    30    31    40    42    43    45    46  
  48    49    50    52    56    58    59    61    66    68    69    70    75    76    78    79    80    81    82    83  
  93    98    102   103   106   109   111   112   115   118   121   122   125   126   130   133   135   136   139   140 
  142   143   145   146   148   152   153   155   156   157   158   160   161   162   164   165   169   170   172   173 
  174   175   176   177   178   180   181   182   183   184   185   186   188   189   190   192   198   199   201   202 
  203   205   208   211   212   214   217   218   219   222   228   231   238   242   244   245   247   248   249   252 
  254   255   261   262   263   265   267   270   271   272   273   274   275   278   281   282   287   290   291   294 
  296   297   301   303   304   306   311   313   314   317   320   321   322   323   324   325   328   329   330   333 
  3


      Multiple Sequence Alignment Interpretation

▶ Alignment Overview:
  - Number of Sequences: 3
  - Total Alignment Length: 1481 residues

▶ Conservation Analysis:
  - Number of Conserved Positions: 796
  - Overall Percent Identity: 53.75%

▶ Biological Significance:
  Conserved regions are powerful indicators of biological importance. Because these
  amino acids have resisted change over evolutionary time, they often correspond to:
    - The active site of an enzyme.
    - Key structural components required for the protein to fold correctly.
    - Sites for binding to other molecules (like DNA or other proteins).

▶ Conclusion:
  This is a moderate degree of conservation, suggesting a shared ancestry and the presence of common functional or structural domains, though significant divergence has also occurred.


In [95]:
sequence_str = read_fasta("/content/IFT140_House_Mouse_NCBI.fasta")
sequence_str

'MALYFDHRIKAPDTPSSPSHITWHPTHPFLAVASISPSSGGNVDIYLEQGEPVPDTHIERSFQATSLCWHPTRLILAIGWETGEVIMFNKQDKEQHTVPLPHTTDIAILSWSTSGSCLVSGDKLGVLLLWRLDQRGRVQGTPLLKHEYGKALTHCIFRLPPPGEDLVQLAKAAVSGDEKALDMFNWRKSSFGSFLKTGSQEGLSFFVSLMDGTVHYVDEKGKTAQVASTDSSIQTLFYIERREALVVVTENLLLSLYVVTPEGEAEEVMKVKLSGKTGCRADITLIEGSLLVTAIGEPVLRFWDLERGENYILSLQEKFGFEKGESINCVCFCKAKGLLAAGTNKGRVAMWKKVPSFPNGRGAEGKDMWALQTPTELEGNITQIKWGSRKNLLAVSSTESVSILSEQAMSSHFHQQVAAVQISPSLVNVSFLSTGGTHSLHTDMHISGVFATKDAVAVWNGKQVAIFEPSGSTLRNAGTFLCETSVLAMHEESIYTVEPNRLQVRTWQGTVKQLLLFSETEGSPCFLDVCGTFLVAGTDLAHFKSFDLSRREAKVHCSCKNLAQLVPDVGSITSLRCNANGNKISILLSKVNNSPDSKIYIYDVEMDTVNVFNFTTGQIGQIQTLPFNEPPTNETRSFMDKSLAGYTPVNHFWDQSEPRLFVCEALQEAPGAQPQAVDKQPRVEEGTCHKEEVLILSFFASEEHGFLLHDSFPRPSTYQSLLGMEVPHYYFTKKPGEADKEDRVDSGYYHIPQMVAKRPLRDFVGLEDCDKSTRDAMLNFSFFVTIGDMDEAFKSIKLIKSEAVWENMARMCVKTQRLDVAKVCLGNMGHARGARALREAEQEPELEARVAMLAIQLGMLEEAEQLYKKCKRYDLLNKFYQASDQWQKAVEVAELHDRVHLRTTYYNYAKHLEASADCGQALSYYEKSDTHRFEVPRMLSEDLQSLELYINRMKDKTLWRWWAQYLESQAEMDTALRYYELAQDYFSLVRIHCFQGNIQ

In [96]:
blast_and_parse_sequence(sequence_str)

⏳ Running BLAST search... this may take a moment.
✅ BLAST search complete.

          How to Interpret Your BLAST Results
BLAST has compared your sequence against a massive database of known proteins.
Here's what the columns in the table below mean:

  ▶ Max Score:
    - A measure of the quality of the alignment. Higher scores are better.

  ▶ E-value (Expect Value):
    - The most important metric for significance. It's the number of hits
      you would expect to see by pure chance. The closer to zero, the
      more significant the match is. (e.g., e-100 is better than e-10).

  ▶ Query Cover (%):
    - What percentage of your original sequence length is covered by the
      alignment with the database sequence.

  ▶ Per. Ident (%):
    - The percentage of amino acids that are identical between your sequence
      and the database sequence in the aligned region.

  ▶ Accession:
    - The unique ID for the matching sequence in the NCBI database. You can
      use this to look up more

Unnamed: 0,Description,Max Score,E-value,Query Cover (%),Per. Ident (%),Accession
0,ref|NP_001397248.1| intraflagellar transport p...,7961.0,0.0,100.0,100.0,NP_001397248
1,gb|AAI39006.1| Intraflagellar transport 140 ho...,7954.0,0.0,100.0,99.93,AAI39006
2,gb|ABB72790.1| intraflagellar transport protei...,7935.0,0.0,100.0,99.66,ABB72790
3,ref|NP_001397246.1| intraflagellar transport p...,7927.0,0.0,100.0,99.19,NP_001397246
4,"gb|EDL22395.1| mCG17645, isoform CRA_a [Mus mu...",7911.0,0.0,100.0,98.85,EDL22395
5,ref|XP_021041711.1| intraflagellar transport p...,7856.0,0.0,100.0,98.57,XP_021041711
6,ref|XP_021076831.1| intraflagellar transport p...,7829.0,0.0,100.0,98.29,XP_021076831
7,ref|XP_052050583.1| intraflagellar transport p...,7738.0,0.0,100.0,97.34,XP_052050583
8,ref|XP_028616848.1| intraflagellar transport p...,7725.0,0.0,100.0,97.13,XP_028616848
9,ref|XP_034363377.1| intraflagellar transport p...,7717.0,0.0,100.0,96.93,XP_034363377
