In [8]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Sample protein sequences, keyed by a short label and written in one-letter amino-acid code (replace these with your own sequences)
sequences = {
    "K401": "MSAEREIPAEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIAGKVYLFDKVFKPNASQEKVYNEAAKSIVTDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDSVKQGIIPRIVNDIFNHIYAMEVNLEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGATERFVSSPEDVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENLENQKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEKEKEKNARLKGKVEKLEIELARWRAGETVKAEEQINMEDLMEASTPNL",
    "Kif3": "MSSIRVVCRFRPQNKLELAQGGDSIVSIAPENDSVTINGSESNHSFSFDYVFPSNTTQRDVYDHAAKPVIEDIMAGYNGTLFVYGQTGSGKTFSMTGINDPNGDQELRGIVPRMIETVFEFISNADENIEFIVKASYIEIYMERIRDLLDTRKDNLKVREEKGKGVWVEGTSEVYIYREEDILDVINTGISNRAIAETRMNAESSRSHSIFILTIQQKNLKVGSIKTGKLYLVDLAGSEKISKTGAQGTTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEAETISTLRFGSRAKNIKNKAKINQERSAAELKILLSKAENEIENLKGYIKELETVSGVTVSNLKSSGSGSGSGSGSSSSSSGSSGGSGSGGSSNLSNSVNSTSNLNTSSNTSSSNVNANANVITTSVSAPTSPKDTELIKVLQEKCIQLEKQLFKKEEEKKEILEQLEQQQEQIQDKDQEIEGLNSMIESSNNINSLYQNSTNENSVLNVQLSELKLALEKSRFEATEQSLTIEGLNEENQSIKSQLEILKDRIAQSGDSSIASLVPSTPKSSAEMDPL",
    "kif5": 'MAETNNECSIKVLCRFRPLNQAEILRGDKFIPIFQGDDSVVIGGKPYVFDRVFPPNTTQEQVYHACAMQIVKDVLAGYNGTIFAYGQTSSGKTHTMEGKLHDPQLMGIIPRIARDIFNHIYSMDENLEFHIKVSYFEIYLDKIRDLLDVTKTNLSVHEDKNRVPFVKGCTERFVSSPEEILDVIDEGKSNRHVAVTNMNEHSSRSHSIFLINIKQENMETEQKLSGKLYLVDLAGSEKVSKTGAEGAVLDEAKNINKSLSALGNVISALAEGTKSYVPYRDSKMTRILQDSLGGNCRTTMFICCSPSSYNDAETKSTLMFGQRAKTIKNTASVNLELTAEQWKKKYEKEKEKTKAQKETIAKLEAELSRWRNGENVPETERLAGEEAALGAELCEETPVNDNSSIVVRIAPEERQKYEEEIRRLYKQLDDKDDEINQQSQLIEKLKQQMLDQEELLVSTRGDNEKVQRELSHLQSENDAAKDEVKEVLQALEELAVNYDQ',
    "ThTr": 'MSTTPLGEVQNTLATATEGTQVAALMTKDSESSVQVPTPGPAPAPAATPAGAAAAAAGDHDDDDEDKMSVRVAVRVRPLLPKENTVRARECIEFVPGSAQLVLGGQRSFTYDYVFDPAANQAALYTDAVSPLIDAFFDGYNATVLAYGQTGSGKTYTMGSGSNASLAAAQLGVIPRVIRAMYTRIAEGSADSAYSMRVSFLEIHNENIVDLFNPAVSTKTNSLAIREADGAITVAGITELAVNSADDLLDKLERGSISRTTASTLMNSESSRSHAIFSLILEQSPASGAGRLVSKFHFVDLAGSERLKRTKAVGDRLKEGININCGLLALGNVISALGDPKRKARHIPYRDSKLTRLLQDSLGGNSRTLMIACVSPADINFEETLNTLRYADRARKIKNKPIVNLDLNSAQLSSLKAHIKQLQLELAAARTSGGDALPPAPALPSDNVLKLQMENEVLQSAYGKLQAKVKSLTDSLLKVTAERDAYAHQLADAGLAPQTEGPAAAATIQGYLETIASLKAQLASRASSPELDASDSASSFALDDEELDDDAAPGALLDDASAAAVAAAAAAAAQLAQLESETVASENTFAAAQDKVAAKVAHLDRNISLKEELLAKIVESKKALNEMKEDYESKLASLQSDIATIEADKAAIEAE', 
    "AdPa": 'MAGGNVRVVCRFRPQNRVEKECGGKICVNFDDKNTVRVQSEHGTRKFVMDRMFNMTTPQGEVYSYAAKPVIKDVMDGYNGTIFAYGQTGAGKTHTMEGPSLRSSTDKGVIPRIIDEIFEYIEQADPSIVFTVKLSYVEIYCEKIRDLLNPSSVNLPVRESRRAGVYIEGVTEKYVSSQEETFQVMEEGNLSRATACTRMNAASSRSHSLFILTIGQKNQEGSTKSGKLYLVDLAGSEKISKTHATGSTLDEAKTINLSLTVLGQVINALTDRKKKHIPYRDSKLTRVLQESLGGNAKTTLIICCSCSSYNCEETVSTLRFGERAKQIKNKAKVNQELSVAELKVMLADAKKEIAQLKKYIKALESGLSPEEALLSVMADGGKVVMDEEDEGEKEEDEIEEEKDPDKPP',
    "BleSto": 'METSDDSSGNIRVLCRFRPLNEKEKAMAENICVDFGPDSKTVMMRAETENSEPLRFVFDYVFDPSTVQSKIYKVAASPIVDAVMQGYNGTIFAYGQTSSGKTFTMTGVITDPELMGIIPRMVGDVFAKINNADEHIEFTVKVGYCEIYMEKIKDLLDPRKNNLKIHEDKARGVYIEDLTESYVTNDREVYELMRIGTNNREVAYTHMNAGSSRSHSLFCVTITQTNKLDMSTKSGKFYLVDLAGSEKVGKTGAEGKRLEEAKNINKSLTALGQVINALTDGKSSHVPYRDSKLTRVLQDSLGGNSKTTLIIACSPHPYNEAETLSTLRFGIRAKSIKNKAKVNKEYTVAELKLMLTKAKEEIMLKDKRISVLEQAVNKSGILIGGDEESSTKDESELEISKVSVYDDMIQELEDTRARLSEEVDMNSKLRQEIEIKTKENEEIKGDYDFLNKHIGSLQEKLTTTESTLAEKEEHIEQLIMAQEYLRNDISSLNEKKNELEQTIIKKDEEILKWKTQPREAPSTVNIKDLEHQIQVLRAQLQEEQKKSKELIARCEHLERRWNEQASNAIPNL',
    "Acsu": 'MSSIRVVCRFRPQNKIELAQGGCSIVEVNDGLTVNIKGNEGANHSFTFDRIYTEKNSQKDVYDDAAKPVIEDIMQGYNGTIFVYGQTSSGKTHTMQGPSIDDPELKGVIPRMINTVFDCIKKADENIEFIVKASYIEIYMEKIRDLLDIRKDNLKVREEKAKGVWVEGTTEVYIYREDDILEVMRTGQANRAIASTNMNAESSRSHSIFILSIQQKNLKEGSLKNGKLYLVDLAGSEKVAKTGAQGLTLDEAKMINKSLSSLGNVINALTDGKSAHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEVETISTLRFGSRAKNIKNKAKINQERSAAELKILLSKAEKEIESLKVYIKEIESVSGVASSRPGASGSAGSGDSGDLQTLKEKCIALEKQLFQKEEEKKELAEQLDSQIIQLKDKEQELETHIHHIATLKEESARTSQFVNESDFLLGQLSEMKIQLEQVNYDATEQSLLIEELTTENTTLKAQLEEAAKGQSSTVVTSSPMTVEDASLSVTKHNEEWTEMEAQIKMLQRTKLSAGTPASPRGSLSAPAVNLALQEENNGMKDK',
    "AcSu2": 'MSSIRVVCRFRPQNKIELAQGGCSIIDVSDNQTVNIKGSESNHTFTFDRIYDERNSQKDVYDDAAKPVIEDIMLGYNGTIFVYGQTSSGKTHTMQGPSIDDAELKGVIPRMINTVFECINKADQNVEFIVKASYIEIYMEKIRDLLDVRKDNLRVREEKGKGVWVEGTTEVYIYREEDILEVMRTGQANRAIAETKMNAESSRSHSIFILSIQQKNLKEGSNKHGKLYLVDLAGSEKVAKTGAQGLTLDEAKMINKSLSSLGNVINSLTDGKSAHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEVETVSTLRFGNRAKNIKNKAKINQERSAAELKILLAKAEKEIESLKEYTKELESLTGVPSSKSILVTSSFTHPLTNQSIIEFGNNNDSNTSADMQQLKEKCISLEKLLFQKEEERKELLDQMDSSLIQLHDKEQELEVQAHLYIAMQHETTKLSSISQENETLTSQVTELKLSLEKKKYEAMEQSLLIEELNGEIAILKSDAPKTVSAKGSGVTSPIGIA',
    "Dipu": 'MSSIRVVCRFRPQNKNELAQGGCSIVSVAADNQSVSINGAESNHTFTFDRVFHDQCTQKEVYDDAAKPVIEDIMAGYNGTIFVYGQTSSGKTHTMQGPSIDDQELKGVIPRMIQTVFECISNADENIEFIVKASYIEIYMERIRDLLDTKKDNLKVREEKGKGVWVDGTTEAYIYGEHDILNVIRNGQANRAIAETKMNAESSRSHSIFILTIQQKNLKEGSVKTGKLYLVDLAGSEKISKTGAQGLTLDEAKMINKSLSSLGNVINALTDGKSAHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEAETVSTLRFGSRAKNIKNKAKINQERSAAELKILLSKAENEIESLKGYIKELESVGGAPRSSGSQSASSASANSENDQALKALQEKCISLEKQLFKKEEEKRELLEQLEIAQEQIQDKDQELEAMNALRDSANKYSSLYQTITNENSVLNTQISEMKLILDKSRFEISEQALTIEGLQSENQSIKSEFTILQEKFSKQSSNP',
    "HeAl": 'MSSIRVVCRFRPQNKIELAQGGCSVVDVADDQTVTIKGNESNHTFTFDRIYTEKNSQKDVYDDAAKPVIEDIMQGYNGTIFVYGQTSSGKTHTMQGPSIDDAELKGVIPRMINTVFDCITKADENIEFIVKASYIEIYMERIRDLLDVRKDNLKVREEKGKGVWVDGTTEVYIYREDDILEVMRAGQANRAIAETKMNAESSRSHSIFILTIQQKNLKEGSNKSGKLYLVDLAGSEKIAKTGAQGLTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNETETLSTLRFGNRAKSIKNKAKINQERSAAELKILLSKAEKEIESLKDYIKELETVSGVPHSKIGNNLDTDKSADVQGLKEKCIQLEKLLFQKEEEKKELSEQLDTISIQLQDKEQELETQTHQVTSLKDEASKYVSLSNENDILSAQLTEIKLLLEKKNYESVEQTLVIEELSAENASIKSQLQEKIESSKGVGGIGDHYTPS',
    "NaGr": 'MTEEFGGSNIQVVCRFRPLNTLEKQMGGGEVVDFDGKTCKLNNKNGKHDFTFDHIFKSGSKQGDLFNVVGKPVVEDIFKGYNGTVFVYGQTGSGKSYTMMGPNEDHKGYCTDSNLKGLIPRMIEEIFDRVENSDPDIEFTIQISYIEIYLEKIRDLLDPHHQDLKIKEDRESGRGVYIKGATEEYVTSVEEVYNLLKVGAGNRVVSSTRMNDESSRSHSIFIITIGQKHLVNLDSKTGKLFLVDLAGSEKVKKTGASGQTLEEAKNINKSLSALGMVINALTDGVSKFVPYRDSKLTRLLQDSLGGNSRTTLIINCSMSSYNEDETLSTLRFGFRAKNIKNKPKVNRELSAKELQKLLDKAKEEIRELKEYTNGIEEELKIYKKGGIKTEIK',
    "TiLa": 'MSQQQSASIRVVCRFRPQNKIELAQGGCSVVNIPDNQTVQIKGAENNHTFTFDRVYSDRATQKDVYEDAAKPVIEDICSGYNGTIFVYGQTSSGKTHTMQGPSFEDAELKGVIPRMINTIFDCINKADENIEFIVKASFIEIYMERIRDLLDPVKNNLKIREEKGKGVWVDGTTEVYIYRENDILEVMRAGAANRAIGETKMNAESSRSHSIFILSIQQKNLLKGTVKTGKLYLVDLAGSEKISKTGAQGLTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNENETVSTLRFGSRAKNIKNKAKINQEMSAAELKEMLAKCNQEIESLKKYIQQLETLGTSTTGMSGSAALLNTTQSDDILKTIQEKCINLEKQLFQKEEDRKEIQDQLDQLLDQIQDKDQEIESQIQSIQSLQSHNTMLTNENNILTSQLSDFKLSLEKSKYLSSEQQILIDTLNNENSSIKSELKLLNEKILVDYSNSN',
    "B" : 'MSSIRVVCRFRPQNKLELAQGGDSIVSIAPENDSVTINGSESNHSFSFDYVFPSNTTQRDVYDHAAKPVIEDIMAGYNGTLFVYGQTGSGKTFSMTGINDPNGDQELRGIVPRMIETVFEFISNADENIEFIVKASYIEIYMERIRDLLDTRKDNLKVREEKGKGVWVEGTSEVYVSSPEDVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENLENQKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEKEKEKNARLKGKVEKLEIELARWRAGETVKAEEQINMEDLMEASTPNL',
    "C": "MSAEREIPAEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIAGKVYLFDKVFKPNASQEKVYNEAAKSIVTDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDSVKQGIIPRIVNDIFNHIYAMEVNLEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGATERFIYREEDILDVINTGISNRAIAETRMNAESSRSHSIFILTIQQKNLKVGSIKTGKLYLVDLAGSEKISKTGAQGTTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEAETISTLRFGSRAKNIKNVVCVNEELTAEEWKRRYEKEKEKNARLKGKVEKLEIELARWRAGETVKAEEQINMEDLMEASTPNL",
    "D": 'MSSIRVVCRFRPQNKLELAQGGDSIVSIAPENDSVTINGSESNHSFSFDYVFPSNTTQRDVYDHAAKPVIEDIMAGYNGTLFVYGQTGSGKTFSMTGINDPNGDQELRGIVPRMIETVFEFISNADENIEFIVKASYIEIYMERIRDLLDTRKDNLKVREEKGKGVWVEGTSEVYIYREEDILDVINTGISNRAIAETRMNAESSRSHSIFILTIQQKNLKVGSIKTGKLYLVDLAGSEKISKTGAQGTTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEAETISTLRFGSRAKNIKNVVCVNEELTAEEWKRRYEKEKEKNARLKGKVEKLEIELARWRAGETVKAEEQINMEDLMEASTPNL',
    "E":"MSAEREIPAEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIAGKVYLFDKVFKPNASQEKVYNEAAKSIVTDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDSVKQGIIPRIVNDIFNHIYAMEVNLEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGATERFVSSPEDVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENLENQKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNKAKINQERSAAELKILLSKAENEIENLKGYIKELETVSGVTVSNLKSSGSGSGSGSGSSSSSSGSSGGSGSGGSSNLSNSVNSTSNLNTSSNTSSSNVNANANVITTSVSAPTSPKDTELIKVLQEKCIQLEKQLFKKEEEKKEILEQLEQQQEQIQDKDQEIEGLNSMIESSNNINSLYQNSTNENSVLNVQLSELKLALEKSRFEATEQSLTIEGLNEENQSIKSQLEILKDRIAQSGDSSIASLVPSTPKSSAEMDPL",   
    "F": 'MSSIRVVCRFRPQNKLELAQGGDSIVSIAPENDSVTINGSESNHSFSFDYVFPSNTTQRDVYDHAAKPVIEDIMAGYNGTLFVYGQTGSGKTFSMTGINDPNGDQELRGIVPRMIETVFEFISNADENIEFIVKASYIEIYMERIRDLLDTRKDNLKVREEKGKGVWVEGTSEVYVSSPEDVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENLENQKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNKAKINQERSAAELKILLSKAENEIENLKGYIKELETVSGVTVSNLKSSGSGSGSGSGSSSSSSGSSGGSGSGGSSNLSNSVNSTSNLNTSSNTSSSNVNANANVITTSVSAPTSPKDTELIKVLQEKCIQLEKQLFKKEEEKKEILEQLEQQQEQIQDKDQEIEGLNSMIESSNNINSLYQNSTNENSVLNVQLSELKLALEKSRFEATEQSLTIEGLNEENQSIKSQLEILKDRIAQSGDSSIASLVPSTPKSSAEMDPL',
    "G": 'MSSIRVVCRFRPQNKLELAQGGDSIVSIAPENDSVTINGSESNHSFSFDYVFPSNTTQRDVYDHAAKPVIEDIMAGYNGTLFVYGQTGSGKTFSMTGINDPNGDQELRGIVPRMIETVFEFISNADENIEFIVKASYIEIYMERIRDLLDTRKDNLKVREEKGKGVWVEGTSEVYVSSPEDVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENLENQKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNKAKINQERSAAELKILLSKAENEIENLKGYIKELETVSGVTVSNLKSSGSGSGSGSGSSSSSSGSSGGSGSGGSSNLSNSVNSTSNLNTSSNTSSSNVNANANVITTSVSAPTSPKDTELIKVLQEKCIQLEKQLFKKEEEKKEILEQLEQQQEQIQDKDQEIEGLNSMIESSNNINSLYQNSTNENSVLNVQLSELKLALEKSRFEATEQSLTIEGLNEENQSIKSQLEILKDRIAQSGDSSIASLVPSTPKSSAEMDPL',
    
      # Add more protein sequences as needed
}

# The 20 standard amino acids in one-letter code; only these letters are tallied.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"


def calculate_fractions(sequence):
    """Return the fraction of ``sequence`` made up by each amino acid.

    Parameters
    ----------
    sequence : str
        Protein sequence in one-letter code. Matching is case-sensitive, so
        lowercase letters are not counted.

    Returns
    -------
    dict
        Maps every letter of ``amino_acids`` to ``count / len(sequence)``.
        Characters that are not in ``amino_acids`` still contribute to the
        length, so the values can sum to less than 1. An empty sequence
        yields 0.0 for every amino acid.
    """
    total_count = len(sequence)
    if total_count == 0:
        # Nothing to count: report an all-zero composition instead of
        # failing with ZeroDivisionError.
        return {aa: 0.0 for aa in amino_acids}
    return {aa: sequence.count(aa) / total_count for aa in amino_acids}

# Per-protein amino-acid composition: label -> {amino acid: fraction}
fractions_data = dict(
    (label, calculate_fractions(residues)) for label, residues in sequences.items()
)

# Single-panel interactive figure with one semi-transparent bar trace per protein
fig = make_subplots(rows=1, cols=1)

for protein_name, fractions in fractions_data.items():
    fig.add_trace(
        go.Bar(
            name=protein_name,
            x=[aa for aa in fractions],
            y=[fractions[aa] for aa in fractions],
            hoverinfo="name+x+y",
            # Semi-transparent so overlaid bars from different proteins stay visible
            opacity=0.75,
        )
    )

# Titles first, then the bar arrangement settings
fig.update_layout(
    title="Amino Acid Frequency Histogram",
    xaxis_title="Amino Acids",
    yaxis_title="Frequency (Fraction)",
)
fig.update_layout(
    barmode="overlay",
    bargap=0.15,
    bargroupgap=0.1,
)

fig.show()


In [9]:
import numpy as np


# The 20 standard amino acids in one-letter code; only these letters are tallied.
# NOTE: this constant and calculate_fractions repeat the definitions from the
# previous cell (In [8]); once this cell runs, these are the ones in effect.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"


def calculate_fractions(sequence):
    """Return the fraction of ``sequence`` made up by each amino acid.

    Parameters
    ----------
    sequence : str
        Protein sequence in one-letter code. Matching is case-sensitive, so
        lowercase letters are not counted.

    Returns
    -------
    dict
        Maps every letter of ``amino_acids`` to ``count / len(sequence)``.
        Characters that are not in ``amino_acids`` still contribute to the
        length, so the values can sum to less than 1. An empty sequence
        yields 0.0 for every amino acid.
    """
    total_count = len(sequence)
    if total_count == 0:
        # Nothing to count: report an all-zero composition instead of
        # failing with ZeroDivisionError.
        return {aa: 0.0 for aa in amino_acids}
    return {aa: sequence.count(aa) / total_count for aa in amino_acids}

def calculate_entropy(fractions):
    """Compute the Shannon entropy, in bits, of an amino-acid composition.

    ``fractions`` maps each amino acid to its fraction. Entries that are not
    strictly positive are skipped, so they contribute nothing to the sum.
    Returns ``-sum(p * log2(p))`` over the remaining entries.
    """
    weighted_logs = [p * np.log2(p) for p in fractions.values() if p > 0]
    return -sum(weighted_logs)

# Recompute each protein's composition, then derive one entropy value per protein
fractions_data = dict(
    (label, calculate_fractions(residues)) for label, residues in sequences.items()
)
entropy_data = dict(
    (label, calculate_entropy(composition)) for label, composition in fractions_data.items()
)

# Report every protein's entropy, formatted to four decimal places
for name in entropy_data:
    entropy = entropy_data[name]
    print(f"{name}: Entropy = {entropy:.4f}")


K401: Entropy = 4.0537
Kif3: Entropy = 3.9412
kif5: Entropy = 4.1152
ThTr: Entropy = 3.9123
AdPa: Entropy = 4.0871
BleSto: Entropy = 4.0753
Acsu: Entropy = 4.0340
AcSu2: Entropy = 4.0339
Dipu: Entropy = 4.0199
HeAl: Entropy = 4.0114
NaGr: Entropy = 4.0294
TiLa: Entropy = 4.0289
B: Entropy = 4.0745
C: Entropy = 4.0589
D: Entropy = 4.0653
E: Entropy = 3.9464
F: Entropy = 3.9540
G: Entropy = 3.9540
