In [6]:
pip install biopython



In [7]:
pwd

'/content'

## 2.3.4.4

In [8]:
from Bio import SeqIO

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

# Use SeqIO to parse the FASTA file
for record in SeqIO.parse(fasta_file, "fasta"):
    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Description: {record.description}")

ID: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYQGTPSFFRNVVWLIKKNNTYPTIKRSYNNTNQEDLLILWGIHHSNDAAEQTKLYQNPTTYISVGTSTLNQRLVPKIATRSKVNGQSGRMDFFWTMLKPNDAINFESNGNFIAPEYAYKIVKKGDSAIMKSEVEYGNCNTKCQTPIGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLRERRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTAASSLALAIMVAGLSLWMCSNGSLQCRICIKLESE
Description: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
ID: EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVS

In [9]:
import re
import pandas as pd

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

def get_HA_df(fasta_file):
    # Define the regex patterns to search for within the header lines
    pattern_epid = r'EPI_ISL_\d+'
    pattern_date = r'\d{4}'

    # Initialize lists to store identifiers, dates, and sequences
    identifiers = []
    dates = []
    sequences = []

    # Open the FASTA file for reading
    with open(fasta_file, "r") as file:
        current_header = ""
        current_sequence = ""
        for line in file:
            line = line.strip()
            # Check if the line is a header line
            if line.startswith('>'):
                # If we already have a header and sequence, process them
                if current_header and current_sequence:
                    epi_match = re.search(pattern_epid, current_header)
                    date_match = re.search(pattern_date, current_header)

                    # Print debug information
                    print(f"Header: {current_header}")
                    print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
                    print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

                    if epi_match and date_match:
                        identifiers.append(epi_match.group(0))
                        dates.append(date_match.group(0))
                        sequences.append(current_sequence)

                    current_sequence = ""

                current_header = line
            else:
                current_sequence += line
        # Process the last sequence
        if current_header and current_sequence:
            epi_match = re.search(pattern_epid, current_header)
            date_match = re.search(pattern_date, current_header)

            # Print debug information
            print(f"Header: {current_header}")
            print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
            print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

            if epi_match and date_match:
                identifiers.append(epi_match.group(0))
                dates.append(date_match.group(0))
                sequences.append(current_sequence)

    # Create a DataFrame
    data = {'EPI_ID': identifiers, 'Date': dates, 'H5_Sequence': sequences}
    df = pd.DataFrame(data)

    # Identify the putative glycosylation sites
    gls = r'[Nn][A-Za-z][SsTt]'
    gls_no = []

    for sequence in df["H5_Sequence"]:
        matches = re.findall(gls, sequence)
        match_count = len(matches)
        gls_no.append(match_count)

    df["#GLS"] = gls_no

    return df

# Example usage
df = get_HA_df(fasta_file)
print(df)


Header: >EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
EPI Match: EPI_ISL_63488
Date Match: 2219
Header: >EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
EPI Match: EPI_ISL_63498
Date Match: 2220
Header: >EPI499480|HA|A/chicken/Jiangsu/WJ/2009|EPI_ISL_153622|AGG86722|A_/_H5N1
EPI Match: EPI_ISL_153622
Date Match: 4994
Header: >EPI499481|HA|A/chicken/China/JX/2011|EPI_ISL_153623|AGG86723|A_/_H5N1
EPI Match: EPI_ISL_153623
Date Match: 4994
Header: >EPI499479|HA|A/chicken/Jiangsu/XZ/2010|EPI_ISL_153621|AGG86721|A_/_H5N1
EPI Match: EPI_ISL_153621
Date Match: 4994
Header: >EPI164205|HA|A/muscovy_duck/Vietnam/NCVD-69/2007|EPI_ISL_24600||A_/_H5N1
EPI Match: EPI_ISL_24600
Date Match: 1642
Header: >EPI499482|HA|A/chicken/Shandong/k0603/2010|EPI_ISL_153624|AGG86724|A_/_H5N1
EPI Match: EPI_ISL_153624
Date Match: 4994
Header: >EPI164216|HA|A/Anhui/1/2005|EPI_ISL_24603||A_/_H5N1
EPI Match: EPI_ISL_24603
Date Match: 1642
Header: >EPI49948

 ## 2.3.4

In [24]:
df[df["EPI_ID"] =="EPI_ISL_108029" ]


Unnamed: 0,EPI_ID,Date,H5_Sequence,#GLS


In [25]:
from Bio import SeqIO

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

# Use SeqIO to parse the FASTA file
for record in SeqIO.parse(fasta_file, "fasta"):
    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Description: {record.description}")

ID: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYQGTPSFFRNVVWLIKKNNTYPTIKRSYNNTNQEDLLILWGIHHSNDAAEQTKLYQNPTTYISVGTSTLNQRLVPKIATRSKVNGQSGRMDFFWTMLKPNDAINFESNGNFIAPEYAYKIVKKGDSAIMKSEVEYGNCNTKCQTPIGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLRERRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTAASSLALAIMVAGLSLWMCSNGSLQCRICIKLESE
Description: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
ID: EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVS

In [27]:
import re
import pandas as pd

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

def get_HA_df(fasta_file):
    # Define the regex patterns to search for within the header lines
    pattern_epid = r'EPI_ISL_\d+'
    pattern_date = r'\d{4}'

    # Initialize lists to store identifiers, dates, and sequences
    identifiers = []
    dates = []
    sequences = []

    # Open the FASTA file for reading
    with open(fasta_file, "r") as file:
        current_header = ""
        current_sequence = ""
        for line in file:
            line = line.strip()
            # Check if the line is a header line
            if line.startswith('>'):
                # If we already have a header and sequence, process them
                if current_header and current_sequence:
                    epi_match = re.search(pattern_epid, current_header)
                    date_match = re.search(pattern_date, current_header)

                    # Print debug information
                    print(f"Header: {current_header}")
                    print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
                    print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

                    if epi_match and date_match:
                        identifiers.append(epi_match.group(0))
                        dates.append(date_match.group(0))
                        sequences.append(current_sequence)

                    current_sequence = ""

                current_header = line
            else:
                current_sequence += line
        # Process the last sequence
        if current_header and current_sequence:
            epi_match = re.search(pattern_epid, current_header)
            date_match = re.search(pattern_date, current_header)

            # Print debug information
            print(f"Header: {current_header}")
            print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
            print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

            if epi_match and date_match:
                identifiers.append(epi_match.group(0))
                dates.append(date_match.group(0))
                sequences.append(current_sequence)

    # Create a DataFrame
    data = {'EPI_ID': identifiers, 'Date': dates, 'H5_Sequence': sequences}
    df = pd.DataFrame(data)

    # Identify the putative glycosylation sites
    gls = r'[Nn][A-Za-z][SsTt]'
    gls_no = []

    for sequence in df["H5_Sequence"]:
        matches = re.findall(gls, sequence)
        match_count = len(matches)
        gls_no.append(match_count)

    df["#GLS"] = gls_no

    return df

# Example usage
df = get_HA_df(fasta_file)
print(df)


Header: >EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
EPI Match: EPI_ISL_63488
Date Match: 2219
Header: >EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
EPI Match: EPI_ISL_63498
Date Match: 2220
Header: >EPI499480|HA|A/chicken/Jiangsu/WJ/2009|EPI_ISL_153622|AGG86722|A_/_H5N1
EPI Match: EPI_ISL_153622
Date Match: 4994
Header: >EPI499481|HA|A/chicken/China/JX/2011|EPI_ISL_153623|AGG86723|A_/_H5N1
EPI Match: EPI_ISL_153623
Date Match: 4994
Header: >EPI499479|HA|A/chicken/Jiangsu/XZ/2010|EPI_ISL_153621|AGG86721|A_/_H5N1
EPI Match: EPI_ISL_153621
Date Match: 4994
Header: >EPI164205|HA|A/muscovy_duck/Vietnam/NCVD-69/2007|EPI_ISL_24600||A_/_H5N1
EPI Match: EPI_ISL_24600
Date Match: 1642
Header: >EPI499482|HA|A/chicken/Shandong/k0603/2010|EPI_ISL_153624|AGG86724|A_/_H5N1
EPI Match: EPI_ISL_153624
Date Match: 4994
Header: >EPI164216|HA|A/Anhui/1/2005|EPI_ISL_24603||A_/_H5N1
EPI Match: EPI_ISL_24603
Date Match: 1642
Header: >EPI49948

In [28]:
df.head()

Unnamed: 0,EPI_ID,Date,H5_Sequence,#GLS
0,EPI_ISL_63488,2219,MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,8
1,EPI_ISL_63498,2220,MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,8
2,EPI_ISL_153622,4994,MEKIMFLLAIASLVKGDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,7
3,EPI_ISL_153623,4994,MEKIMLLLAIISLVKGDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,7
4,EPI_ISL_153621,4994,MEKIMFLLAIVSLVKGDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,7


## 2.3.4.4b

In [12]:
from Bio import SeqIO

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

# Use SeqIO to parse the FASTA file
for record in SeqIO.parse(fasta_file, "fasta"):
    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Description: {record.description}")

ID: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYQGTPSFFRNVVWLIKKNNTYPTIKRSYNNTNQEDLLILWGIHHSNDAAEQTKLYQNPTTYISVGTSTLNQRLVPKIATRSKVNGQSGRMDFFWTMLKPNDAINFESNGNFIAPEYAYKIVKKGDSAIMKSEVEYGNCNTKCQTPIGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLRERRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTAASSLALAIMVAGLSLWMCSNGSLQCRICIKLESE
Description: EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
ID: EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
Sequence: MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVS

In [13]:
import re
import pandas as pd

# Specify the path to your FASTA file
fasta_file = "gisaid_epiflu_sequence (2_3_4).fasta"

def get_HA_df(fasta_file):
    # Define the regex patterns to search for within the header lines
    pattern_epid = r'EPI_ISL_\d+'
    pattern_date = r'\d{4}'

    # Initialize lists to store identifiers, dates, and sequences
    identifiers = []
    dates = []
    sequences = []

    # Open the FASTA file for reading
    with open(fasta_file, "r") as file:
        current_header = ""
        current_sequence = ""
        for line in file:
            line = line.strip()
            # Check if the line is a header line
            if line.startswith('>'):
                # If we already have a header and sequence, process them
                if current_header and current_sequence:
                    epi_match = re.search(pattern_epid, current_header)
                    date_match = re.search(pattern_date, current_header)

                    # Print debug information
                    print(f"Header: {current_header}")
                    print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
                    print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

                    if epi_match and date_match:
                        identifiers.append(epi_match.group(0))
                        dates.append(date_match.group(0))
                        sequences.append(current_sequence)

                    current_sequence = ""

                current_header = line
            else:
                current_sequence += line
        # Process the last sequence
        if current_header and current_sequence:
            epi_match = re.search(pattern_epid, current_header)
            date_match = re.search(pattern_date, current_header)

            # Print debug information
            print(f"Header: {current_header}")
            print(f"EPI Match: {epi_match.group(0) if epi_match else 'No match'}")
            print(f"Date Match: {date_match.group(0) if date_match else 'No match'}")

            if epi_match and date_match:
                identifiers.append(epi_match.group(0))
                dates.append(date_match.group(0))
                sequences.append(current_sequence)

    # Create a DataFrame
    data = {'EPI_ID': identifiers, 'Date': dates, 'H5_Sequence': sequences}
    df = pd.DataFrame(data)

    # Identify the putative glycosylation sites
    gls = r'[Nn][A-Za-z][SsTt]'
    gls_no = []

    for sequence in df["H5_Sequence"]:
        matches = re.findall(gls, sequence)
        match_count = len(matches)
        gls_no.append(match_count)

    df["#GLS"] = gls_no

    return df

# Example usage
df = get_HA_df(fasta_file)
print(df)

Header: >EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
EPI Match: EPI_ISL_63488
Date Match: 2219
Header: >EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
EPI Match: EPI_ISL_63498
Date Match: 2220
Header: >EPI499480|HA|A/chicken/Jiangsu/WJ/2009|EPI_ISL_153622|AGG86722|A_/_H5N1
EPI Match: EPI_ISL_153622
Date Match: 4994
Header: >EPI499481|HA|A/chicken/China/JX/2011|EPI_ISL_153623|AGG86723|A_/_H5N1
EPI Match: EPI_ISL_153623
Date Match: 4994
Header: >EPI499479|HA|A/chicken/Jiangsu/XZ/2010|EPI_ISL_153621|AGG86721|A_/_H5N1
EPI Match: EPI_ISL_153621
Date Match: 4994
Header: >EPI164205|HA|A/muscovy_duck/Vietnam/NCVD-69/2007|EPI_ISL_24600||A_/_H5N1
EPI Match: EPI_ISL_24600
Date Match: 1642
Header: >EPI499482|HA|A/chicken/Shandong/k0603/2010|EPI_ISL_153624|AGG86724|A_/_H5N1
EPI Match: EPI_ISL_153624
Date Match: 4994
Header: >EPI164216|HA|A/Anhui/1/2005|EPI_ISL_24603||A_/_H5N1
EPI Match: EPI_ISL_24603
Date Match: 1642
Header: >EPI49948

In [14]:
print(df["EPI_ID"])

0       EPI_ISL_63488
1       EPI_ISL_63498
2      EPI_ISL_153622
3      EPI_ISL_153623
4      EPI_ISL_153621
            ...      
577     EPI_ISL_27610
578     EPI_ISL_27611
579     EPI_ISL_16356
580     EPI_ISL_16357
581     EPI_ISL_11263
Name: EPI_ID, Length: 582, dtype: object


In [15]:
##code

In [16]:
from Bio import SeqIO
sequence_Record=[record for record in SeqIO.parse("gisaid_epiflu_sequence (2_3_4).fasta","fasta")]
for  record in sequence_Record:
  print(record.id)
  print(record.seq)
  print(len(record))

EPI221918|HA|A/duck/Nong-Khai/Thailand/KU-56/2007|EPI_ISL_63488|ABY67919|A_/_H5N1
MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYQGTPSFFRNVVWLIKKNNTYPTIKRSYNNTNQEDLLILWGIHHSNDAAEQTKLYQNPTTYISVGTSTLNQRLVPKIATRSKVNGQSGRMDFFWTMLKPNDAINFESNGNFIAPEYAYKIVKKGDSAIMKSEVEYGNCNTKCQTPIGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLRERRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTAASSLALAIMVAGLSLWMCSNGSLQCRICIKLESE
572
EPI222002|HA|A/environment/Hunan/1-8/2007|EPI_ISL_63498|ACZ05883|A_/_H5N1
MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLDGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKANPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSDHEASSGVSAACPYQGTPSFFRNVVWLIKKNNTYPTIKKSYNNTNQEDLLILWGIHHSNDAAEQTKLYQNPTTYISVGTSTLNQRLVPKIATRSKVNGQSGRMDFFWTVLKPNDAINFESNGNFIAPE