## This is just a playground. See finished standalone python script at ~/GitHub/YBD/genename_to_bedfile.py

In [33]:
import subprocess, json
import pandas as pd

gene_name = 'SAUR'
level = 4447
# Function to parse the 'Attributes' column in gff file and convert it into a dictionary of key-value pairs
def parse_attributes(attribute_str):
    attribute_pairs = attribute_str.split(';')
    attribute_dict = {}
    for pair in attribute_pairs:
        if len(pair.split('=')) == 2:
            key, value = pair.split('=')
        else:
            key = pair.split('=')[0]
            value = ''
        attribute_dict[key] = value
    return attribute_dict
anno_file = "/bioinfo2/palm/ref/orthoDBv11/dura{}_Results/dura{}.og.annotations".format(level, level)
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
cmd = "curl 'https://data.orthodb.org/current/search?query={}&level={}' -L -o {}_at{}.dat".format(gene_name, level, gene_name, level)
print(cmd)
subprocess.run(cmd, shell=True, check=True)
with open("{}_at{}.dat".format(gene_name, level), "r") as json_file:
    data_dict = json.load(json_file)

df = pd.read_csv(anno_file, sep='\t', header=0)
selected_rows = df[df['ODB_OG'].isin(data_dict['data'])]
selected_rows = selected_rows.rename(columns={'#query': 'query'})
selected_rows['gene'] = selected_rows['query'].str.split('-').str[0]

# Filter rows where value from column A is in the list
filtered_rows = df_gff[df_gff['ID'].isin(selected_rows['gene'])]

# write bed file
df = filtered_rows[['Seqid', 'Start','End','ID']]
df_no_duplicates = df.drop_duplicates()
df_sorted = df_no_duplicates.sort_values(by=['Seqid', 'Start'])
# add pound(#) to the first column name - convention of bed file
df_sorted.columns = ['#' + df_sorted.columns[0] if i == 0 else col for i, col in enumerate(df_sorted.columns)]
bedfile = gene_name + '_at' + level + '_dura.bed'
df_sorted.to_csv(bedfile, sep='\t', index=False)


curl 'https://data.orthodb.org/current/search?query=SAUR&level=4447' -L -o SAUR_at4447.dat


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5896    0  5896    0     0   3234      0 --:--:--  0:00:01 --:--:--  3234


# Write bed file based on a list of transcript names

In [3]:
import pandas as pd
def parse_attributes(attribute_str):
    attribute_pairs = attribute_str.split(';')
    attribute_dict = {}
    for pair in attribute_pairs:
        if len(pair.split('=')) == 2:
            key, value = pair.split('=')
        else:
            key = pair.split('=')[0]
            value = ''
        attribute_dict[key] = value
    return attribute_dict
with open('/bioinfo2/palm/accession/Apriyanto2023/DE_table.genes', 'r') as f:
    genes = [line.strip() for line in f]
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)

# Filter rows where value from column A is in the gene list
filtered_rows = df_gff[df_gff['ID'].isin(genes)]

# write bed file
df = filtered_rows[['Seqid', 'Start','End','ID']]
df_no_duplicates = df.drop_duplicates()
df_sorted = df_no_duplicates.sort_values(by=['Seqid', 'Start'])
# add pound(#) to the first column name - convention of bed file
df_sorted.columns = ['#' + df_sorted.columns[0] if i == 0 else col for i, col in enumerate(df_sorted.columns)]
bedfile = 'Apriyanto2023_DE100.bed'
df_sorted.to_csv(bedfile, sep='\t', index=False)
# the rest of the analysis please see https://www.notion.so/Association-studies-e780ce66173e44849149291fed4c0e52?pvs=4

# To look up any SNP from the genome (slow)

In [7]:
# first let's work with just one coord
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq

# eventually this will be a script that only takes the coord, and go look the exon and REF ALT by it self.
# * means stop codon. When this is changed we need to look further down the sequence until another stop codon right?

coord = 'chr1_8180009'
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'

CHR, POS = coord.split('_')
POS = int(POS)
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
filtered_df = df_gff[(df_gff['Type'] == "exon") & (df['Seqid'] == CHR) & (df['Start'] <= POS) & (df['End'] >= POS)]
gene = filtered_df['ID'].iloc[0]


In [83]:
in_file = '/bioinfo2/palm/analysis/batch1/batch1_chr1_nosex.assoc.qassoc.tab'
in_df = pd.read_csv(in_file, sep = '\t', header=0)
in_df['coord'] = 'chr' + in_df['CHR'].astype(str) + '_' + in_df['BP'].astype(str)

In [90]:
in_df1 = in_df[in_df['P'] < 0.01]

In [121]:
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch1.vcf'
vcf = pysam.VariantFile(vcf_file)






<pysam.libcbcf.VariantRecordInfo object at 0x7fbd4c3daee0>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAIRED', 'PAIREDR', 'PAO', 'PQA', 'PQR', 'PRO', 'QA', 'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF', 'SAP', 'SAR', 'SRF', 'SRP', 'SRR', 'TYPE', 'technology.NovaSeqXPlus']
('snp',)
<pysam.libcbcf.VariantRecordInfo object at 0x7fbd96c08940>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAIRED', 'PAIREDR', 'PAO', 'PQA', 'PQR', 'PRO', 'QA', 'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF', 'SAP', 'SAR', 'SRF', 'SRP', 'SRR', 'TYPE', 'technology.NovaSeqXPlus']
('snp',)
<pysam.libcbcf.VariantRecordInfo object at 0x7fbd4cb9ea00>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAI

In [124]:
for variant in vcf.fetch():
    print(variant.info["TYPE"])

('snp',)
('snp',)
('snp',)
('ins',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('complex',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('del',)
('ins', 'ins')
('snp',)
('snp',)
('snp', 'complex', 'ins', 'del')
('snp',)
('snp',)
('complex', 'snp', 'ins')
('snp',)
('del', 'mnp')
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins', 'snp')
('snp',)
('ins', 'ins', 'ins', 'snp')
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins',)
('complex',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins', 'ins', 'del', 'ins')
('del',)
('snp',)
('del',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp', 'complex')
('snp',)
('complex', 'snp')
('snp',)
('snp',)
('snp',)
('snp', 'complex')
('snp',)
('comple

In [129]:
a='TCTTCTTCTTCTTCTCCTCCTGCTTCTCTCCCTCTCCTCGGCAATCTCACTCCTCGCTGCCCTTATATTTTTTCTCTCTCTCTCTCTATCTCTATTCCCATTTTGCCCCTCCAACGGATACATCGATAATAGAAGAGAGATAGTGACTCCGGGGAAAGGGAAGGGGGAGAAAGGGGAACAAAAAGAGAGAGAGAAGAGTTTGAGTGTGTGCGAGTGGGGGAGAGAGTTTTTGAGGGTGGTACCTGTGAGAAGGGAGGGGAGGGGGTTTATTTGGTTGCCGAGAGGAGGGGGAAAATAATAAGAAGAAAAGAAGGGGATTTTTATTTTATTTTCATCTGGTGAGGGATTTTTTTCTTTTTTTTTTCTTCTTTTCTTCTTCTTCTTGTTTGAGGGAGGGGAAGAGAGGAGAAGAAGGGAAAGAAGAGGGGCCATGGCGGCGCACGTTAGCGTGAACCATGGCTTGCTGGCGCTGTTACTGGTGCTGATCTGCTTATCGTGCGCCGAGGGTGCGGTGCGGGTCGCGAGGCAGCGGCTGGAGGTGCGGCGCCACCTGAAGCGATTCAATAAACCCGCCGTCAAGAGCATCAAG'

In [133]:
exon_seq_var = MutableSeq(str(a))

In [134]:
exon_seq_var[38:39] = 'G'

# Look up SNPs from a designated bed file (fast)

In [137]:
import pysam
import pandas as pd
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO

out_file = "/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100_batch1.summary"
bed_file = '/bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed'
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch1.vcf'
in_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch1.assoc.qassoc.tab'
in_df = pd.read_csv(in_file, sep = '\t', header=0)
in_df = in_df[in_df['P'] < 0.01]
in_df['coord'] = 'chr' + in_df['CHR'].astype(str) + '_' + in_df['BP'].astype(str)
bed_column_names = ['Seqid', 'Start', 'End', 'Genes']
bed_df = pd.read_csv(bed_file, sep='\t',comment='#', names=bed_column_names)
vcf = pysam.VariantFile(vcf_file, "r")
genome_fa = '/bioinfo2/palm/ref/dura/dura_ref.fasta'
anno_file = "/bioinfo2/palm/ref/dura/dura_orthoDBv11_4447_3193_33090.og.annotations"
anno_df = pd.read_csv(anno_file, sep='\t', header=0)
anno_df = anno_df.rename(columns={'#query': 'query'})

def coord2codon(coord, bed_df = bed_df, vcf = vcf, genome_fa = genome_fa, anno_df = anno_df):
    # this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
    print(coord)
    CHR, POS = coord.split('_')
    POS = int(POS)
    # look up gene from bed file
    snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
    if len(snp_df) > 0: # if snp is in the region we care
        start = int(snp_df['Start'].iloc[0])
        end = int(snp_df['End'].iloc[0])
        gene = snp_df['Genes'].iloc[0]
        anno_df = anno_df[anno_df['query'] == gene]
        if len(anno_df) == 1:
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        elif len(anno_df) == 0:
            og = 'NA'
            gene_name = 'NA'
        else:
            print(gene, " has more than one og, recording the first one")
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        # look up REF and ALT from vcf
        for variant in vcf.fetch():
            TYPE = variant.info["TYPE"]
            type_out = '|'.join([i for i in TYPE])
            if variant.chrom == CHR:
                if variant.pos == POS:
                    REF = variant.ref
                    ALT = variant.alts[0]
                    break
        # get the codon and amino acid
        for record in SeqIO.parse(genome_fa,'fasta'):
            if record.id == CHR:
                exon_seq = record.seq[start-1:end]
                exon_seq_var = MutableSeq(str(exon_seq))
                protein_seq = exon_seq.translate()
                snp_position = POS - start
                # Find the codon's start position
                codon_start = snp_position - (snp_position % 3)
                # Extract the codons
                codon = exon_seq[codon_start: codon_start+3]
                exon_seq_var[snp_position: snp_position+len(ALT)] = ALT
                modified_codon = exon_seq_var[codon_start: codon_start+3]    
                original_aa = codon.translate()
                modified_aa = modified_codon.translate()
                break

        print(';'.join([coord, gene, og, gene_name, type_out, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]))
        return [gene, og, gene_name, type_out, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]

# new column names
col_names = ['gene', 'og', 'gene_name', 'Type','REF', 'ALT', 'codon', 'modified_codon', 'original_aa', 'modified_aa']

# Generate lists and split them into separate columns
in_df[col_names] = in_df['coord'].apply(coord2codon).apply(pd.Series)
    
        
    

chr1_82720377




chr1_82720377_Egu005022-mRNA-1_110056at4447_carboxyl-terminal peptidase precursor_snp_C_G_CTC_CTG_L_L
chr1_86754740
chr1_86754740_Egu005191-mRNA-2_136145at4447_jacalin-related lectin 3_snp_C_G_CCC_CCG_P_P
chr1_86758000
chr1_86758000_Egu005191-mRNA-3_136145at4447_jacalin-related lectin 3_snp_C_G_CAG_GAG_Q_E
chr1_93213103


KeyboardInterrupt: 

In [None]:
        for record in SeqIO.parse(genome_fa,'fasta'):
            if record.id == CHR:
                exon_seq = record.seq[start-1:end]
                exon_seq_snp = MutableSeq(str(exon_seq))
                protein_seq = exon_seq.translate()
                snp_position = POS - start
                if TYPE == 'snp'
                if exon_seq[snp_position] == REF:
                    exon_seq_snp[snp_position] = ALT
                else:
                    print("REF wrong. Break")
                    break
                # Find the codon's start position
                codon_start = snp_position - (snp_position % 3)
                # Extract the codons
                codon = exon_seq[codon_start: codon_start+3]
                # If needed, replace the SNP in the codon
                modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
                # Translate the original and modified codon to amino acids
                original_aa = codon.translate()
                modified_aa = modified_codon.translate()
                break


In [69]:
                # if the variant type if SNP
                if (len(TYPE) == 1) & (TYPE[0] == 'snp'):
                    if exon_seq[snp_position] == REF:
                        exon_seq_var[snp_position] = ALT
                    else:
                        print("REF wrong. Break")
                        break
                    # Translate the original and modified codon to amino acids
                else:

Unnamed: 0,CHR,SNP,BP,NMISS,BETA,SE,R2,T,P,X10,coord,gene,og,gene_name,REF,ALT,codon,modified_codon,original_aa,modified_aa
0,16,.,20788022,48,17.55,7.255,0.1128,2.418,0.01961,,chr16_20788022,Egu030167-mRNA-1,130847at4447,Heat stress transcription factor B-2c,T,C,"(T, G, G)","[C, G, G]",(W),[R]
1,16,.,32010076,47,-18.43,6.919,0.1362,-2.664,0.01069,,chr16_32010076,Egu030617-mRNA-1,128811at4447,transmembrane protein 53-A,C,T,"(G, G, C)","[G, G, T]",(G),[G]


In [62]:
# this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
coord = 'chr3_69323261'
CHR, POS = coord.split('_')
POS = int(POS)
# look up gene from bed file
snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
start = int(snp_df['Start'].iloc[0])
end = int(snp_df['End'].iloc[0])
gene = snp_df['Genes'].iloc[0]
anno_df = anno_df[anno_df['query'] == gene]
if len(anno_df) == 1:
    og = anno_df['ODB_OG'].iloc[0]
    gene_name = anno_df['Description'].iloc[0]
elif len(anno_df) == 0:
    og = 'NA'
    gene_name = 'NA'
else:
    print(gene, " has more than one og, recording the first one")
    og = anno_df['ODB_OG'].iloc[0]
    gene_name = anno_df['Description'].iloc[0]
# look up REF and ALT from vcf
for variant in vcf:
    if variant.chrom == CHR:
        print
        if variant.pos == POS:
            REF = variant.ref
            ALT = variant.alts[0]
            print(REF, ALT)
            break
# get the codon and amino acid
for record in SeqIO.parse(genome_fa,'fasta'):
    if record.id == CHR:
        exon_seq = record.seq[start-1:end]
        exon_seq_snp = MutableSeq(str(exon_seq))
        protein_seq = exon_seq.translate()
        snp_position = POS - start
        if exon_seq[snp_position] == REF:
            exon_seq_snp[snp_position] = ALT
        else:
            "REF wrong. Break"
            break
        # Find the codon's start position
        codon_start = snp_position - (snp_position % 3)
        # Extract the codon
        codon = exon_seq[codon_start: codon_start+3]
        # If needed, replace the SNP in the codon
        modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
        # Translate the original and modified codon to amino acids
        original_aa = codon.translate()
        modified_aa = modified_codon.translate()
        break
print(';'.join([coord, gene, og, gene_name, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]))
return [gene, og, gene_name, REF, ALT, codon, modified_codon, original_aa, modified_aa]


KeyboardInterrupt: 

In [57]:
in_df

Unnamed: 0,CHR,SNP,BP,NMISS,BETA,SE,R2,T,P,X10,coord
0,1,.,72869566,49,-25.33,7.081,0.21400,-3.577,0.000819,,chr1_72869566
1,1,.,72869593,50,18.00,8.701,0.08189,2.069,0.043940,,chr1_72869593
2,1,.,73734371,50,29.14,10.540,0.13750,2.766,0.008038,,chr1_73734371
3,1,.,82723240,50,27.56,8.345,0.18520,3.303,0.001815,,chr1_82723240
4,1,.,92918856,52,26.25,9.942,0.12240,2.641,0.011010,,chr1_92918856
...,...,...,...,...,...,...,...,...,...,...,...
64,15,.,63227901,51,-13.79,6.668,0.08024,-2.068,0.043980,,chr15_63227901
65,15,.,63228447,48,-19.94,6.750,0.15950,-2.955,0.004920,,chr15_63228447
66,16,.,14913334,37,26.96,7.669,0.26090,3.515,0.001235,,chr16_14913334
67,16,.,20788022,48,17.55,7.255,0.11280,2.418,0.019610,,chr16_20788022


In [40]:
def coord2codon(coord, bed_df = bed_df, vcf = vcf, genome_fa = genome_fa, anno_df = anno_df):
    # this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
    CHR, POS = coord.split('_')
    POS = int(POS)
    # look up gene from bed file
    snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
    start = int(snp_df['Start'].iloc[0])
    end = int(snp_df['End'].iloc[0])
    gene = snp_df['Genes'].iloc[0]
    anno_df = anno_df[anno_df['query'] == gene]
    og = str(anno_df['ODB_OG'])
    gene_name = str(anno_df['Description'])
    # look up REF and ALT from vcf
    for variant in vcf:
        if variant.chrom == CHR:
            if variant.pos == POS:
                REF = variant.ref
                ALT = variant.alts
                break
            else:
                print(coord, "Wrong POS")
    # get the codon and amino acid
    for record in SeqIO.parse(genome_fa,'fasta'):
        if record.id == CHR:
            exon_seq = record.seq[start-1:end]
            exon_seq_snp = MutableSeq(str(exon_seq))
            protein_seq = exon_seq.translate()
            snp_position = POS - start
            if exon_seq[snp_position] == REF:
                exon_seq_snp[snp_position] = ALT[0]
            else:
                "REF wrong. Break"
                break
            # Find the codon's start position
            codon_start = snp_position - (snp_position % 3)
            # Extract the codon
            codon = exon_seq[codon_start: codon_start+3]
            print(coord, codon)
            # If needed, replace the SNP in the codon
            modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
            # Translate the original and modified codon to amino acids
            original_aa = codon.translate()
            modified_aa = modified_codon.translate()
            break
    return [gene, og, gene_name, REF, ALT, codon, modified_codon, original_aa, modified_aa]

In [41]:
a = coord2codon('chr1_72869566')

UnboundLocalError: local variable 'REF' referenced before assignment

In [22]:
a[51:54]

'GGT'

In [3]:
import pysam
import pandas as pd
vcf = pysam.VariantFile("/bioinfo2/palm/analysis/yield/Apriyanto2023/Apriyanto2023_DE100.vcf", "r") 
for variant in vcf:
    if variant.chrom == CHR:
        if variant.pos == POS:
            REF = variant.ref
            ALT = variant.alts
            break

In [1]:
import sys
sys.executable

'/home/xiaorong/anaconda3/envs/bioinfo/bin/python'

## Get the gene name and amino acid give the coord of a SNP

In [94]:
# first let's work with just one coord
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq

# eventually this will be a script that only takes the coord, and go look the exon and REF ALT by it self.
# * means stop codon. When this is changed we need to look further down the sequence until another stop codon right?

coord = 'chr7_37218457'
bed_file = '/bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed'
vcf_file = '/bioinfo2/palm/analysis/' 

CHR, POS = coord.split('_')
POS = int(POS)
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
filtered_df = df_gff[(df_gff['Type'] == "exon") & (df['Seqid'] == CHR) & (df['Start'] <= POS) & (df['End'] >= POS)]
gene = filtered_df['ID'].iloc[0]

REF = 'G'
ALT = 'A'

filtered_rows = df_gff[df_gff['ID']==gene]
start = int(filtered_rows['Start'])
end = int(filtered_rows['End'])
            
for record in SeqIO.parse('/bioinfo2/palm/ref/dura/dura_ref.fasta','fasta'):
    if record.id == CHR:
        seq = record.seq[start-1:end]
        pos_ref = record.seq[int(POS)-1]
        # check if the length of this exon is divisible by three
        if len(seq)%3 != 0:
            print("warning: this exon is not dividable by 3")
            # check if the position matches with the REF provided
        if pos_ref == REF:
            # get the coding region containing this SNP
            if (int(POS)-start)%3 == 0:
                codon = record.seq[int(POS)-1:int(POS)+2]
                codon_m = MutableSeq(str(codon))
                codon_m[0] = ALT 
            elif (int(POS)-start)%3 == 1:
                codon = record.seq[int(POS)-2:int(POS)+1]
                codon_m = MutableSeq(str(codon))
                codon_m[1] = ALT
            elif (int(POS)-start)%3 == 2:
                codon = record.seq[int(POS)-3:int(POS)]
                codon_m = MutableSeq(str(codon))
                codon_m[2] = ALT
            # translate these codon
            aa = codon.translate()
            aa_m = codon_m.translate()
            print(codon, aa, codon_m, aa_m)
            break
        else:
            print("This coord in the genome is different from what was called in the vcf")
            break
    

GAA E AAA K


In [6]:
df_gff

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,ID,Name,Alias,Unnamed: 12,Parent,_AED,_QI,_eAED,score
0,egu.contig.1973,maker,gene,10123,11712,.,+,.,Egu032749,Egu032749,snap_masked-egu.contig.1973-processed-gene-0.0,,,,,,
1,egu.contig.1973,maker,mRNA,10123,11712,.,+,.,Egu032749-mRNA-1,Egu032749-mRNA-1,snap_masked-egu.contig.1973-processed-gene-0.0...,,Egu032749,0.06,0|0|0|0.5|1|1|2|0|472,0.04,
2,egu.contig.1973,maker,exon,10123,11511,.,+,.,Egu032749-mRNA-1:1,,,,Egu032749-mRNA-1,,,,
3,egu.contig.1973,maker,exon,11686,11712,.,+,.,Egu032749-mRNA-1:2,,,,Egu032749-mRNA-1,,,,
4,egu.contig.1973,maker,CDS,10123,11511,.,+,0,Egu032749-mRNA-1:cds,,,,Egu032749-mRNA-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615982,egu.contig.1149,maker,CDS,281718,281844,.,-,0,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615983,egu.contig.1149,maker,CDS,280181,280249,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615984,egu.contig.1149,maker,CDS,279893,280085,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615985,egu.contig.1149,maker,CDS,278640,279072,.,-,1,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,


In [107]:
from Bio.Seq import Seq, MutableSeq

seq = Seq('GACGTTAGGAGGAGCATATACTAACTTGTTTTTATAA')
seq.translate()
seq1 = MutableSeq(str(seq))

In [85]:
aa = Seq('MHLGHRHGHEKGKSPATTMPPKGWVGIRVGQEGEEQQRFEVPVDYLKHPLFMALLHQAKEEFGYEQSGAITIPCGVDHFRHVRDTINRDSAATAAAHHHSHLPHLAGLTLGGAYTNLFL')

In [109]:
seq = Seq('ACGTTAGGAGGAGCATATACTAACTTGTTTTTATAA')
seq.translate()

Seq('TLGGAYTNLFL*')

In [119]:
seq = Seq('ATGCACCTGGGCCACAGGCACGGGCATGAGAAGGGAAAGAGTCCGGCGACGACGATGCCACCTAAAGGGTGGGTGGGGATAAGGGTGGGGCAGGAAGGGGAGGAGCAGCAGCGGTTCGAGGTGCCTGTGGACTATCTGAAGCACCCGCTCTTCATGGCCTTGCTGCACCAGGCGAAGGAGGAGTTCGGATATGAGCAGAGCGGAGCCATCACCATTCCCTGCGGCGTCGATCACTTCCGTCATGTCCGGGACACCATTAACCGTGACTCCGCGGCCACCGCCGCCGCGCACCACCACAGCCACCTCCCTCACCTTGCCGGCT')
a = seq.translate()

In [118]:
len(seq)

322

In [120]:
a[-10:]

Seq('HSHLPHLAGC')

In [58]:
from collections import Counter
Counter(df_gff['Type'])

Counter({'gene': 32778,
         'mRNA': 45938,
         'exon': 196546,
         'CDS': 271952,
         'three_prime_UTR': 36830,
         'five_prime_UTR': 31943})

In [96]:
Counter(df_gff['Source'])

Counter({'maker': 615987})

In [11]:


# Extract values from column B and C from the filtered rows
result_values = filtered_rows[['Seqid', 'Start','End','ID']]

# Convert columns A and B to numeric
result_values['Start'] = pd.to_numeric(result_values['Start'])
result_values['End'] = pd.to_numeric(result_values['End'])

# Perform the operations and apply the conditions
result_values['Start'] = result_values['Start'] - flanking
result_values['Start'] = result_values['Start'].apply(lambda x: max(0, x))  # Set negative values to 0

result_values['End'] = result_values['End'] + flanking
bedfile = fileA.split('_blastp.tbl')[0] + '_' + str(flanking) + '.bed'
result_values.to_csv(bedfile, sep='\t', index=False)


Unnamed: 0,query,ODB_OG,evalue,score,COG_category,Description,GOs_mf,GOs_bp,EC,KEGG_ko,Interpro
75,Egu020222-mRNA-1,687at4447,2.14e-26,117.89,-,auxin-responsive protein SAUR71,-,GO:0009733,"6.5.1.1,4.1.1.15","ko00250,ko00410,ko00430,ko00650,ko03030,ko0341...",IPR003676
262,Egu026424-mRNA-1,2328at4447,4.810000000000001e-66,174.13,-,Auxin-responsive protein SAUR36,-,GO:0009733,-,-,IPR003676
663,Egu020226-mRNA-1,5943at4447,5.9600000000000004e-43,180.65,-,auxin-responsive protein SAUR71,-,GO:0009733,-,-,IPR003676
664,Egu020227-mRNA-1,5943at4447,0.0,148.897,-,auxin-responsive protein SAUR71,-,GO:0009733,-,-,IPR003676
3706,Egu011092-mRNA-1,33316at4447,4.29e-28,86.71,-,indole-3-acetic acid-induced protein ARG7,-,GO:0009733,-,"ko00010,ko01200",IPR003676
3707,Egu030210-mRNA-1,33316at4447,0.0,108.054,-,indole-3-acetic acid-induced protein ARG7,-,GO:0009733,-,"ko00010,ko01200",IPR003676
5388,Egu009584-mRNA-1,47410at4447,9.09e-29,110.76,-,auxin-responsive protein SAUR32,-,GO:0009733,2.7.2.3,"ko00010,ko00230,ko00710,ko00910,ko00941,ko0094...",IPR003676
9270,Egu019713-mRNA-1,75523at4447,0.0,78.6238,-,auxin-responsive protein SAUR41,-,GO:0009733,-,-,IPR003676
9271,Egu031500-mRNA-1,75523at4447,3.15e-23,73.9,-,auxin-responsive protein SAUR41,-,GO:0009733,-,-,IPR003676
9614,Egu020383-mRNA-2,77872at4447,0.0,110.458,-,auxin-induced protein 6B,-,GO:0009733,-,"ko00190,ko00270,ko04145,sita01100,sita01110",IPR003676


In [103]:
Counter(df_gff['Alias'])

Counter({'snap_masked-egu.contig.1973-processed-gene-0.0': 1,
         'snap_masked-egu.contig.1973-processed-gene-0.0-mRNA-1': 1,
         nan: 537271,
         'maker-egu.contig.1477-snap-gene-0.0': 1,
         'maker-egu.contig.1477-snap-gene-0.0-mRNA-1': 1,
         'maker-egu.contig.1784-snap-gene-0.4': 1,
         'maker-egu.contig.1784-snap-gene-0.4-mRNA-1': 1,
         'maker-egu.contig.1784-snap-gene-0.5': 1,
         'maker-egu.contig.1784-snap-gene-0.5-mRNA-1': 1,
         'snap_masked-egu.contig.1415-processed-gene-0.4': 1,
         'snap_masked-egu.contig.1415-processed-gene-0.4-mRNA-1': 1,
         'snap_masked-egu.contig.1962-processed-gene-0.2': 1,
         'snap_masked-egu.contig.1962-processed-gene-0.2-mRNA-1': 1,
         'maker-egu.contig.1962-snap-gene-0.2': 1,
         'maker-egu.contig.1962-snap-gene-0.2-mRNA-1': 1,
         'maker-egu.contig.1962-snap-gene-0.0': 1,
         'maker-egu.contig.1962-snap-gene-0.0-mRNA-1': 1,
         'maker-egu.contig.1588-snap-gen

In [99]:
df_gff

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,ID,Name,Alias,Unnamed: 12,Parent,_AED,_QI,_eAED,score
0,egu.contig.1973,maker,gene,10123,11712,.,+,.,Egu032749,Egu032749,snap_masked-egu.contig.1973-processed-gene-0.0,,,,,,
1,egu.contig.1973,maker,mRNA,10123,11712,.,+,.,Egu032749-mRNA-1,Egu032749-mRNA-1,snap_masked-egu.contig.1973-processed-gene-0.0...,,Egu032749,0.06,0|0|0|0.5|1|1|2|0|472,0.04,
2,egu.contig.1973,maker,exon,10123,11511,.,+,.,Egu032749-mRNA-1:1,,,,Egu032749-mRNA-1,,,,
3,egu.contig.1973,maker,exon,11686,11712,.,+,.,Egu032749-mRNA-1:2,,,,Egu032749-mRNA-1,,,,
4,egu.contig.1973,maker,CDS,10123,11511,.,+,0,Egu032749-mRNA-1:cds,,,,Egu032749-mRNA-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615982,egu.contig.1149,maker,CDS,281718,281844,.,-,0,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615983,egu.contig.1149,maker,CDS,280181,280249,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615984,egu.contig.1149,maker,CDS,279893,280085,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615985,egu.contig.1149,maker,CDS,278640,279072,.,-,1,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,


In [None]:


i = 0
protein_file = '/bioinfo/palm/ref/dura/dura_proteins.fasta'
with open('dura_proteins_{}_at{}.aa'.format(gene_name, level),'w') as fh:
    for record in SeqIO.parse(prselected_rowsotein_file,'fasta'):
        if record.id in ['query'].values:
            cluster = selected_rows.loc[selected_rows['query'] == record.id, 'ODB_OG']
            tag = selected_rows.loc[selected_rows['query'] == record.id, 'Description']
            if len(tag) == 1:
                record.description = ':'.join([cluster.iloc[0], tag.iloc[0]])
            else:
                print(record.id + 'appeared in more than one cluster?')

            SeqIO.write(record, fh, 'fasta')
            i += 1
print(i)