## This is just a playground. See finished standalone python script at ~/GitHub/YBD/genename_to_bedfile.py

In [33]:
import subprocess, json
import pandas as pd

gene_name = 'SAUR'
level = 4447
# Function to parse the 'Attributes' column in gff file and convert it into a dictionary of key-value pairs
def parse_attributes(attribute_str):
    attribute_pairs = attribute_str.split(';')
    attribute_dict = {}
    for pair in attribute_pairs:
        if len(pair.split('=')) == 2:
            key, value = pair.split('=')
        else:
            key = pair.split('=')[0]
            value = ''
        attribute_dict[key] = value
    return attribute_dict
anno_file = "/bioinfo2/palm/ref/orthoDBv11/dura{}_Results/dura{}.og.annotations".format(level, level)
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
cmd = "curl 'https://data.orthodb.org/current/search?query={}&level={}' -L -o {}_at{}.dat".format(gene_name, level, gene_name, level)
print(cmd)
subprocess.run(cmd, shell=True, check=True)
with open("{}_at{}.dat".format(gene_name, level), "r") as json_file:
    data_dict = json.load(json_file)

df = pd.read_csv(anno_file, sep='\t', header=0)
selected_rows = df[df['ODB_OG'].isin(data_dict['data'])]
selected_rows = selected_rows.rename(columns={'#query': 'query'})
selected_rows['gene'] = selected_rows['query'].str.split('-').str[0]

# Filter rows where value from column A is in the list
filtered_rows = df_gff[df_gff['ID'].isin(selected_rows['gene'])]

# write bed file
df = filtered_rows[['Seqid', 'Start','End','ID']]
df_no_duplicates = df.drop_duplicates()
df_sorted = df_no_duplicates.sort_values(by=['Seqid', 'Start'])
# add pound(#) to the first column name - convention of bed file
df_sorted.columns = ['#' + df_sorted.columns[0] if i == 0 else col for i, col in enumerate(df_sorted.columns)]
bedfile = gene_name + '_at' + level + '_dura.bed'
df_sorted.to_csv(bedfile, sep='\t', index=False)


curl 'https://data.orthodb.org/current/search?query=SAUR&level=4447' -L -o SAUR_at4447.dat


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5896    0  5896    0     0   3234      0 --:--:--  0:00:01 --:--:--  3234


# Write bed file based on a list of transcript names

In [3]:
import pandas as pd
def parse_attributes(attribute_str):
    attribute_pairs = attribute_str.split(';')
    attribute_dict = {}
    for pair in attribute_pairs:
        if len(pair.split('=')) == 2:
            key, value = pair.split('=')
        else:
            key = pair.split('=')[0]
            value = ''
        attribute_dict[key] = value
    return attribute_dict
with open('/bioinfo2/palm/accession/Apriyanto2023/DE_table.genes', 'r') as f:
    genes = [line.strip() for line in f]
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)

# Filter rows where value from column A is in the gene list
filtered_rows = df_gff[df_gff['ID'].isin(genes)]

# write bed file
df = filtered_rows[['Seqid', 'Start','End','ID']]
df_no_duplicates = df.drop_duplicates()
df_sorted = df_no_duplicates.sort_values(by=['Seqid', 'Start'])
# add pound(#) to the first column name - convention of bed file
df_sorted.columns = ['#' + df_sorted.columns[0] if i == 0 else col for i, col in enumerate(df_sorted.columns)]
bedfile = 'Apriyanto2023_DE100.bed'
df_sorted.to_csv(bedfile, sep='\t', index=False)
# the rest of the analysis please see https://www.notion.so/Association-studies-e780ce66173e44849149291fed4c0e52?pvs=4

# To look up any SNP from the genome (slow)

In [7]:
# first let's work with just one coord
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq

# eventually this will be a script that only takes the coord, and go look the exon and REF ALT by it self.
# * means stop codon. When this is changed we need to look further down the sequence until another stop codon right?

coord = 'chr1_8180009'
gff_file = '/bioinfo2/palm/ref/dura/dura_ref.gff'

CHR, POS = coord.split('_')
POS = int(POS)
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
filtered_df = df_gff[(df_gff['Type'] == "exon") & (df['Seqid'] == CHR) & (df['Start'] <= POS) & (df['End'] >= POS)]
gene = filtered_df['ID'].iloc[0]


In [83]:
in_file = '/bioinfo2/palm/analysis/batch1/batch1_chr1_nosex.assoc.qassoc.tab'
in_df = pd.read_csv(in_file, sep = '\t', header=0)
in_df['coord'] = 'chr' + in_df['CHR'].astype(str) + '_' + in_df['BP'].astype(str)

In [90]:
in_df1 = in_df[in_df['P'] < 0.01]

In [121]:
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch1.vcf'
vcf = pysam.VariantFile(vcf_file)






<pysam.libcbcf.VariantRecordInfo object at 0x7fbd4c3daee0>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAIRED', 'PAIREDR', 'PAO', 'PQA', 'PQR', 'PRO', 'QA', 'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF', 'SAP', 'SAR', 'SRF', 'SRP', 'SRR', 'TYPE', 'technology.NovaSeqXPlus']
('snp',)
<pysam.libcbcf.VariantRecordInfo object at 0x7fbd96c08940>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAIRED', 'PAIREDR', 'PAO', 'PQA', 'PQR', 'PRO', 'QA', 'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF', 'SAP', 'SAR', 'SRF', 'SRP', 'SRR', 'TYPE', 'technology.NovaSeqXPlus']
('snp',)
<pysam.libcbcf.VariantRecordInfo object at 0x7fbd4cb9ea00>
['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB', 'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS', 'NUMALT', 'ODDS', 'PAI

In [124]:
for variant in vcf.fetch():
    print(variant.info["TYPE"])

('snp',)
('snp',)
('snp',)
('ins',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('complex',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('del',)
('ins', 'ins')
('snp',)
('snp',)
('snp', 'complex', 'ins', 'del')
('snp',)
('snp',)
('complex', 'snp', 'ins')
('snp',)
('del', 'mnp')
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins', 'snp')
('snp',)
('ins', 'ins', 'ins', 'snp')
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins',)
('complex',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('ins', 'ins', 'del', 'ins')
('del',)
('snp',)
('del',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp',)
('snp', 'complex')
('snp',)
('complex', 'snp')
('snp',)
('snp',)
('snp',)
('snp', 'complex')
('snp',)
('comple

In [164]:
in_df_adj

Unnamed: 0,CHR,SNP,UNADJ,GC,BONF,HOLM,SIDAK_SS,SIDAK_SD,FDR_BH,FDR_BY
,2,chr2_151950957,7.533e-42,1.863000e-07,5.036000e-35,5.036000e-35,inf,inf,5.036000e-35,8.206000e-34
,2,chr2_4763349,1.051e-38,2.387000e-17,7.026000e-32,7.026000e-32,inf,inf,2.387000e-32,3.889000e-31
,2,chr2_1902364,1.687e-37,7.533000e-42,1.128000e-30,1.128000e-30,inf,inf,2.387000e-32,3.889000e-31
,2,chr2_96141504,2.577e-37,5.654000e-34,1.723000e-30,1.723000e-30,inf,inf,2.387000e-32,3.889000e-31
,2,chr2_158324180,2.604e-37,6.030000e-34,1.741000e-30,1.741000e-30,inf,inf,2.387000e-32,3.889000e-31
...,...,...,...,...,...,...,...,...,...,...
,2,chr2_22292153,1,1.000000e+00,1.000000e+00,1.000000e+00,1.0,1.0,1.000000e+00,1.000000e+00
,2,chr2_18289297,1,1.000000e+00,1.000000e+00,1.000000e+00,1.0,1.0,1.000000e+00,1.000000e+00
,2,chr2_40755887,1,1.000000e+00,1.000000e+00,1.000000e+00,1.0,1.0,1.000000e+00,1.000000e+00
,2,chr2_106500245,1,1.000000e+00,1.000000e+00,1.000000e+00,1.0,1.0,1.000000e+00,1.000000e+00


In [155]:
in_df

Unnamed: 0,CHR,SNP,BP,NMISS,BETA,SE,R2,T,P
0,2,chr2_128,128,103,-37.200,22.27,0.026880,-1.67000,0.09794
1,2,chr2_133,133,103,17.600,57.94,0.000912,0.30370,0.76200
2,2,chr2_180,180,102,-31.630,21.00,0.022180,-1.50600,0.13520
3,2,chr2_185,185,102,-31.630,21.00,0.022180,-1.50600,0.13520
4,2,chr2_240,240,104,-1.024,43.88,0.000005,-0.02333,0.98140
...,...,...,...,...,...,...,...,...,...
6698015,2,chr2_160141624,160141624,62,-18.050,41.85,0.003091,-0.43130,0.66780
6698016,2,chr2_160147416,160147416,23,-26.320,155.30,0.001366,-0.16950,0.86700
6698017,2,chr2_160147423,160147423,23,-33.490,111.80,0.004256,-0.29960,0.76740
6698018,2,chr2_160147477,160147477,27,-18.850,156.60,0.000579,-0.12040,0.90510


In [178]:
merged_df1 = merged_df[merged_df['BONF'] < 0.01]


In [179]:
merged_df1.shape

(38451, 17)

In [181]:
in_file_adj

'/bioinfo2/palm/analysis/batch1/batch1_chr2_nosex.assoc.qassoc.adjusted.tab'

In [180]:
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch2.vcf'
for variant in vcf.fetch():
    TYPE = variant.info["TYPE"]
    type_out = '|'.join([i for i in TYPE])
    if variant.id == coord:
        REF = variant.ref
        ALT = variant.alts[0]
        break

In [190]:
in_df.dtypes

CHR          object
SNP          object
BP            int64
NMISS         int64
BETA        float64
SE          float64
R2          float64
T           float64
P           float64
UNADJ       float64
GC          float64
BONF        float64
HOLM        float64
SIDAK_SS    float64
SIDAK_SD    float64
FDR_BH      float64
FDR_BY      float64
dtype: object

# Look up SNPs from a designated bed file (fast)

In [185]:
import pysam
import pandas as pd
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO

# Input
out_file = "/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100_batch1_chr2.summary"
bed_file = '/bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed'
# prepare vcf by: bcftools view -R /bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed /bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr2_lc200_C5.bcf > Apriyanto2023_DE100.exon.b1ch2.vcf
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch2.vcf'
# tab file is prepared by 
in_file = '/bioinfo2/palm/analysis/batch1/batch1_chr2_nosex.assoc.qassoc.tab'
p = 0.01

# Main code
in_df = pd.read_csv(in_file, sep = '\t', header=0, dtype={'CHR':str, 'SNP': str})
in_df = in_df.drop([col for col in in_df.columns if col.startswith('Unnamed:')], axis=1)
in_file_adj = in_file.split('.tab')[0] + '.adjusted.tab'
in_df_adj = pd.read_csv(in_file_adj, sep = '\t', header=0)
colnames = list(in_df_adj.columns)
in_df_adj = in_df_adj.iloc[:, :-1]
in_df_adj.columns = colnames[1:]
in_df_adj['CHR'] = in_df_adj['CHR'].astype(str)
in_df = in_df.merge(in_df_adj)
in_df = in_df[in_df['BONF'] < p]
bed_column_names = ['Seqid', 'Start', 'End', 'Genes']
bed_df = pd.read_csv(bed_file, sep='\t',comment='#', names=bed_column_names)
vcf = pysam.VariantFile(vcf_file, "r")
genome_fa = '/bioinfo2/palm/ref/dura/dura_ref.fasta'
anno_file = "/bioinfo2/palm/ref/dura/dura_orthoDBv11_4447_3193_33090.og.annotations"
anno_df = pd.read_csv(anno_file, sep='\t', header=0)
anno_df = anno_df.rename(columns={'#query': 'query'})

def coord2codon(coord, bed_df = bed_df, vcf = vcf, genome_fa = genome_fa, anno_df = anno_df):
    # this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
    CHR, POS = coord.split('_')
    POS = int(POS)
    # look up gene from bed file
    snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
    if len(snp_df) > 0: # if snp is in the region we care
        print(coord)
        start = int(snp_df['Start'].iloc[0])
        end = int(snp_df['End'].iloc[0])
        gene = snp_df['Genes'].iloc[0]
        anno_df = anno_df[anno_df['query'] == gene]
        if len(anno_df) == 1:
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        elif len(anno_df) == 0:
            og = 'NA'
            gene_name = 'NA'
        else:
            print(gene, " has more than one og, recording the first one")
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        # look up REF and ALT from vcf
        for variant in vcf.fetch():
            TYPE = variant.info["TYPE"]
            type_out = '|'.join([i for i in TYPE])
            if variant.id == coord:
                REF = variant.ref
                ALT = variant.alts[0]
                break
        # get the codon and amino acid
        for record in SeqIO.parse(genome_fa,'fasta'):
            if record.id == CHR:
                exon_seq = record.seq[start-1:end]
                exon_seq_var = MutableSeq(str(exon_seq))
                protein_seq = exon_seq.translate()
                snp_position = POS - start
                # Find the codon's start position
                codon_start = snp_position - (snp_position % 3)
                # Extract the codons
                codon = exon_seq[codon_start: codon_start+3]
                exon_seq_var[snp_position: snp_position+len(ALT)] = ALT
                modified_codon = exon_seq_var[codon_start: codon_start+3]    
                original_aa = codon.translate()
                modified_aa = modified_codon.translate()
                break

        print(';'.join([coord, gene, og, gene_name, type_out, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]))
        return [gene, og, gene_name, type_out, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]

# new column names
col_names = ['gene', 'og', 'gene_name', 'Type','REF', 'ALT', 'codon', 'modified_codon', 'original_aa', 'modified_aa']

# Generate lists and split them into separate columns
def custom_series(x):
    return pd.Series(x, dtype='str')

in_df[col_names] = in_df['SNP'].apply(coord2codon).apply(custom_series)
#in_df = in_df.drop(in_df.columns[0], axis=1)
in_df.to_csv(out_file, index=False)
        
    

ValueError: Columns must be same length as key

In [200]:
import pysam
import pandas as pd
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO

# Input
out_file = "/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100_batch1_chr1.test.summary"
bed_file = '/bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed'
# prepare vcf by: bcftools view -R /bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed /bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr2_lc200_C5.bcf > Apriyanto2023_DE100.exon.b1ch2.vcf
vcf_file = '/bioinfo2/palm/analysis/batch1/Apriyanto2023_DE100.exon.b1ch1.vcf'
# tab file is prepared by 
in_file = '/bioinfo2/palm/analysis/batch1/batch1_chr1_nosex.assoc.qassoc.tab'
p = 0.5

# Main code
in_df = pd.read_csv(in_file, sep = '\t', header=0, dtype={'CHR':str, 'SNP': str})
in_df = in_df.drop([col for col in in_df.columns if col.startswith('Unnamed:')], axis=1)
in_file_adj = in_file.split('.tab')[0] + '.adjusted.tab'
in_df_adj = pd.read_csv(in_file_adj, sep = '\t', header=0)
colnames = list(in_df_adj.columns)
in_df_adj = in_df_adj.iloc[:, :-1]
in_df_adj.columns = colnames[1:]
in_df_adj['CHR'] = in_df_adj['CHR'].astype(str)
in_df = in_df.merge(in_df_adj)
in_df = in_df[in_df['BONF'] < p]
in_df.to_csv('/bioinfo2/palm/analysis/batch1/in_df.txt', sep='\t', index=False)

bed_column_names = ['Seqid', 'Start', 'End', 'Genes']
bed_df = pd.read_csv(bed_file, sep='\t',comment='#', names=bed_column_names)
vcf = pysam.VariantFile(vcf_file, "r")
genome_fa = '/bioinfo2/palm/ref/dura/dura_ref.fasta'
anno_file = "/bioinfo2/palm/ref/dura/dura_orthoDBv11_4447_3193_33090.og.annotations"
anno_df = pd.read_csv(anno_file, sep='\t', header=0)
anno_df = anno_df.rename(columns={'#query': 'query'})

for coord in in_df['SNP']:
    # this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
    CHR, POS = coord.split('_')
    POS = int(POS)
    # look up gene from bed file
    snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
    if len(snp_df) > 0: # if snp is in the region we care
        print(coord)
        start = int(snp_df['Start'].iloc[0])
        end = int(snp_df['End'].iloc[0])
        gene = snp_df['Genes'].iloc[0]
        anno_df = anno_df[anno_df['query'] == gene]
        if len(anno_df) == 1:
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        elif len(anno_df) == 0:
            og = 'NA'
            gene_name = 'NA'
        else:
            print(gene, " has more than one og, recording the first one")
            og = anno_df['ODB_OG'].iloc[0]
            gene_name = anno_df['Description'].iloc[0]
        # look up REF and ALT from vcf
        for variant in vcf.fetch():
            TYPE = variant.info["TYPE"]
            type_out = '|'.join([i for i in TYPE])
            print(variant.id, coord)
            if variant.id == coord:
                REF = variant.ref
                ALT = variant.alts[0]
                break
        # get the codon and amino acid
        for record in SeqIO.parse(genome_fa,'fasta'):
            if record.id == CHR:
                exon_seq = record.seq[start-1:end]
                exon_seq_var = MutableSeq(str(exon_seq))
                protein_seq = exon_seq.translate()
                snp_position = POS - start
                # Find the codon's start position
                codon_start = snp_position - (snp_position % 3)
                # Extract the codons
                codon = exon_seq[codon_start: codon_start+3]
                exon_seq_var[snp_position: snp_position+len(ALT)] = ALT
                modified_codon = exon_seq_var[codon_start: codon_start+3]    
                original_aa = codon.translate()
                modified_aa = modified_codon.translate()
                break
        print(';'.join([coord, gene, og, gene_name, type_out, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]))
    

chr1_86758000
chr1_72869463 chr1_86758000
chr1_72869477 chr1_86758000
chr1_72869507 chr1_86758000
chr1_72869527 chr1_86758000
chr1_72869566 chr1_86758000
chr1_72869593 chr1_86758000
chr1_72869712 chr1_86758000
chr1_72869721 chr1_86758000
chr1_72869790 chr1_86758000
chr1_72870090 chr1_86758000
chr1_72870446 chr1_86758000
chr1_72870533 chr1_86758000
chr1_72870659 chr1_86758000
chr1_72870738 chr1_86758000
chr1_72870881 chr1_86758000
chr1_72871070 chr1_86758000
chr1_72871270 chr1_86758000
chr1_72871276 chr1_86758000
chr1_72871319 chr1_86758000
chr1_72871462 chr1_86758000
chr1_72871561 chr1_86758000
chr1_72871675 chr1_86758000
chr1_72871697 chr1_86758000
chr1_72871709 chr1_86758000
chr1_72871771 chr1_86758000
chr1_72871920 chr1_86758000
chr1_72872008 chr1_86758000
chr1_72872324 chr1_86758000
chr1_72872376 chr1_86758000
chr1_72872392 chr1_86758000
chr1_72872437 chr1_86758000
chr1_72872615 chr1_86758000
chr1_72882701 chr1_86758000
chr1_72882829 chr1_86758000
chr1_72883799 chr1_86758000
chr1_7

chr1_93213103;Egu005507-mRNA-1;NA;NA;ins;TTC;TCTC;TTC;TCT;F;S
chr1_93213371
chr1_72869463 chr1_93213371
chr1_72869477 chr1_93213371
chr1_72869507 chr1_93213371
chr1_72869527 chr1_93213371
chr1_72869566 chr1_93213371
chr1_72869593 chr1_93213371
chr1_72869712 chr1_93213371
chr1_72869721 chr1_93213371
chr1_72869790 chr1_93213371
chr1_72870090 chr1_93213371
chr1_72870446 chr1_93213371
chr1_72870533 chr1_93213371
chr1_72870659 chr1_93213371
chr1_72870738 chr1_93213371
chr1_72870881 chr1_93213371
chr1_72871070 chr1_93213371
chr1_72871270 chr1_93213371
chr1_72871276 chr1_93213371
chr1_72871319 chr1_93213371
chr1_72871462 chr1_93213371
chr1_72871561 chr1_93213371
chr1_72871675 chr1_93213371
chr1_72871697 chr1_93213371
chr1_72871709 chr1_93213371
chr1_72871771 chr1_93213371
chr1_72871920 chr1_93213371
chr1_72872008 chr1_93213371
chr1_72872324 chr1_93213371
chr1_72872376 chr1_93213371
chr1_72872392 chr1_93213371
chr1_72872437 chr1_93213371
chr1_72872615 chr1_93213371
chr1_72882701 chr1_93213371




chr1_93215477;Egu005507-mRNA-1;NA;NA;snp;A;C;TAT;TCT;Y;S


In [None]:
        for record in SeqIO.parse(genome_fa,'fasta'):
            if record.id == CHR:
                exon_seq = record.seq[start-1:end]
                exon_seq_snp = MutableSeq(str(exon_seq))
                protein_seq = exon_seq.translate()
                snp_position = POS - start
                if TYPE == 'snp'
                if exon_seq[snp_position] == REF:
                    exon_seq_snp[snp_position] = ALT
                else:
                    print("REF wrong. Break")
                    break
                # Find the codon's start position
                codon_start = snp_position - (snp_position % 3)
                # Extract the codons
                codon = exon_seq[codon_start: codon_start+3]
                # If needed, replace the SNP in the codon
                modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
                # Translate the original and modified codon to amino acids
                original_aa = codon.translate()
                modified_aa = modified_codon.translate()
                break


In [69]:
                # if the variant type if SNP
                if (len(TYPE) == 1) & (TYPE[0] == 'snp'):
                    if exon_seq[snp_position] == REF:
                        exon_seq_var[snp_position] = ALT
                    else:
                        print("REF wrong. Break")
                        break
                    # Translate the original and modified codon to amino acids
                else:

Unnamed: 0,CHR,SNP,BP,NMISS,BETA,SE,R2,T,P,X10,coord,gene,og,gene_name,REF,ALT,codon,modified_codon,original_aa,modified_aa
0,16,.,20788022,48,17.55,7.255,0.1128,2.418,0.01961,,chr16_20788022,Egu030167-mRNA-1,130847at4447,Heat stress transcription factor B-2c,T,C,"(T, G, G)","[C, G, G]",(W),[R]
1,16,.,32010076,47,-18.43,6.919,0.1362,-2.664,0.01069,,chr16_32010076,Egu030617-mRNA-1,128811at4447,transmembrane protein 53-A,C,T,"(G, G, C)","[G, G, T]",(G),[G]


In [62]:
# this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
coord = 'chr3_69323261'
CHR, POS = coord.split('_')
POS = int(POS)
# look up gene from bed file
snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
start = int(snp_df['Start'].iloc[0])
end = int(snp_df['End'].iloc[0])
gene = snp_df['Genes'].iloc[0]
anno_df = anno_df[anno_df['query'] == gene]
if len(anno_df) == 1:
    og = anno_df['ODB_OG'].iloc[0]
    gene_name = anno_df['Description'].iloc[0]
elif len(anno_df) == 0:
    og = 'NA'
    gene_name = 'NA'
else:
    print(gene, " has more than one og, recording the first one")
    og = anno_df['ODB_OG'].iloc[0]
    gene_name = anno_df['Description'].iloc[0]
# look up REF and ALT from vcf
for variant in vcf:
    if variant.chrom == CHR:
        print
        if variant.pos == POS:
            REF = variant.ref
            ALT = variant.alts[0]
            print(REF, ALT)
            break
# get the codon and amino acid
for record in SeqIO.parse(genome_fa,'fasta'):
    if record.id == CHR:
        exon_seq = record.seq[start-1:end]
        exon_seq_snp = MutableSeq(str(exon_seq))
        protein_seq = exon_seq.translate()
        snp_position = POS - start
        if exon_seq[snp_position] == REF:
            exon_seq_snp[snp_position] = ALT
        else:
            "REF wrong. Break"
            break
        # Find the codon's start position
        codon_start = snp_position - (snp_position % 3)
        # Extract the codon
        codon = exon_seq[codon_start: codon_start+3]
        # If needed, replace the SNP in the codon
        modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
        # Translate the original and modified codon to amino acids
        original_aa = codon.translate()
        modified_aa = modified_codon.translate()
        break
print(';'.join([coord, gene, og, gene_name, REF, ALT, str(codon), str(modified_codon), str(original_aa), str(modified_aa)]))
return [gene, og, gene_name, REF, ALT, codon, modified_codon, original_aa, modified_aa]


KeyboardInterrupt: 

In [40]:
def coord2codon(coord, bed_df = bed_df, vcf = vcf, genome_fa = genome_fa, anno_df = anno_df):
    # this function takes a coordinate of a SNP in the dura genome and returns some useful info about it
    CHR, POS = coord.split('_')
    POS = int(POS)
    # look up gene from bed file
    snp_df = bed_df[(bed_df['Seqid'] == CHR) & (bed_df['Start'] <= POS) & (bed_df['End'] >= POS)]
    start = int(snp_df['Start'].iloc[0])
    end = int(snp_df['End'].iloc[0])
    gene = snp_df['Genes'].iloc[0]
    anno_df = anno_df[anno_df['query'] == gene]
    og = str(anno_df['ODB_OG'])
    gene_name = str(anno_df['Description'])
    # look up REF and ALT from vcf
    for variant in vcf:
        if variant.chrom == CHR:
            if variant.pos == POS:
                REF = variant.ref
                ALT = variant.alts
                break
            else:
                print(coord, "Wrong POS")
    # get the codon and amino acid
    for record in SeqIO.parse(genome_fa,'fasta'):
        if record.id == CHR:
            exon_seq = record.seq[start-1:end]
            exon_seq_snp = MutableSeq(str(exon_seq))
            protein_seq = exon_seq.translate()
            snp_position = POS - start
            if exon_seq[snp_position] == REF:
                exon_seq_snp[snp_position] = ALT[0]
            else:
                "REF wrong. Break"
                break
            # Find the codon's start position
            codon_start = snp_position - (snp_position % 3)
            # Extract the codon
            codon = exon_seq[codon_start: codon_start+3]
            print(coord, codon)
            # If needed, replace the SNP in the codon
            modified_codon = exon_seq_snp[codon_start: codon_start+3] # modify with SNP
            # Translate the original and modified codon to amino acids
            original_aa = codon.translate()
            modified_aa = modified_codon.translate()
            break
    return [gene, og, gene_name, REF, ALT, codon, modified_codon, original_aa, modified_aa]

# Knowledge base from catalogue

In [2]:
import glob
import pandas as pd
# Merge snpEff.genes.txt from different chromosomes
dfs = []
for file in glob.glob('/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr*.snpEff.genes.txt'):
    print(file)
    df = pd.read_csv(file, header=1, sep="\t")
    # Remove the "#" from the column names
    df.columns = df.columns.str.replace("#", "").str.strip()
    dfs.append(df)
    
result_df = pd.concat(dfs, ignore_index=True, join='outer').fillna(0)
result_df.to_csv('/bioinfo2/palm/knowledgebase/chr1to10_snpEff.genes.txt',sep='\t', index=False)

/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr1.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr9.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr2.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr7.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr6.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr3.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr8.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr10.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr5.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr4.snpEff.genes.txt


In [3]:
trait = 'fruit'
name = 'fruit'
cata = pd.read_csv('/bioinfo2/palm/knowledgebase/{}/dura_{}_catalogue.txt'.format(trait,name), sep = '\t',names=['TranscriptId','GeneName'])
gene_list = cata['TranscriptId']

df_1 = result_df[result_df['TranscriptId'].isin(gene_list)]

anno_file = "/bioinfo2/palm/ref/dura/dura4447.og.annotations"
anno_df = pd.read_csv(anno_file, sep='\t', header=0)
anno_df = anno_df.rename(columns={'#query': 'TranscriptId'})
df_merge = df_1.merge(anno_df)
df_merge.to_csv('/bioinfo2/palm/knowledgebase/{}/dura_{}.snpEff.genes.txt'.format(trait, name), sep='\t', index=False )


In [4]:

cata = pd.read_csv('/bioinfo2/palm/knowledgebase/RNA_seq/Apriyanto2023_DE100.genes.txt', names=['TranscriptId'])
gene_list = cata['TranscriptId']

/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr1.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr9.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr2.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr7.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr6.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr3.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr8.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr10.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr5.snpEff.genes.txt
/bioinfo2/palm/WIL_sequence/3_freebayes/batch1/batch1_chr4.snpEff.genes.txt


# knowledge base from .gff or .bed

In [25]:
from pybedtools import BedTool
gff_data = BedTool('/bioinfo2/palm/knowledgebase/drought/dura_drought_genes_rbh_49.gff')
gff_dic = {}
for feature in gff_data:
    gff_dic[feature.attrs.get('ID')] = ''
gene_list = gff_dic.keys()

In [8]:
# if reading from a bed file
col_names = ['chr', 'start', 'end', 'name','note', 'strand', 'TranscriptId']
bed = pd.read_csv('/bioinfo2/palm/knowledgebase/fattyacid/dura_fattyacid_genes.bed', sep = '\t', names = col_names)
gene_list = bed['TranscriptId']

In [9]:
df_merge

Unnamed: 0,GeneName,GeneId,TranscriptId,BioType,variants_impact_HIGH,variants_impact_LOW,variants_impact_MODERATE,variants_impact_MODIFIER,variants_effect_3_prime_UTR_variant,variants_effect_5_prime_UTR_premature_start_codon_gain_variant,...,ODB_OG,evalue,score,COG_category,Description,GOs_mf,GOs_bp,EC,KEGG_ko,Interpro
0,Egu004811,Egu004811,Egu004811-mRNA-1,protein_coding,0,8,9,583,10,0,...,118148at4447,0.0,222.85,"I,C,E,H","Methylcrotonoyl-CoA carboxylase subunit alpha,...","GO:0046872,GO:0016874,GO:0005524,GO:0000166",-,-,"ko00860,ko04144","IPR000089,IPR001882,IPR005479,IPR005481,IPR005..."
1,Egu004811,Egu004811,Egu004811-mRNA-2,protein_coding,0,8,8,558,0,0,...,118148at4447,0.0,222.367,"I,C,E,H","Methylcrotonoyl-CoA carboxylase subunit alpha,...","GO:0046872,GO:0016874,GO:0005524,GO:0000166",-,-,"ko00860,ko04144","IPR000089,IPR001882,IPR005479,IPR005481,IPR005..."
2,Egu022699,Egu022699,Egu022699-mRNA-1,protein_coding,0,5,10,638,4,0,...,64848at4447,0.0,184.767,"I,Q,M,C",(3R)-hydroxymyristoyl-,"GO:0047451,GO:0019171,GO:0016836,GO:0016829,GO...","GO:0009245,GO:0006629",-,"ko00230,ko00730,ko04075,ko04626,zma01100,zma01110","IPR010084,IPR013114,IPR029069"
3,Egu023815,Egu023815,Egu023815-mRNA-1,protein_coding,0,11,4,711,17,0,...,13185at4447,0.0,225.255,"P,F",stearoyl-,"GO:0045300,GO:0016491,GO:0046872","GO:0006631,GO:0006629,GO:0055114",1.14.19.-,ko04626,"IPR005067,IPR005803,IPR009078,IPR012348"
4,Egu000360,Egu000360,Egu000360-mRNA-1,protein_coding,1,3,7,979,1,0,...,128492at4447,0.0,179.51,"M,I,O",Acyl-CoA-binding domain-containing protein 6,"GO:0008289,GO:0000062",GO:0006869,-,"bdi01100,bdi01110,ko00220,ko00250,ko00520,ko00...","IPR000582,IPR006652,IPR014352,IPR015915,IPR035984"
5,Egu000360,Egu000360,Egu000360-mRNA-2,protein_coding,1,3,7,979,1,0,...,11254at4447,1.5399999999999999e-174,103.16,"M,O,T",acyl-CoA-binding domain-containing protein 4,GO:0000062,GO:0006869,-,-,"IPR006652,IPR015915"
6,Egu001574,Egu001574,Egu001574-mRNA-3,protein_coding,3,10,10,514,1,6,...,156112at4447,0.0,128.881,-,enoyl-,-,-,-,-,-
7,Egu002030,Egu002030,Egu002030-mRNA-1,protein_coding,0,6,3,683,13,0,...,13185at4447,0.0,224.872,"P,F",stearoyl-,"GO:0045300,GO:0016491,GO:0046872","GO:0006631,GO:0006629,GO:0055114",1.14.19.-,ko04626,"IPR005067,IPR005803,IPR009078,IPR012348"
8,Egu002186,Egu002186,Egu002186-mRNA-1,protein_coding,1,9,7,654,12,0,...,134714at4447,5.89e-174,169.59,-,3-oxoacyl-,"GO:0016491,GO:0004316,GO:0102131,GO:0102132","GO:0006629,GO:0006631,GO:0055114",-,"ko00260,ko03010,ko03013","IPR002347,IPR011284,IPR020904,IPR036291"
9,Egu002499,Egu002499,Egu002499-mRNA-1,protein_coding,1,3,2,281,9,0,...,64848at4447,7.16e-148,219.09,"I,Q,M,C",(3R)-hydroxymyristoyl-,"GO:0047451,GO:0019171,GO:0016836,GO:0016829,GO...","GO:0009245,GO:0006629",-,"ko00230,ko00730,ko04075,ko04626,zma01100,zma01110","IPR010084,IPR013114,IPR029069"


In [6]:
import pandas as pd
df = pd.read_csv('/bioinfo2/palm/knowledgebase/chr12679.snpEff.genes.txt', sep='\t')


  df = pd.read_csv('/bioinfo2/palm/knowledgebase/chr12679.snpEff.genes.txt', sep='\t')


In [7]:
df

Unnamed: 0,GeneName,GeneId,TranscriptId,BioType,variants_impact_HIGH,variants_impact_LOW,variants_impact_MODERATE,variants_impact_MODIFIER,variants_effect_3_prime_UTR_variant,variants_effect_5_prime_UTR_premature_start_codon_gain_variant,...,variants_effect_splice_acceptor_variant,variants_effect_splice_donor_variant,variants_effect_splice_region_variant,variants_effect_start_lost,variants_effect_start_retained_variant,variants_effect_stop_gained,variants_effect_stop_lost,variants_effect_stop_retained_variant,variants_effect_synonymous_variant,variants_effect_upstream_gene_variant
0,Egu003662,Egu003662,Egu003662-mRNA-1,protein_coding,0,6,1,625,1,0,...,0,0,1,0,0,0,0,0,5,353.0
1,Egu003663,Egu003663,Egu003663-mRNA-1,protein_coding,0,0,2,476,6,0,...,0,0,0,0,0,0,0,0,0,192.0
2,Egu003664,Egu003664,Egu003664-mRNA-1,protein_coding,0,2,2,414,2,0,...,0,0,2,0,0,0,0,0,0,186.0
3,Egu003664,Egu003664,Egu003664-mRNA-2,protein_coding,0,2,2,414,2,0,...,0,0,2,0,0,0,0,0,0,186.0
4,Egu003665,Egu003665,Egu003665-mRNA-1,protein_coding,0,6,8,690,3,0,...,0,0,0,0,0,0,0,0,6,475.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17946,Egu024114,Egu024114,Egu024114-mRNA-1,protein_coding,1,0,0,264,0,0,...,1,0,0,0,1,0,0,176,,
17947,Egu024115,Egu024115,Egu024115-mRNA-1,protein_coding,2,7,12,1477,0,0,...,7,0,0,1,0,0,4,258,,
17948,Egu024116,Egu024116,Egu024116-mRNA-1,protein_coding,0,0,0,294,0,0,...,0,0,0,0,0,0,0,9,,
17949,Egu024116,Egu024116,Egu024116-mRNA-2,protein_coding,0,0,0,290,0,0,...,0,0,0,0,0,0,0,9,,


In [3]:
import pysam
import pandas as pd
vcf = pysam.VariantFile("/bioinfo2/palm/analysis/yield/Apriyanto2023/Apriyanto2023_DE100.vcf", "r") 
for variant in vcf:
    if variant.chrom == CHR:
        if variant.pos == POS:
            REF = variant.ref
            ALT = variant.alts
            break

In [1]:
import sys
sys.executable

'/home/xiaorong/anaconda3/envs/bioinfo/bin/python'

## Get the gene name and amino acid give the coord of a SNP

In [94]:
# first let's work with just one coord
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq

# eventually this will be a script that only takes the coord, and go look the exon and REF ALT by it self.
# * means stop codon. When this is changed we need to look further down the sequence until another stop codon right?

coord = 'chr7_37218457'
bed_file = '/bioinfo2/palm/analysis/yield/Apriyanto2023_DE100.exon.bed'
vcf_file = '/bioinfo2/palm/analysis/' 

CHR, POS = coord.split('_')
POS = int(POS)
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
df_gff = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)
filtered_df = df_gff[(df_gff['Type'] == "exon") & (df['Seqid'] == CHR) & (df['Start'] <= POS) & (df['End'] >= POS)]
gene = filtered_df['ID'].iloc[0]

REF = 'G'
ALT = 'A'

filtered_rows = df_gff[df_gff['ID']==gene]
start = int(filtered_rows['Start'])
end = int(filtered_rows['End'])
            
for record in SeqIO.parse('/bioinfo2/palm/ref/dura/dura_ref.fasta','fasta'):
    if record.id == CHR:
        seq = record.seq[start-1:end]
        pos_ref = record.seq[int(POS)-1]
        # check if the length of this exon is divisible by three
        if len(seq)%3 != 0:
            print("warning: this exon is not dividable by 3")
            # check if the position matches with the REF provided
        if pos_ref == REF:
            # get the coding region containing this SNP
            if (int(POS)-start)%3 == 0:
                codon = record.seq[int(POS)-1:int(POS)+2]
                codon_m = MutableSeq(str(codon))
                codon_m[0] = ALT 
            elif (int(POS)-start)%3 == 1:
                codon = record.seq[int(POS)-2:int(POS)+1]
                codon_m = MutableSeq(str(codon))
                codon_m[1] = ALT
            elif (int(POS)-start)%3 == 2:
                codon = record.seq[int(POS)-3:int(POS)]
                codon_m = MutableSeq(str(codon))
                codon_m[2] = ALT
            # translate these codon
            aa = codon.translate()
            aa_m = codon_m.translate()
            print(codon, aa, codon_m, aa_m)
            break
        else:
            print("This coord in the genome is different from what was called in the vcf")
            break
    

GAA E AAA K


In [6]:
df_gff

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,ID,Name,Alias,Unnamed: 12,Parent,_AED,_QI,_eAED,score
0,egu.contig.1973,maker,gene,10123,11712,.,+,.,Egu032749,Egu032749,snap_masked-egu.contig.1973-processed-gene-0.0,,,,,,
1,egu.contig.1973,maker,mRNA,10123,11712,.,+,.,Egu032749-mRNA-1,Egu032749-mRNA-1,snap_masked-egu.contig.1973-processed-gene-0.0...,,Egu032749,0.06,0|0|0|0.5|1|1|2|0|472,0.04,
2,egu.contig.1973,maker,exon,10123,11511,.,+,.,Egu032749-mRNA-1:1,,,,Egu032749-mRNA-1,,,,
3,egu.contig.1973,maker,exon,11686,11712,.,+,.,Egu032749-mRNA-1:2,,,,Egu032749-mRNA-1,,,,
4,egu.contig.1973,maker,CDS,10123,11511,.,+,0,Egu032749-mRNA-1:cds,,,,Egu032749-mRNA-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615982,egu.contig.1149,maker,CDS,281718,281844,.,-,0,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615983,egu.contig.1149,maker,CDS,280181,280249,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615984,egu.contig.1149,maker,CDS,279893,280085,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615985,egu.contig.1149,maker,CDS,278640,279072,.,-,1,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,


In [107]:
from Bio.Seq import Seq, MutableSeq

seq = Seq('GACGTTAGGAGGAGCATATACTAACTTGTTTTTATAA')
seq.translate()
seq1 = MutableSeq(str(seq))

In [85]:
aa = Seq('MHLGHRHGHEKGKSPATTMPPKGWVGIRVGQEGEEQQRFEVPVDYLKHPLFMALLHQAKEEFGYEQSGAITIPCGVDHFRHVRDTINRDSAATAAAHHHSHLPHLAGLTLGGAYTNLFL')

In [109]:
seq = Seq('ACGTTAGGAGGAGCATATACTAACTTGTTTTTATAA')
seq.translate()

Seq('TLGGAYTNLFL*')

In [119]:
seq = Seq('ATGCACCTGGGCCACAGGCACGGGCATGAGAAGGGAAAGAGTCCGGCGACGACGATGCCACCTAAAGGGTGGGTGGGGATAAGGGTGGGGCAGGAAGGGGAGGAGCAGCAGCGGTTCGAGGTGCCTGTGGACTATCTGAAGCACCCGCTCTTCATGGCCTTGCTGCACCAGGCGAAGGAGGAGTTCGGATATGAGCAGAGCGGAGCCATCACCATTCCCTGCGGCGTCGATCACTTCCGTCATGTCCGGGACACCATTAACCGTGACTCCGCGGCCACCGCCGCCGCGCACCACCACAGCCACCTCCCTCACCTTGCCGGCT')
a = seq.translate()

In [118]:
len(seq)

322

In [120]:
a[-10:]

Seq('HSHLPHLAGC')

In [58]:
from collections import Counter
Counter(df_gff['Type'])

Counter({'gene': 32778,
         'mRNA': 45938,
         'exon': 196546,
         'CDS': 271952,
         'three_prime_UTR': 36830,
         'five_prime_UTR': 31943})

In [96]:
Counter(df_gff['Source'])

Counter({'maker': 615987})

In [103]:
Counter(df_gff['Alias'])

Counter({'snap_masked-egu.contig.1973-processed-gene-0.0': 1,
         'snap_masked-egu.contig.1973-processed-gene-0.0-mRNA-1': 1,
         nan: 537271,
         'maker-egu.contig.1477-snap-gene-0.0': 1,
         'maker-egu.contig.1477-snap-gene-0.0-mRNA-1': 1,
         'maker-egu.contig.1784-snap-gene-0.4': 1,
         'maker-egu.contig.1784-snap-gene-0.4-mRNA-1': 1,
         'maker-egu.contig.1784-snap-gene-0.5': 1,
         'maker-egu.contig.1784-snap-gene-0.5-mRNA-1': 1,
         'snap_masked-egu.contig.1415-processed-gene-0.4': 1,
         'snap_masked-egu.contig.1415-processed-gene-0.4-mRNA-1': 1,
         'snap_masked-egu.contig.1962-processed-gene-0.2': 1,
         'snap_masked-egu.contig.1962-processed-gene-0.2-mRNA-1': 1,
         'maker-egu.contig.1962-snap-gene-0.2': 1,
         'maker-egu.contig.1962-snap-gene-0.2-mRNA-1': 1,
         'maker-egu.contig.1962-snap-gene-0.0': 1,
         'maker-egu.contig.1962-snap-gene-0.0-mRNA-1': 1,
         'maker-egu.contig.1588-snap-gen

In [99]:
df_gff

Unnamed: 0,Seqid,Source,Type,Start,End,Score,Strand,Phase,ID,Name,Alias,Unnamed: 12,Parent,_AED,_QI,_eAED,score
0,egu.contig.1973,maker,gene,10123,11712,.,+,.,Egu032749,Egu032749,snap_masked-egu.contig.1973-processed-gene-0.0,,,,,,
1,egu.contig.1973,maker,mRNA,10123,11712,.,+,.,Egu032749-mRNA-1,Egu032749-mRNA-1,snap_masked-egu.contig.1973-processed-gene-0.0...,,Egu032749,0.06,0|0|0|0.5|1|1|2|0|472,0.04,
2,egu.contig.1973,maker,exon,10123,11511,.,+,.,Egu032749-mRNA-1:1,,,,Egu032749-mRNA-1,,,,
3,egu.contig.1973,maker,exon,11686,11712,.,+,.,Egu032749-mRNA-1:2,,,,Egu032749-mRNA-1,,,,
4,egu.contig.1973,maker,CDS,10123,11511,.,+,0,Egu032749-mRNA-1:cds,,,,Egu032749-mRNA-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615982,egu.contig.1149,maker,CDS,281718,281844,.,-,0,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615983,egu.contig.1149,maker,CDS,280181,280249,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615984,egu.contig.1149,maker,CDS,279893,280085,.,-,2,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,
615985,egu.contig.1149,maker,CDS,278640,279072,.,-,1,Egu032211-mRNA-2:cds,,,,Egu032211-mRNA-2,,,,


In [None]:


i = 0
protein_file = '/bioinfo/palm/ref/dura/dura_proteins.fasta'
with open('dura_proteins_{}_at{}.aa'.format(gene_name, level),'w') as fh:
    for record in SeqIO.parse(prselected_rowsotein_file,'fasta'):
        if record.id in ['query'].values:
            cluster = selected_rows.loc[selected_rows['query'] == record.id, 'ODB_OG']
            tag = selected_rows.loc[selected_rows['query'] == record.id, 'Description']
            if len(tag) == 1:
                record.description = ':'.join([cluster.iloc[0], tag.iloc[0]])
            else:
                print(record.id + 'appeared in more than one cluster?')

            SeqIO.write(record, fh, 'fasta')
            i += 1
print(i)

In [11]:


# Extract values from column B and C from the filtered rows
result_values = filtered_rows[['Seqid', 'Start','End','ID']]

# Convert columns A and B to numeric
result_values['Start'] = pd.to_numeric(result_values['Start'])
result_values['End'] = pd.to_numeric(result_values['End'])

# Perform the operations and apply the conditions
result_values['Start'] = result_values['Start'] - flanking
result_values['Start'] = result_values['Start'].apply(lambda x: max(0, x))  # Set negative values to 0

result_values['End'] = result_values['End'] + flanking
bedfile = fileA.split('_blastp.tbl')[0] + '_' + str(flanking) + '.bed'
result_values.to_csv(bedfile, sep='\t', index=False)


Unnamed: 0,query,ODB_OG,evalue,score,COG_category,Description,GOs_mf,GOs_bp,EC,KEGG_ko,Interpro
75,Egu020222-mRNA-1,687at4447,2.14e-26,117.89,-,auxin-responsive protein SAUR71,-,GO:0009733,"6.5.1.1,4.1.1.15","ko00250,ko00410,ko00430,ko00650,ko03030,ko0341...",IPR003676
262,Egu026424-mRNA-1,2328at4447,4.810000000000001e-66,174.13,-,Auxin-responsive protein SAUR36,-,GO:0009733,-,-,IPR003676
663,Egu020226-mRNA-1,5943at4447,5.9600000000000004e-43,180.65,-,auxin-responsive protein SAUR71,-,GO:0009733,-,-,IPR003676
664,Egu020227-mRNA-1,5943at4447,0.0,148.897,-,auxin-responsive protein SAUR71,-,GO:0009733,-,-,IPR003676
3706,Egu011092-mRNA-1,33316at4447,4.29e-28,86.71,-,indole-3-acetic acid-induced protein ARG7,-,GO:0009733,-,"ko00010,ko01200",IPR003676
3707,Egu030210-mRNA-1,33316at4447,0.0,108.054,-,indole-3-acetic acid-induced protein ARG7,-,GO:0009733,-,"ko00010,ko01200",IPR003676
5388,Egu009584-mRNA-1,47410at4447,9.09e-29,110.76,-,auxin-responsive protein SAUR32,-,GO:0009733,2.7.2.3,"ko00010,ko00230,ko00710,ko00910,ko00941,ko0094...",IPR003676
9270,Egu019713-mRNA-1,75523at4447,0.0,78.6238,-,auxin-responsive protein SAUR41,-,GO:0009733,-,-,IPR003676
9271,Egu031500-mRNA-1,75523at4447,3.15e-23,73.9,-,auxin-responsive protein SAUR41,-,GO:0009733,-,-,IPR003676
9614,Egu020383-mRNA-2,77872at4447,0.0,110.458,-,auxin-induced protein 6B,-,GO:0009733,-,"ko00190,ko00270,ko04145,sita01100,sita01110",IPR003676
