# genelist to gene sequence

In [17]:
import pandas as pd
blast_column_names = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]                         
gff_file = '/bioinfo/palm/ref/dura/dura_ref.gff'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
bs = 0 #bitscore filtering
n = 3 #hits per gene
flanking = 2000 #flanking regions upstream and downstream
fileA = 'ABA_dura_blastp.tbl'
fileB = 'dura_ABA_blastp.tbl'

# Read data from file A
fileA_data = pd.read_csv(fileA, sep='\t', names=blast_column_names)

# Read data from file B
fileB_data = pd.read_csv(fileB, sep='\t', names=blast_column_names)

# Initialize an empty list to store the results
result_pairs = {}

# Loop through unique values in 'sseqid' column in file A

for sseqid_a in fileA_data['sseqid'].unique():
    count = 0
    # Filter rows in file A where 'sseqid' is equal to the current value in the loop
    filtered_rows = fileA_data[fileA_data['sseqid'] == sseqid_a]
    # Sort the filtered rows by 'bitscore' in descending order
    sorted_rows = filtered_rows.sort_values(by='bitscore', ascending=False)
    # Iterate through the sorted rows
    count = 0
    for _, max_bitscore_row in sorted_rows.iterrows():
        if count < n:
            # Get the value for 'qseqid' in the current row
            qseqid_b = max_bitscore_row['qseqid']

            # Check if the row exists in file B with 'sseqid' == 'b' and 'qseqid' == 'a'
            matching_row = fileB_data[(fileB_data['sseqid'] == qseqid_b) & (fileB_data['qseqid'] == sseqid_a)]

            # If the matching row exists and 'bitscore' is greater than the threshold, add 'a' and 'b' to the result_pairs dictionary
            if not matching_row.empty and max_bitscore_row['bitscore'] > bs:
                sseqid_a = sseqid_a.split('-mRNA')[0]
                qseqid_b = qseqid_b.split('-mRNA')[0]
                if sseqid_a not in result_pairs:
                    result_pairs[sseqid_a] = []
                elif qseqid_b not in result_pairs[sseqid_a]:
                    result_pairs[sseqid_a].append(qseqid_b)                    
                    count += 1
    
# get the local gene list
def invert_dict_with_lists(original_dict):
    inverted_dict = {}

    for key, values in original_dict.items():
        for value in values:
            if value not in inverted_dict:
                inverted_dict[value] = []
            inverted_dict[value].append(key)

    return inverted_dict


inverted_dict = invert_dict_with_lists(result_pairs)          
string_dict = {key: '/'.join(values) for key, values in inverted_dict.items()}
# Now we need to get the position of genes from the gff file.      

# Read data from the file
df = pd.read_csv(gff_file, sep='\t',comment='#', names=gff_column_names)
# Function to parse the 'Attributes' column and convert it into a dictionary of key-value pairs
def parse_attributes(attribute_str):
    attribute_pairs = attribute_str.split(';')
    attribute_dict = {}
    for pair in attribute_pairs:
        if len(pair.split('=')) == 2:
            key, value = pair.split('=')
        else:
            key = pair.split('=')[0]
            value = ''
        attribute_dict[key] = value
    return attribute_dict
df1 = df['Attributes'].apply(parse_attributes).apply(pd.Series)
file_data = pd.concat([df.drop('Attributes', axis=1), df1], axis=1)

# Filter rows where value from column A is in the list
filtered_rows = file_data[file_data['ID'].isin(list(string_dict))]
filtered_rows['genes'] = filtered_rows['ID'].map(string_dict)
filtered_rows['ID_genes'] = filtered_rows['ID'] + '_' + filtered_rows['genes']

# Extract values from column B and C from the filtered rows
result_values = filtered_rows[['Seqid', 'Start','End','ID_genes']]


# Convert columns A and B to numeric
result_values['Start'] = pd.to_numeric(result_values['Start'])
result_values['End'] = pd.to_numeric(result_values['End'])

# Add flanking regions
result_values['Start'] = result_values['Start'] - flanking
result_values['Start'] = result_values['Start'].apply(lambda x: max(0, x))  # Set negative values to 0

result_values['End'] = result_values['End'] + flanking
bedfile = fileA.split('_blastp.tbl')[0] + '_' + str(flanking) + '.bed'
result_values.to_csv(bedfile, sep='\t', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

# Genelist to cds (dura_transcripts.fasta)

In [22]:
import pandas as pd
blast_column_names = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]                         
gff_file = '/bioinfo/palm/ref/dura/dura_ref.gff'
transcript_file = '/bioinfo/palm/ref/dura/dura_transcripts.fasta'
gff_column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']
bs = 0 #bitscore filtering
n = 3 #hits per gene
flanking = 2000 #flanking regions upstream and downstream
fileA = 'ABA_dura_blastp.tbl'
fileB = 'dura_ABA_blastp.tbl'
# Read data from file A
fileA_data = pd.read_csv(fileA, sep='\t', names=blast_column_names)

# Read data from file B
fileB_data = pd.read_csv(fileB, sep='\t', names=blast_column_names)

# Initialize an empty list to store the results
result_pairs = {}

# Loop through unique values in 'sseqid' column in file A

for sseqid_a in fileA_data['sseqid'].unique():
    count = 0
    # Filter rows in file A where 'sseqid' is equal to the current value in the loop
    filtered_rows = fileA_data[fileA_data['sseqid'] == sseqid_a]
    # Sort the filtered rows by 'bitscore' in descending order
    sorted_rows = filtered_rows.sort_values(by='bitscore', ascending=False)
    # Iterate through the sorted rows
    count = 0
    for _, max_bitscore_row in sorted_rows.iterrows():
        if count < n:
            # Get the value for 'qseqid' in the current row
            qseqid_b = max_bitscore_row['qseqid']

            # Check if the row exists in file B with 'sseqid' == 'b' and 'qseqid' == 'a'
            matching_row = fileB_data[(fileB_data['sseqid'] == qseqid_b) & (fileB_data['qseqid'] == sseqid_a)]

            # If the matching row exists and 'bitscore' is greater than the threshold, add 'a' and 'b' to the result_pairs dictionary
            if not matching_row.empty and max_bitscore_row['bitscore'] > bs:
                if sseqid_a not in result_pairs:
                    result_pairs[sseqid_a] = []
                elif qseqid_b not in result_pairs[sseqid_a]:
                    result_pairs[sseqid_a].append(qseqid_b)                    
                    count += 1
    
# get the local gene list
def invert_dict_with_lists(original_dict):
    inverted_dict = {}

    for key, values in original_dict.items():
        for value in values:
            if value not in inverted_dict:
                inverted_dict[value] = []
            inverted_dict[value].append(key)

    return inverted_dict


inverted_dict = invert_dict_with_lists(result_pairs)          
string_dict = {key: '/'.join(values) for key, values in inverted_dict.items()}
# Now we get the transcript sequences for those genes (key in string_dict) from dura_transcripts.fasta
from Bio import SeqIO
out_file = '/bioinfo/palm/ref/ABA/ABA_dura_transcripts.fasta'
with open(out_file, 'w') as out_fh:
    for record in SeqIO.parse(transcript_file, "fasta"):
        if record.id in string_dict:
            record.description = string_dict[record.id]
            SeqIO.write(record, out_fh, "fasta")

In [21]:
string_dict
        


{'Egu017379-mRNA-1': 'LEC1',
 'Egu002768-mRNA-1': 'LEC1',
 'Egu002765-mRNA-1': 'LEC1',
 'Egu023380-mRNA-1': 'BBM',
 'Egu002214-mRNA-2': 'BBM',
 'Egu000576-mRNA-1': 'BBM',
 'Egu019891-mRNA-5': 'ABI5',
 'Egu023685-mRNA-1': 'ABI5',
 'Egu015962-mRNA-1': 'ABI5',
 'Egu019493-mRNA-2': 'ABI3/LEC2/FUS3',
 'Egu006921-mRNA-1': 'ABI3/LEC2',
 'Egu020554-mRNA-1': 'ABI3/LEC2/FUS3',
 'Egu019493-mRNA-1': 'FUS3',
 'Egu000535-mRNA-1': 'AGL15',
 'Egu003576-mRNA-1': 'AGL15',
 'Egu017102-mRNA-2': 'AGL15',
 'Egu009827-mRNA-1': 'WUS',
 'Egu012330-mRNA-1': 'WUS',
 'Egu028948-mRNA-1': 'WUS',
 'Egu024837-mRNA-1': 'LEA'}

In [14]:
string_dict = {key: '/'.join(values) for key, values in inverted_dict.items()}



In [16]:
list(string_dict)

['Egu017379',
 'Egu002768',
 'Egu002765',
 'Egu023380',
 'Egu002214',
 'Egu000576',
 'Egu019891',
 'Egu023685',
 'Egu015962',
 'Egu019493',
 'Egu006921',
 'Egu020554',
 'Egu005708',
 'Egu000535',
 'Egu003576',
 'Egu017102',
 'Egu009827',
 'Egu012330',
 'Egu028948',
 'Egu024837']

In [4]:
result_pairs

{'LEC1': ['Egu015173', 'Egu017379', 'Egu002768'],
 'BBM': ['Egu002214', 'Egu023380', 'Egu002214'],
 'ABI5': ['Egu019891', 'Egu019891', 'Egu023685'],
 'ABI3': ['Egu019493', 'Egu019493', 'Egu006921'],
 'LEC2': ['Egu019493', 'Egu006921', 'Egu020554'],
 'FUS3': ['Egu006921', 'Egu020554', 'Egu019493'],
 'AGL15': ['Egu000535', 'Egu000535', 'Egu003576'],
 'WUS': ['Egu026907', 'Egu009827', 'Egu012330'],
 'LEA': ['Egu005882', 'Egu024837']}