In [None]:
from pathlib import Path
import sys
sys.path.append(str(Path().resolve().parent))

import pandas as pd
from lib.genetic_data_helpers import *
from lib.data_access_helpers import load_fasta_records, parse_gff3_tmr_file
from lib.aminoacids import AA_1_TO_3, AA_3_TO_1

In [None]:
# Parse the two TMHMM GFF3 result files for the SRP-dependent set.
# The following cell concatenates both results with `+` and reads the
# 'sequence_id', 'length' and 'features' entries of every item.
sequences1, sequences2 = map(
    parse_gff3_tmr_file,
    ('../data/tmhmm/TMRs_dep_1.gff3', '../data/tmhmm/TMRs_dep_2.gff3'),
)

In [None]:
# Summarise every parsed sequence as one table row:
#   gene / length            - identifier and length of the sequence
#   sp / tmd                 - whether any 'signal' / 'TMhelix' feature is present
#   first_hydrophobic_start/ - coordinates of the first feature (in file order)
#   first_hydrophobic_end      that is either a 'signal' or a 'TMhelix'
sequences = sequences1 + sequences2

# Feature types that count as a hydrophobic region.
hydrophobic_types = ('signal', 'TMhelix')

df_dict = {
    column: []
    for column in (
        'gene',
        'length',
        'sp',
        'tmd',
        'first_hydrophobic_start',
        'first_hydrophobic_end',
    )
}

for seq in sequences:
    # Materialise once so the features can be scanned several times.
    features = list(seq['features'])

    # First hydrophobic feature in the order the parser returned them, if any.
    first_hydrophobic = next(
        (feature for feature in features if feature['type'] in hydrophobic_types),
        None,
    )
    found = first_hydrophobic is not None

    df_dict['gene'].append(seq['sequence_id'])
    df_dict['length'].append(int(seq['length']))
    df_dict['sp'].append(any(feature['type'] == 'signal' for feature in features))
    df_dict['tmd'].append(any(feature['type'] == 'TMhelix' for feature in features))
    # Sequences without any hydrophobic feature keep None for both coordinates.
    df_dict['first_hydrophobic_start'].append(
        int(first_hydrophobic['start']) if found else None
    )
    df_dict['first_hydrophobic_end'].append(
        int(first_hydrophobic['end']) if found else None
    )

# One row per gene, indexed by gene identifier, persisted for the filter cells below.
df = pd.DataFrame(df_dict).set_index('gene')
df.to_csv("../data/tmhmm/dependent_hydrophobic_tmhmm_all.csv", sep=",")
    

           length     sp    tmd  first_hydrophobic_start  \
gene                                                       
YAL064W-B     126  False  False                      NaN   
YAL055W       180  False  False                      NaN   
YAL042W       415  False   True                     25.0   
YAL023C       759  False   True                     62.0   
YAL007C       215   True   True                      1.0   
...           ...    ...    ...                      ...   
YPR149W       173  False   True                      4.0   
YPR159C-A      33  False   True                     11.0   
YPR170W-B      85  False   True                     10.0   
YPR198W       543  False   True                      9.0   
YPR201W       404  False   True                     35.0   

           first_hydrophobic_end  
gene                              
YAL064W-B                    NaN  
YAL055W                      NaN  
YAL042W                     45.0  
YAL023C                     77.0  
YAL007C  

In [12]:
########### SRP-DEPENDENT ###############

# Filter the TMHMM summary table to the SRP-dependent genes (from Ast et al. 2013)
# that have a hydrophobic region (SP or TMD) starting within the first 60 amino acids,
# then write the matching subsets of the DNA and protein FASTA files.
from Bio import SeqIO
import pandas as pd

path = "../data/tmhmm/dependent_hydrophobic_tmhmm_all.csv"
df = pd.read_csv(path, sep=',')

# Rows flagged with a signal peptide and/or a transmembrane domain.
sp_or_tmd = df[(df['sp'] == True) | (df['tmd'] == True)]

# Rows whose first hydrophobic region starts at position 60 or earlier.
early_hydro = sp_or_tmd[sp_or_tmd['first_hydrophobic_start'] <= 60.0]

# Also, we want to remove the following genes, which are too short for our window analyses.
# These genes were identified while running the calculations.
too_short = [
    "YBR196C-A",
    "YCL005W-A",
    "YCR024C-A",
    "YDL232W",
    "YDL067C",
    "YDR276C",
    "YEL017C-A",
    "YER039C-A",
    "YER053C-A",
    "YGL226C-A",
    "YGL194C-A",
    "YGR105W",
    "YIL134C-A",
    "YJL205C",
    "YJL127C-B",
    "YJR010C-A",
    "YKL065W-A",
    "YLR342W-A",
    "YMR256C",
    "YOR032W-A",
    "YPL096C-A",
    "YPR010C-A",
    "YPR159C-A",
    "YPR170W-B"
]

filtered = early_hydro[~early_hydro['gene'].isin(too_short)]

filtered.to_csv("../data/dependent_hydrophobic_genes.csv", sep=',', index=False)

####### Create the filtered fasta files ##########

dna_path = '../data/orf_genomic_all.fasta'
protein_path = '../data/orf_trans_all.fasta'

genes = filtered["gene"].tolist()
# Set copy of the gene names: every record of both FASTA files is tested for
# membership, and a set lookup avoids a linear scan of the list per record.
gene_ids = set(genes)

# filter DNA file: keep the records whose id is one of the selected genes
filtered_dna = [
    record
    for record in SeqIO.parse(dna_path, "fasta")
    if str(record.id) in gene_ids
]

out_path = "../data/dep_dna_filtered.fasta"
SeqIO.write(filtered_dna, out_path, "fasta")

# filter Proteins file: same selection applied to the translated sequences
filtered_proteins = [
    record
    for record in SeqIO.parse(protein_path, "fasta")
    if str(record.id) in gene_ids
]

out_path = "../data/dep_proteins_filtered.fasta"
# Left as the final expression so the cell displays the number of records written.
SeqIO.write(filtered_proteins, out_path, "fasta")


344

In [15]:
########## SRP-INDEPENDENT ##############

# Filter the TMHMM summary table to the SRP-independent genes (from Ast et al. 2013)
# that have a hydrophobic region (SP or TMD) starting within the first 60 amino acids,
# then write the matching subsets of the DNA and protein FASTA files.
from Bio import SeqIO
import pandas as pd

# NOTE(review): no cell visible in this part of the notebook writes this CSV
# (only the "dependent" table is written above) - confirm where it is generated.
path = "../data/tmhmm/independent_hydrophobic_tmhmm_all.csv"
df = pd.read_csv(path, sep=',')

# Rows flagged with a signal peptide and/or a transmembrane domain.
sp_or_tmd = df[(df['sp'] == True) | (df['tmd'] == True)]

# Rows whose first hydrophobic region starts at position 60 or earlier.
early_hydro = sp_or_tmd[sp_or_tmd['first_hydrophobic_start'] <= 60.0]

# Genes excluded from the selection by name.
# NOTE(review): presumably too short for the window analyses, as in the
# SRP-dependent cell - confirm.
too_short = [
    "YAR020C",
    "YDR119W-A",
    "YDR182W-A",
    "YDR524C-B",
    "YJL052C-A",
    "YLR406C-A",
    "YMR251W-A",
    "YNL024C-A",
    "YOL052C-A"
]

filtered = early_hydro[~early_hydro['gene'].isin(too_short)]

filtered.to_csv("../data/independent_hydrophobic_genes.csv", sep=',', index=False)

####### Create the filtered fasta files ##########

dna_path = '../data/orf_genomic_all.fasta'
protein_path = '../data/orf_trans_all.fasta'

genes = filtered["gene"].tolist()
# Set copy of the gene names: every record of both FASTA files is tested for
# membership, and a set lookup avoids a linear scan of the list per record.
gene_ids = set(genes)

# filter DNA file: keep the records whose id is one of the selected genes
filtered_dna = [
    record
    for record in SeqIO.parse(dna_path, "fasta")
    if str(record.id) in gene_ids
]

out_path = "../data/indep_dna_filtered.fasta"
SeqIO.write(filtered_dna, out_path, "fasta")

# filter Proteins file: same selection applied to the translated sequences
filtered_proteins = [
    record
    for record in SeqIO.parse(protein_path, "fasta")
    if str(record.id) in gene_ids
]

out_path = "../data/indep_proteins_filtered.fasta"
# Left as the final expression so the cell displays the number of records written.
SeqIO.write(filtered_proteins, out_path, "fasta")

299