In [1]:
import pandas
import numpy
import subprocess
from concurrent.futures import ThreadPoolExecutor
import os
from intermediate import Intermediate

Performs the initial read-in of the BioPlex 3.0 and UniProt data, and filters the interactions into those shared between the cell lines and those unique to each cell line.
\
\
The default input of the class is a list of file paths, as follows:
\
input=[ "Data/BioPlex_293T_Network_10K_Dec_2019.tsv", "Data/BioPlex_HCT116_Network_5.5K_Dec_2019.tsv", "Data/Huttlin_BioPlex3_Table_S1.xlsb", "Data/uniprotkb_AND_reviewed_true_AND_model_o_2024_02_28.fasta",]
\
\
This list may be changed if necessary.
\
\
`write` may be set to `True` to write CSV files of the unique and shared interactions to the folder '/Data/Interactions'.

In [2]:
#Generates necessary interaction and sequence data.
#NOTE: write is passed as the boolean False. The string 'False' is a non-empty
#string and therefore truthy, so it would not reliably disable writing.
seq_data = Intermediate(write=False)

#Creates list of all proteins present in interaction data (293T-only, HCT116-only and shared), without duplicates
all_proteins = pandas.concat(
    [seq_data.proteins_293T, seq_data.proteins_HCT116, seq_data.proteins_shared]
).drop_duplicates()

#Keeps only the sequence rows whose ID occurs in the interaction data
filt = seq_data.sequence_df['ID'].isin(all_proteins)
protein_sequences = seq_data.sequence_df[filt]

Functions using `concurrent.futures.ThreadPoolExecutor` to run IUPred3 predictions in parallel.
\
This greatly lowers the total runtime of the prediction, since IUPred3 can only process one sequence at a time, but it of course requires more computational resources.

In [3]:
#Extracts the IUPRED2 score column from the raw text printed by IuPred3
def _parse_iupred_scores(decoded):
    """Return the IUPRED2 column of an IuPred3 text report as a list of strings.

    The table is located by its 'POS<tab>RES<tab>IUPRED2' header line. Rows are
    read from the line after the header until the first blank line (or the end
    of the text). Line endings are handled with str.splitlines(), so both
    '\\n' and '\\r\\n' terminated output parse identically.

    Raises:
        ValueError: if the header is missing, a row does not have exactly
            three tab-separated fields, or the table contains no rows.
    """
    header = "POS\tRES\tIUPRED2"
    lines = decoded.splitlines()

    header_row = next((i for i, line in enumerate(lines) if header in line), None)
    if header_row is None:
        raise ValueError('IuPred3 output does not contain the POS/RES/IUPRED2 header')

    scores = []
    for line in lines[header_row + 1:]:
        if not line.strip():
            #A blank line after the first data row marks the end of the table
            if scores:
                break
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            raise ValueError(f'Unexpected IuPred3 output row: {line!r}')
        scores.append(fields[2])

    if not scores:
        raise ValueError('IuPred3 output contains no prediction rows')
    return scores

#Runs IuPred3 on one sequence
def process_sequence(sequence):
    """Run an IuPred3 'long' prediction for a single protein sequence.

    Args:
        sequence: a two-item sequence (ID, sequence_text).

    Returns:
        A tuple (ID, scores) where scores is the list of IUPRED2 values, kept
        as the strings printed by IuPred3, one per row of the output table.

    Raises:
        RuntimeError: if the IuPred3 process exits with a non-zero status.
        ValueError: if the IuPred3 output cannot be parsed.
    """
    ID, sequence_text = sequence

    #Creates sequence specific temp-file (and its folder, if it is missing)
    temp_file = f'Data/Temp/{ID}_sequence.fasta'
    os.makedirs(os.path.dirname(temp_file), exist_ok=True)

    try:
        with open(temp_file, 'w') as file:
            file.write(f'>{ID}\n{sequence_text}\n')

        #Runs 'long' prediction with IuPred3 from provided sequence stored in temp-file
        result = subprocess.run(['python', 'iupred3/iupred3.py', temp_file, 'long'], stdout=subprocess.PIPE)
    finally:
        #Removes temp-file even when writing or the prediction itself fails
        if os.path.exists(temp_file):
            os.remove(temp_file)

    if result.returncode != 0:
        raise RuntimeError(f'IuPred3 exited with status {result.returncode} for sequence {ID}')

    #Decodes output; undecodable bytes are replaced rather than aborting the run,
    #since only the tab-separated prediction table is used
    decoded = result.stdout.decode('utf-8', errors='replace')

    #Picks out and returns ID and prediction values
    return ID, _parse_iupred_scores(decoded)

#Uses concurrent.futures ThreadPoolExecutor to run several predictions at a time
def iupred3_prediction(sequence_df):
    """Run process_sequence for every row of sequence_df on a thread pool.

    Args:
        sequence_df: DataFrame with an 'ID' column and a 'Sequence' column.

    Returns:
        A DataFrame built with orient='index': one row per ID, holding the
        prediction values returned by process_sequence for that ID.
    """
    #One [ID, sequence] pair per row of the input frame
    id_sequence_pairs = sequence_df[['ID', 'Sequence']].values.tolist()

    #Each pair is handed to process_sequence on a worker thread; leaving the
    #with-block waits for all submitted predictions to finish
    with ThreadPoolExecutor() as pool:
        outcomes = pool.map(process_sequence, id_sequence_pairs)

    #(ID, predictions) pairs --> dictionary --> dataframe
    predictions_by_id = dict(outcomes)
    return pandas.DataFrame.from_dict(predictions_by_id, orient='index')

Runs the IUPred3 prediction for all proteins in the BioPlex 3.0 interaction data and writes the result to an HDF5 file.

!!!OBSERVE!!! One run may take as much as 2 hours, depending on your PC's specifications. This is why the cell below is commented out. You have been warned.

In [4]:
# prediction_df = iupred3_prediction(protein_sequences).transpose()
# prediction_df.to_hdf('Data/Predictions/iupred3_prediction.h5', key = 'prediction_df', mode = 'w')