# **DATA PREPARATION**

#### Bash commands for MMseqs2
##### These commands copy the dataset files to the VM and then run MMseqs2 on them

In [None]:
import glob
import shlex
import subprocess

# Copy the dataset files to the VM.
# subprocess.run() is called without a shell, so the "*" wildcard is expanded
# here with glob; otherwise it would reach scp unexpanded.
dataset_files = sorted(glob.glob("/path/to/LB2_project_group_4/Data_collection/*"))
subprocess.run(
    ["scp", "-i", "um14_id_rsa", *dataset_files,
     "um14@m14.lsb.biocomp.unibo.it:/home/um14/data"],
    stdout=subprocess.DEVNULL,
)
# VM connection
# NOTE(review): "lab2vm" looks like a local alias/helper for opening the VM
# session; if so, the mmseqs command below still runs on the local machine
# rather than on the VM -- confirm how this cell is meant to be used.
subprocess.run(shlex.split("lab2vm"), stdout=subprocess.DEVNULL)
# MMseqs2 execution command.
# The "\ " left over from a shell line continuation has been removed:
# shlex.split() read it as an escaped space and produced the argument " -c"
# instead of the "-c" option.
subprocess.run(shlex.split("mmseqs easy-cluster input.fa cluster-results tmp --min-seq-id 0.3 -c 0.4 --cov-mode 0 --cluster-mode 1"), stdout=subprocess.DEVNULL)



#### Retaining only non-redundant proteins in the datasets
##### MMseqs2 identifies the proteins that must be retained, called representatives
##### In this section we filter the original datasets, retaining only the representatives

In [1]:
def representative_proteins(rep_fasta,sp_tsv,output_tsv,type):
  """Keep only the dataset rows whose ID appears in a FASTA file of representatives.

  Parameters
  ----------
  rep_fasta : path of the FASTA file of representative sequences. The ID of a
      record is the first whitespace-delimited token of its ">" header line,
      with the leading ">" removed.
  sp_tsv : path of the dataset to filter. The ID of a row is its first
      whitespace-delimited token.
  output_tsv : path of the filtered file to write (opened in "w" mode).
  type : "positive" or "negative"; selects the header line written at the top
      of the output. Any other value writes no header line.

  Prints the number of rows kept and the number of FASTA headers read.
  Returns None.
  """
  # A set gives a constant-time membership test for every dataset row, where a
  # list would be scanned from the start each time.
  representative_ids = set()
  rep = 0
  with open(rep_fasta, "r") as fasta:
    for line in fasta:
      if line.startswith(">"):
        rep += 1
        representative_ids.add(line.split()[0][1:])  # storing only Uniprot IDs

  filtered = 0
  with open(output_tsv, "w") as output:
    if type == "positive":
      print("protein_id", "organism_name", "kingdom", "protein_length", "pos_cleavage_site", sep="\t", file=output)
    elif type == "negative":
      print("protein_id", "organism_name", "kingdom", "protein_length", "TM_helix_presence", sep="\t", file=output)
    with open(sp_tsv, "r") as dataset:
      for line in dataset:
        fields = line.split(None, 1)
        # Blank lines have no ID: skip them instead of failing on fields[0].
        if fields and fields[0] in representative_ids:
          filtered += 1
          output.write(line)
  print("Filtered dataset",filtered)
  print("Representatives count",rep)
# Filter both datasets, printing a title before each report and a blank line
# between the two.
filter_jobs = (
    ("Positive", "/content/cluster-results_positive_rep_seq.fasta", "/content/sp_positive.tsv", "/content/sp_positive_rep.tsv", "positive"),
    ("Negative", "/content/cluster-results_negative_rep_seq.fasta", "/content/sp_negative.tsv", "/content/sp_negative_rep.tsv", "negative"),
)
for job_index, (job_title, *job_args) in enumerate(filter_jobs):
  if job_index > 0:
    print()
  print(job_title)
  representative_proteins(*job_args)

Positive
Filtered dataset 1093
Representatives count 1093

Negative
Filtered dataset 8934
Representatives count 8934


#### Data splitting into training and benchmarking
##### These commands split the filtered data into training and test sets in an 80/20 proportion
##### The training set is further split into 5 cross-validation subsets

In [2]:
import pandas as pd

# Load the filtered (representatives-only) datasets.
positive = pd.read_csv("/content/sp_positive_rep.tsv", sep="\t")
negative = pd.read_csv("/content/sp_negative_rep.tsv", sep="\t")

# Output files for the labelled splits. They are written under /content, the
# directory the inputs are read from and where the summary cell later opens
# them; a bare file name would instead depend on the current working directory.
positive_file = "/content/positive_set.tsv"
negative_file = "/content/negative_set.tsv"

def training_test_split(input, output_file,type):
  """Shuffle a dataset and write it out labelled with its split and CV subset.

  Parameters
  ----------
  input : pandas DataFrame holding the dataset rows.
  output_file : path of the tab-separated file to write (opened in "w" mode).
  type : "positive" or "negative"; selects the header line. Any other value
      writes no header line.

  The rows are shuffled with a fixed seed (random_state=42). Every row is
  written followed by two extra fields:
    * "Training" plus a subset label "1".."5" for the first nr_trainig + 1
      shuffled rows, where nr_trainig = round((n_rows - 1) * 80 / 100);
      subsets 1-4 hold round(nr_trainig / 5) rows each and subset 5 takes
      whatever training rows remain;
    * "Test" plus "0" for every later row.

  Prints "Completed for <type>" and returns None.
  """
  shuffled = input.sample(frac=1, random_state=42).reset_index(drop=True)
  nr_trainig = round((len(shuffled) - 1) * 80 / 100)
  nr_cv = round(nr_trainig / 5)

  # Only the fifth column name differs between the two header variants.
  fifth_column = None
  if type == "positive":
    fifth_column = "pos_cleavage_site"
  elif type == "negative":
    fifth_column = "TM_helix_presence"

  with open(output_file, "w") as f1:
    if fifth_column is not None:
      print("protein_id", "organism_name", "kingdom", "protein_length", fifth_column, "class", "cv_subset", sep="\t", file=f1)
    # position is the 1-based rank of the row in the shuffled order.
    for position, (_, row) in enumerate(shuffled.iterrows(), start=1):
      fields = row.tolist()
      if position > nr_trainig + 1:
        fields.extend(["Test", "0"])
      else:
        # First subset whose upper bound covers this position; otherwise "5".
        subset = next((str(k) for k in range(1, 5) if position <= nr_cv * k), "5")
        fields.extend(["Training", subset])
      print(*fields, sep="\t", file=f1)
    print("Completed for", type)
  return

# Split both datasets, positive first.
for split_frame, split_path, split_label in (
    (positive, positive_file, "positive"),
    (negative, negative_file, "negative"),
):
  training_test_split(split_frame, split_path, split_label)

Completed for positive
Completed for negative


#### Printing results
##### The following function prints the results so we can check that the requested proportions between training and test sets are respected

In [3]:
def printing_info(input_file):
  """Print how many rows of a split file fall in each CV subset and in the test set.

  Parameters
  ----------
  input_file : path of a tab-separated file whose last field on each row is the
      subset label: "1".."5" for the cross-validation subsets, "0" for the test
      set. Rows with any other last field (such as a header line) are ignored.

  Prints the total, the training total, one count per CV subset and the test
  count. Returns None.
  """
  subset_counts = {label: 0 for label in ("1", "2", "3", "4", "5", "0")}
  with open(input_file, "r") as split_file:
    for line in split_file:
      # Read the whole last field rather than a fixed character offset, so a
      # final line without a trailing newline is still counted and a
      # multi-character label is not mistaken for its last digit.
      label = line.rstrip("\r\n").rsplit("\t", 1)[-1]
      if label in subset_counts:
        subset_counts[label] += 1

  count_test = subset_counts["0"]
  count_training = sum(subset_counts[str(k)] for k in range(1, 6))
  print("Total data: ",count_training+count_test)
  print("Total training data: ",count_training)
  print("CV set 1: ",subset_counts["1"])
  print("CV set 2: ",subset_counts["2"])
  print("CV set 3: ",subset_counts["3"])
  print("CV set 4: ",subset_counts["4"])
  print("CV set 5: ",subset_counts["5"])
  print("Total test data: ",count_test)

# Summarise both split files, with a blank line between the two reports.
summary_jobs = (
    ("Positive data", "/content/positive_set.tsv"),
    ("Negative data", "/content/negative_set.tsv"),
)
for summary_index, (summary_title, summary_path) in enumerate(summary_jobs):
  if summary_index > 0:
    print()
  print(summary_title)
  printing_info(summary_path)

Positive data
Total data:  1093
Total training data:  875
CV set 1:  175
CV set 2:  175
CV set 3:  175
CV set 4:  175
CV set 5:  175
Total test data:  218

Negative data
Total data:  8934
Total training data:  7147
CV set 1:  1429
CV set 2:  1429
CV set 3:  1429
CV set 4:  1429
CV set 5:  1431
Total test data:  1787
