In [None]:
import numpy as np
import pandas as pd
import datetime
from google.colab import drive
import os

In [None]:
!pip install biopython

!pip install obonet


Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Collecting obonet
  Downloading obonet-1.0.0-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.0.0


In [None]:
from Bio import SeqIO

Reading the data from Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset folders used
# below live under this mount point.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Dataset folders on the mounted Drive. train_root is joined with the file
# names read below (ontology, FASTA, term table) and is also where cached
# artefacts are written.
train_root = '/content/drive/My Drive/Train'
# NOTE(review): presumably the folder holding the prediction targets; it is
# not read anywhere in the visible cells.
test_root = '/content/drive/My Drive/Test (Targets)'

In [None]:
import networkx
import obonet

# Read the Gene Ontology graph from go-basic.obo in the training folder.
# go_graph is walked later with go_graph.nodes(data=True) to look up the
# human-readable name of each GO term.
with open(os.path.join(train_root, "go-basic.obo")) as obo_file:
    go_graph = obonet.read_obo(obo_file)

Taking MFO as our sub-ontology of interest

In [None]:
# Aspect code used to filter train_terms["aspect"]; it is also embedded in the
# file names of the cached .npy and .pkl artefacts written further down.
Sub_Ontology='MFO'

Reading the train_fasta sequences

In [None]:
# Parse the training FASTA once and keep the records in memory, then derive two
# parallel lists: sequences[i] is the sequence whose identifier is ids[i].
fasta_path = os.path.join(train_root, "train_sequences.fasta")
train_sequences_fasta = list(SeqIO.parse(fasta_path, "fasta"))

sequences = [record.seq for record in train_sequences_fasta]
ids = [record.id for record in train_sequences_fasta]


Reading the training terms

In [None]:
# Load the tab-separated annotation table (one row per protein/GO-term pair)
# and preview its first rows.
terms_path = os.path.join(train_root, "train_terms.tsv")
train_terms = pd.read_csv(terms_path, sep="\t")
train_terms.head()

Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO
2,A0A009IHW8,GO:0072523,BPO
3,A0A009IHW8,GO:0044270,BPO
4,A0A009IHW8,GO:0006753,BPO


In [None]:
# Restrict the annotations to the selected sub-ontology and key the rows by
# protein identifier (EntryID); a protein appears once per annotated term.
aspect_mask = train_terms["aspect"] == Sub_Ontology
train_terms_MFO = train_terms.loc[aspect_mask].set_index('EntryID')

# Distinct GO terms in order of first appearance, and the full (repeated)
# term column as a plain list.
uniqueTerms = train_terms_MFO["term"].unique()
termsArr = list(train_terms_MFO["term"].to_numpy())

# Position -> GO term lookup over the distinct terms.
unique_go = dict(enumerate(uniqueTerms))

print(train_terms_MFO.shape)

(670114, 2)


In [None]:
# Use the first protein identifier in the filtered table as a spot-check example.
testID = train_terms_MFO.index[0]
testID

'A0A009IHW8'

# GO Analysis

In [None]:
# Number of annotation rows per GO term. value_counts() orders the result from
# the most to the least frequent term, so item_counts.index is frequency-sorted.
item_counts = train_terms_MFO["term"].value_counts()


In [None]:
# Map every GO identifier in the ontology graph to its human-readable name.
# Nodes without a 'name' attribute map to None (the dict.get default).
# The comprehension keeps its variables local, so the builtin `id` is no
# longer overwritten at notebook level.
GO_Name = {
    go_id: node_attrs.get('name')
    for go_id, node_attrs in go_graph.nodes(data=True)
}

# NOTE(review): presumably meant as the reverse (name -> GO id) lookup; it is
# created empty and not filled in this cell.
Name_GO = {}



# Label encoding

In [None]:
# Extract per-term label weights from IA.txt (tab-separated, no header row;
# column 0 holds the GO id, the next column the weight).
# NOTE(review): IA.txt is read relative to the current working directory,
# unlike the other inputs, which are read from train_root. Confirm intended.
dfIA = pd.read_csv("IA.txt", sep='\t', header=None).set_index(0)

# All GO ids that have a weight in IA.txt.
allGOs = dfIA.index.tolist()

# Build the GO id -> weight lookup once. A dict lookup per term replaces
# scanning the allGOs list and issuing a DataFrame .loc call for every term.
iaByTerm = dict(zip(dfIA.index, dfIA.iloc[:, 0]))

# One weight per entry of item_counts.index, in the same (frequency) order.
# Terms missing from IA.txt get weight 0 and are tallied in notFound.
label_weights = []
notFound = 0
for go in item_counts.index.to_list():
    weight = iaByTerm.get(go)
    if weight is None:
        notFound += 1
        label_weights.append(0)
    else:
        label_weights.append(weight)



Getting the top GO terms (those whose IA weight exceeds the threshold)

In [None]:
# Keep only the GO terms whose label weight is strictly above the threshold.
# label_weights is aligned with item_counts.index, so the boolean mask can be
# applied directly to the frequency-ordered term list.
threshold = 0
hasWeight = np.array(label_weights) > threshold
frequent_GO_Terms = np.array(item_counts.index.to_list())[hasWeight]

frequent_GO_Terms

array(['GO:0005488', 'GO:0005515', 'GO:0003824', ..., 'GO:0097162',
       'GO:0048472', 'GO:0102628'], dtype='<U10')

In [None]:
# Number of annotation rows for each term in frequent_GO_Terms, cached on Drive.
# dfGO is the sub-ontology table re-keyed by GO term.
dfGO = train_terms_MFO.copy(deep=True)
dfGO.set_index("term", inplace=True)

file_path = os.path.join(train_root, f"GODataSizes_{Sub_Ontology}.npy")

GODataSizes = None
if os.path.exists(file_path):
    cachedSizes = np.load(file_path)
    # The cache file is keyed only by sub-ontology. Reuse it only if it still
    # has one entry per term in frequent_GO_Terms; otherwise a stale file would
    # silently misalign (or break) the boolean masking in the next cell.
    if cachedSizes.shape == (len(frequent_GO_Terms),):
        print("Loading presaved data")
        GODataSizes = cachedSizes

if GODataSizes is None:
    # Count rows per term in a single pass instead of one .loc lookup per
    # term. This matches the previous dfGO.loc[g].size values as long as dfGO
    # has a single remaining column, which is the case when train_terms_MFO
    # holds only `term` and `aspect`.
    rowsPerTerm = dfGO.index.value_counts()
    GODataSizes = rowsPerTerm.reindex(frequent_GO_Terms, fill_value=0).to_numpy()
    np.save(file_path, GODataSizes)

Loading presaved data


In [None]:
# Keep the GO terms that have more than 10 annotation rows and report how many
# terms survive the cut.
GODataSizes = np.array(GODataSizes)
enoughSamples = GODataSizes > 10
print(np.count_nonzero(enoughSamples))
OptimalGOs = frequent_GO_Terms[enoughSamples]

2099


In [None]:
import pickle

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the retained GO terms so that every term gets a fixed
# column in the multi-label encoding.
mlb = MultiLabelBinarizer()
mlb.fit([OptimalGOs])

# Spot check: encode the terms of one known protein and show the class count.
dftest = train_terms_MFO.loc[testID]
indices = dftest["term"].to_numpy()
print(indices)
print(mlb.transform([indices]))
print(len(mlb.classes_))

# Persist the fitted binarizer next to the training data for later reuse.
mlb_path = os.path.join(train_root, 'MLB_' + Sub_Ontology + '.pkl')
with open(mlb_path, 'wb') as f:
    pickle.dump(mlb, f)

['GO:0003674' 'GO:0003953' 'GO:0016787' 'GO:0016799' 'GO:0016798'
 'GO:0003824']
[[0 0 0 ... 0 0 0]]
2099




# Amino Acid Encoding

In [None]:
# Integer code for every residue letter that may appear in a sequence.
# The codes are unique but not contiguous (X is 30); the keys of this dict
# also define the alphabet used to enumerate k-mers.
amino_acid_enc = {
    'A': 1, 'B': 24, 'C': 2, 'D': 3, 'E': 4,
    'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9,
    'L': 10, 'M': 11, 'N': 12, 'O': 21, 'P': 13,
    'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 22,
    'V': 18, 'W': 19, 'Y': 20, 'X': 30, 'Z': 23,
}


Applying k-mer process

In [None]:
import warnings
from itertools import product
from tqdm import tqdm
import random

1. **Amino Acid Encoding**:
   - amino_acid_enc is a dictionary mapping each amino acid to a unique integer. This is used to numerically encode amino acids for machine learning models which cannot work directly with character data.

2. **K-mer Length Definition**:
   - k = 3 sets the length of the k-mers that will be generated. A k-mer is a substring of length k, which in this context, refers to a sequence of three amino acids.

3. **Generating All Possible K-mers**:
   - allAA = sorted(amino_acid_enc.keys()) creates a sorted list of all amino acids (characters).
   - allCombinations is a NumPy array holding every possible k-mer. It is built with the product function from the itertools module, which yields every ordered combination of k amino-acid characters.

4. **Mapping K-mers to Indices**:
   - positionDict is a dictionary that maps each k-mer to a unique index. getKmers uses this index as the k-mer's position in a fixed-length count vector, so every amino acid sequence is transformed into a numerical vector of the same length.

5. **Vectorizing the Mapping Function**:
   - vectMapping uses np.vectorize to create a vectorized function that applies the amino_acid_enc mapping to each character in a sequence. This is used to convert entire sequences of amino acids into numerical form, one character at a time.

6. **Shuffling the Data**:
   - The data (presumably pairs of sequences and their associated IDs) is shuffled to ensure that the training and validation sets are randomly selected. This helps to prevent overfitting and ensure that the model generalizes well. A fixed seed (random.seed(20)) is used for reproducibility of results.

7. **Training and Validation Split**:
   - TRAIN_VAL_SPLIT = 0.7 defines the proportion of data to be used for training (70% in this case).
   - split calculates the index at which to split the shuffled list into training and validation sets based on the proportion defined.
   - trainSeq, valSeq, trainIds, valIds are the resulting splits of the sequences and IDs, with the first 70% of the data going into the training set, and the remaining 30% going into the validation set.



In [None]:
# Requires amino_acid_enc, sequences and ids from the earlier cells.
k = 3  # k-mer length

# Alphabet: the residue letters of amino_acid_enc in sorted order.
allAA = sorted(amino_acid_enc)

# Every possible k-mer over that alphabet, as a NumPy array of strings.
allCombinations = np.array(list(map(''.join, product(allAA, repeat=k))))

# k-mer -> position of that k-mer in allCombinations.
positionDict = {kmer: position for position, kmer in enumerate(allCombinations)}

# Element-wise residue-letter -> integer-code mapping.
vectMapping = np.vectorize(lambda residue: amino_acid_enc[residue])

# Shuffle sequences and ids together (as pairs) with a fixed seed so the
# split below is the same on every run.
random.seed(20)
c = list(zip(sequences, ids))
random.shuffle(c)
sequencesShuffle, idsShuffle = zip(*c)

# The first 70% of the shuffled data is used for training, the rest for validation.
TRAIN_VAL_SPLIT = 0.7
split = int(np.floor(len(sequencesShuffle) * TRAIN_VAL_SPLIT))

trainSeq, valSeq = sequencesShuffle[:split], sequencesShuffle[split:]
trainIds, valIds = idsShuffle[:split], idsShuffle[split:]

# One-Vs-Rest Logistic Classifier

In [None]:
import numpy as np
from tqdm import tqdm

# The GO term to build a binary (one-vs-rest) classifier for, picked by its
# position in the fitted binarizer's class list.
TARGET_CLASS_INDEX = 671
target = mlb.classes_[TARGET_CLASS_INDEX]

# Protein IDs of the sub-ontology table, one entry per annotation row (so an
# ID is repeated once for every term the protein carries).
soEntryIds = train_terms_MFO.index.to_list()

def getKmers(seq, k, positionDict, allCombinations):
    """Count how often each known k-mer occurs in a sequence.

    Args:
        seq: The sequence to scan; anything whose ``str()`` is the residue
            string (for example a plain ``str`` or a Biopython ``Seq``).
        k: Window length; must be at least 1.
        positionDict: Maps a k-mer string to its slot in the output vector.
        allCombinations: Sized collection of all k-mers. Only its length is
            used, to size the output vector.

    Returns:
        A float NumPy vector of length ``len(allCombinations)`` in which entry
        ``positionDict[kmer]`` holds the number of (overlapping) occurrences of
        ``kmer`` in ``seq``. Windows that are not keys of ``positionDict`` are
        ignored, and a sequence shorter than ``k`` yields an all-zero vector.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Convert once so every window is a cheap string slice rather than a new
    # sequence object that has to be stringified separately.
    residues = str(seq)
    freqVector = np.zeros(len(allCombinations))

    # There are exactly len(residues) - k + 1 full windows; the range is empty
    # when the sequence is shorter than k.
    for start in range(len(residues) - k + 1):
        slot = positionDict.get(residues[start:start + k])
        if slot is not None:  # slot 0 is valid, so compare against None
            freqVector[slot] += 1
    return freqVector

# Build the binary training set for `target`.
#
# Every training protein annotated with `target` becomes a positive sample
# (label 1). A protein without the annotation is kept as a negative sample
# (label 0) only while positives make up more than 20% of the samples
# collected so far, which caps the class imbalance; otherwise it is skipped.
#
# Membership is decided with one set lookup per protein. The set holds the IDs
# of all annotation rows whose term equals `target`, which gives the same
# answer as collecting each protein's terms and testing `target in terms`.
# It avoids scanning the full ID list for every protein, and it does not fail
# for a protein with a single annotation row (where .loc yields a scalar term).
isTargetRow = (train_terms_MFO["term"] == target).to_numpy()
positiveIds = set(train_terms_MFO.index[isTargetRow])

X = []
y = []
positiveClassCount = 0

for i, seq in enumerate(tqdm(trainSeq)):
    entryId = trainIds[i]
    if entryId in positiveIds:
        label = 1
        positiveClassCount += 1
    elif 0.2 * len(y) < positiveClassCount:
        label = 0
    else:
        continue  # negative quota currently exhausted
    X.append(getKmers(seq, k, positionDict, allCombinations))
    y.append(label)

X = np.array(X)
y = np.array(y)


100%|██████████| 99572/99572 [19:49<00:00, 83.69it/s] 


In [None]:
from tqdm import tqdm
import numpy as np

# Build the binary validation set for `target` from the held-out proteins.
#
# Labels come from train_terms_MFO, the same table used for the training set.
# Every held-out protein annotated with `target` becomes a positive sample
# (label 1). A protein without the annotation is kept as a negative sample
# (label 0) only while fewer than twice as many samples as positives have
# been collected, i.e. while positives are more than half of the set.
#
# Membership is decided with one set lookup per protein. The set holds the IDs
# of all annotation rows whose term equals `target`, which gives the same
# answer as collecting each protein's terms and testing `target in terms`.
# It avoids scanning the full ID list for every protein, and it does not fail
# for a protein with a single annotation row (where .loc yields a scalar term).
isTargetRowVal = (train_terms_MFO["term"] == target).to_numpy()
positiveIdsVal = set(train_terms_MFO.index[isTargetRowVal])

Xval = []
yval = []
positiveClassCountVal = 0

for i, seq in enumerate(tqdm(valSeq)):
    entryId = valIds[i]
    if entryId in positiveIdsVal:
        label = 1
        positiveClassCountVal += 1
    elif len(yval) < 2 * positiveClassCountVal:
        label = 0
    else:
        continue  # negative quota currently exhausted
    Xval.append(getKmers(seq, k, positionDict, allCombinations))
    yval.append(label)

Xval = np.array(Xval)
yval = np.array(yval)


100%|██████████| 42674/42674 [08:17<00:00, 85.74it/s] 


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostClassifier

# Standardise the k-mer count features, then fit a liblinear logistic
# regression on the balanced training set and report validation accuracy.
#
# Alternatives that were tried in place of this pipeline:
#   - LogisticRegression() without scaling
#   - svm.SVC(probability=True)
#   - MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4, 64))
#   - StandardScaler + AdaBoostClassifier(n_estimators=50)
#   - StandardScaler + MLPClassifier(solver='lbfgs', random_state=854,
#     hidden_layer_sizes=(32, 32, 32))
scaler = preprocessing.StandardScaler()
logreg = LogisticRegression(solver="liblinear")
clf = make_pipeline(scaler, logreg)

clf.fit(X, y)
print(clf.score(Xval, yval))

0.7941176470588235


In [None]:
def ClassificationScores(yTrue, yPred):
    """Compute precision, recall and F1 for binary labels.

    Args:
        yTrue: Ground-truth labels; any array-like whose truthy entries mark
            the positive class.
        yPred: Predicted labels, same length as ``yTrue``.

    Returns:
        A ``(precision, recall, f1)`` tuple. A tiny constant in every
        denominator makes a score 0 instead of raising when there are no
        predicted positives or no actual positives.
    """
    actual = np.asarray(yTrue, dtype="bool")
    predicted = np.asarray(yPred, dtype="bool")

    tp = np.sum(actual & predicted)    # predicted positive, actually positive
    fp = np.sum(~actual & predicted)   # predicted positive, actually negative
    fn = np.sum(actual & ~predicted)   # predicted negative, actually positive

    prec = tp / (tp + fp + 1e-20)
    rec = tp / (tp + fn + 1e-20)
    f1 = 2 * (prec * rec) / (prec + rec + 1e-20)
    return prec, rec, f1

In [None]:
# Score the fitted classifier's predictions on the validation samples; the
# displayed tuple is the (prec, rec, f1) triple returned by ClassificationScores.
ClassificationScores(yval, clf.predict(Xval))


(0.8823529411764706, 0.75, 0.8108108108108107)

Logistic regression (LR) gave better accuracy than the other classifiers tried.