<a href="https://colab.research.google.com/github/geovalexis/TFG/blob/main/notebooks/machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning

## With single classification models

When a gene has several GO terms, they are reduced to just one: the Lowest Common Ancestor (LCA). 

## With multi-label (various GO terms) classification models

### Helper functions

In [None]:
import numpy as np
import pandas as pd

def filterOutByFrequency(column: pd.Series, min_threshold: int = None, max_threshold: int = None) -> pd.Series:
    """Drop from every row the elements whose overall frequency is out of range.

    Parameters
    ----------
    column: A pandas Series whose values are list-like collections of elements
    min_threshold: (optional) Elements occurring fewer times than this across the whole
        column are removed. A falsy value (None or 0) disables the lower bound.
    max_threshold: (optional) Elements occurring more times than this across the whole
        column are removed. A falsy value (None or 0) disables the upper bound.
    Returns
    -------
    A new pandas Series, indexed like `column`, whose values are the arrays produced by
    np.setdiff1d (sorted, without duplicates) holding the elements that were kept.
    """
    elements_counts = column.explode().value_counts()
    # idxmax()/idxmin() raise on an empty Series, so only report when something was counted.
    if not elements_counts.empty:
        print(f"The element with the maximum number of ocurrences is {elements_counts.idxmax()} with {elements_counts.max()} ocurrences.", flush=True)
        print(f"The element with the minimum number of ocurrences is {elements_counts.idxmin()} with {elements_counts.min()} ocurrences.", flush=True)
    # A missing bound falls back to the observed extreme, which filters nothing on that side.
    min_threshold = min_threshold if min_threshold else elements_counts.min()
    max_threshold = max_threshold if max_threshold else elements_counts.max()
    out_elements = elements_counts[(elements_counts < min_threshold) | (elements_counts > max_threshold)].index
    # apply() already builds a new Series, so the input column is left untouched.
    return column.apply(lambda row_elements: np.setdiff1d(row_elements, out_elements))

In [None]:
# DEPRECATED
import pandas
# Collapse the exploded one-hot rows back into a single row per gene. Every exploded row
# flags exactly one GO term, so the column-wise maximum keeps all the terms of the gene
# (taking the first row of each group would keep only its first term).
y_training_matrix_one_hot_encoded = pd.get_dummies(y_training_matrix["GO_IDs"].explode(),
                                                   prefix="", prefix_sep=""
                                                   ).reset_index().groupby("index").max()


### Parse Phylogenetic Profiling Matrix

In [None]:
# 1st version of the PP matrix 
# We need to assign GO terms to profiling matrix
import json
import pandas as pd
profiling_matrix = getOrthologsPresenceMatrix()
# The handle is not called `input`: at notebook scope that name would replace the built-in
# input() for every cell executed afterwards.
with open("drive/MyDrive/TFG/human_genes2GOtermIDs.json", "r") as go_terms_file:
    human_gene2GOterms = json.load(go_terms_file)
# Genes with a missing or empty entry get an empty list; the rest get their GO terms as an array.
profiling_matrix["GO_IDs"] = profiling_matrix.index.map(lambda x: np.array(human_gene2GOterms.get(x)) if human_gene2GOterms.get(x) else [])

In [None]:
# 2nd version of the PP matrix
import pandas as pd

profiling_matrix = pd.read_table(
    "bsc_cluster/pp2go/results/MTP/MtP_201601_blasted-pp_matrix_counts.tab",
    index_col=0,
    header=0,
    # Splitting an empty cell yields [''] (length 1, not 0), so blank pieces are dropped
    # to leave a genuinely empty list in that case.
    converters={"GO_IDs": lambda cell: [piece for piece in cell.split(",") if piece]},
)
profiling_matrix

Unnamed: 0,189518,85962,208964,122586,83333,272561,226186,190304,324602,1111708,...,684364,164328,243090,69014,436308,374847,515635,330879,224911,GO_IDs
Q96T66,1,0,1,0,1,0,1,1,1,1,...,0,1,1,0,0,0,1,1,0,"[GO:0009165, GO:0019674]"
Q6IQ20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,"[GO:0001523, GO:0001659, GO:0048874, GO:005072..."
Q9H3J6,2,0,0,0,1,0,1,0,1,0,...,1,1,0,0,0,0,0,1,0,[]
Q14397,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,"[GO:0006110, GO:0006606, GO:0009750, GO:003313..."
Q8NFV4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A6NED7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
M0R036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
O14598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
Q5T7P6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]


In [None]:
# Keep only the annotated proteins, i.e. the rows whose GO_IDs list is not empty.
# (The final goal of the project is to annotate the others, but training the model needs resolved examples.)
profiling_matrix = profiling_matrix.loc[lambda frame: frame["GO_IDs"].str.len() > 0]
profiling_matrix

Unnamed: 0,189518,85962,208964,122586,83333,272561,226186,190304,324602,1111708,...,684364,164328,243090,69014,436308,374847,515635,330879,224911,GO_IDs
Q96T66,1,0,1,0,1,0,1,1,1,1,...,0,1,1,0,0,0,1,1,0,"[GO:0009165, GO:0019674]"
Q6IQ20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,"[GO:0001523, GO:0001659, GO:0048874, GO:005072..."
Q14397,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,"[GO:0006110, GO:0006606, GO:0009750, GO:003313..."
O76031,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,"[GO:0046034, GO:0051603]"
P00492,1,0,1,1,1,0,1,1,1,1,...,0,1,1,1,0,1,1,0,0,"[GO:0006164, GO:0006166, GO:0006178, GO:004310..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P01714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[GO:0006898, GO:0006955, GO:0006956, GO:000695..."
Q05315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[GO:0002667, GO:0002724, GO:0007275, GO:004600..."
Q3LHN2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[GO:0031424]
Q701N2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[GO:0031424]


In [None]:
# Remove the GO terms that occur fewer than 100 times across all proteins (no upper bound).
profiling_matrix = profiling_matrix.assign(
    GO_IDs=filterOutByFrequency(
        column=profiling_matrix["GO_IDs"],
        min_threshold=100,
        max_threshold=None,
    )
)
profiling_matrix

The element with the maximum number of ocurrences is GO:0045944 with 755 ocurrences.
The element with the maximum number of ocurrences is GO:0009438 with 1 ocurrences.


Unnamed: 0,189518,85962,208964,122586,83333,272561,226186,190304,324602,1111708,...,684364,164328,243090,69014,436308,374847,515635,330879,224911,GO_IDs
Q96T66,1,0,1,0,1,0,1,1,1,1,...,0,1,1,0,0,0,1,1,0,[]
Q6IQ20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,[]
Q14397,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,[]
O76031,1,1,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,[]
P00492,1,0,1,1,1,0,1,1,1,1,...,0,1,1,1,0,1,1,0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P01714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[GO:0006898, GO:0006955, GO:0030449, GO:003809..."
Q05315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[GO:0007275]
Q3LHN2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[GO:0031424]
Q701N2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[GO:0031424]


In [None]:
# Prepare training dataset 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# The occurrences filter may have stripped every GO term from some proteins; drop those rows.
training_matrix = profiling_matrix.loc[lambda frame: frame["GO_IDs"].str.len() > 0]

# Features are every column but the last one; the last column holds the GO_IDs (labels).
X_training_matrix = training_matrix.iloc[:, :-1]
y_training_matrix = training_matrix.iloc[:, -1:]

# The label lists have different lengths per protein, so they are binarized into a
# fixed-width indicator matrix (one column per GO term) that the models can accept.
mlb = MultiLabelBinarizer()
y_training_matrix_encoded = pd.DataFrame(
    data=mlb.fit_transform(y_training_matrix["GO_IDs"]),
    index=y_training_matrix.index,
    columns=mlb.classes_,
)

y_training_matrix_encoded

Unnamed: 0,GO:0000086,GO:0000122,GO:0000165,GO:0000184,GO:0000209,GO:0000398,GO:0001934,GO:0002223,GO:0002576,GO:0006281,...,GO:0055114,GO:0060271,GO:0061024,GO:0065003,GO:0070268,GO:0070374,GO:0072659,GO:0090090,GO:0090263,GO:1901796
Q13751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P51160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P35913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O00408,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9HCR9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P01714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q05315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q3LHN2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q701N2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model selection

In [None]:
# Support multilabel by default
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# No native multilabel support (multiclass only): need a OneVsRestClassifier or ClassifierChain wrapper
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.linear_model import RidgeClassifierCV
from sklearn.naive_bayes import MultinomialNB

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate


In [None]:
# Simple method (without cross-validation)
# Both the split and the forest are randomized; fixed seeds (8, the default seed of
# create_baseline_classifiers) make the score reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X_training_matrix, y_training_matrix_encoded, test_size=0.25, random_state=8)
model = RandomForestClassifier(random_state=8).fit(X_train, y_train)
y_test_predictions = model.predict(X_test)
# With multilabel targets accuracy_score is the subset accuracy: a sample only counts as
# correct when all of its labels are predicted exactly.
accuracy_score(y_test, y_test_predictions)

0.03076923076923077

In [None]:
# With cross-validation
# The forest is seeded so that the fold scores are reproducible.
# NOTE(review): cross_validate() fits clones of the estimator on each fold, so the fit on the
# full matrix does not influence the scores below; it only leaves `model` trained on all the data.
model = RandomForestClassifier(random_state=8).fit(X_training_matrix, y_training_matrix_encoded)
cross_validate(model, X_train, y_train, cv=5, scoring=("accuracy", "f1_macro"), n_jobs=-1) # We could also choose only one scoring method

{'fit_time': array([6.23813462, 6.21307921, 6.28599119, 6.15516257, 5.2181108 ]),
 'score_time': array([0.64078617, 0.64489555, 0.61125302, 0.61550689, 0.52815628]),
 'test_accuracy': array([0.03247863, 0.03162393, 0.02735043, 0.03333333, 0.03678358]),
 'test_f1_macro': array([0.11274907, 0.10547702, 0.12313474, 0.09820239, 0.08827066])}

In [None]:
# From https://towardsdatascience.com/simple-way-to-find-a-suitable-algorithm-for-your-data-in-scikit-learn-python-9a9710c7c0fe

def create_baseline_classifiers(seed=8):
    """Build the collection of baseline classifiers to compare.

    Parameters
    ----------
    seed: (optional) An integer used as random_state for the estimators that are given one
        (all of them except KNN and MultinomialNB)
    Returns
    -------
    A dict mapping each model's name to its estimator object.
    """
    return {
        # Inherently multilabel
        'Dummy': DummyClassifier(random_state=seed, strategy='prior'),
        # Random Forest consumes a lot of memory to be run in parallel
        'RandomForest': RandomForestClassifier(random_state=seed, n_jobs=1),
        'KNN': KNeighborsClassifier(n_jobs=1),
        'NeuralNetwork': MLPClassifier(random_state=seed),
        # No support for multilabel unless wrapped in OneVsRestClassifier or ClassifierChain
        'SupportVectorMachine': OneVsRestClassifier(SVC(random_state=seed, probability=True), n_jobs=1),
        'GradientBoosting': OneVsRestClassifier(GradientBoostingClassifier(random_state=seed), n_jobs=1),
        'MultinomialNB': OneVsRestClassifier(MultinomialNB(), n_jobs=1),
    }

def assess_models(X, y, models, cv=5, metrics=('accuracy','roc_auc', 'f1')):
    """Cross-validate every model and summarise the results in one table.

    Parameters
    ----------
    X: The feature matrix, passed as is to cross_validate
    y: The targets, passed as is to cross_validate
    models: A dict mapping each model's name to the estimator to evaluate
    cv: (optional) Forwarded to cross_validate as `cv` (5 by default)
    metrics: (optional) Forwarded to cross_validate as `scoring`
    Returns
    -------
    A pandas DataFrame with one column per model and, for every entry reported by
    cross_validate, a `<entry>_mean` and a `<entry>_std` row, sorted by row label.

    """
    per_model = {}
    for label, estimator in models.items():
        fold_scores = pd.DataFrame(
            cross_validate(estimator, X, y, cv=cv, scoring=metrics, n_jobs=-1)
        )
        averages = fold_scores.mean().rename(lambda entry: '{}_mean'.format(entry))
        spreads = fold_scores.std().rename(lambda entry: '{}_std'.format(entry))
        per_model[label] = pd.concat([averages, spreads], axis=0)
    return pd.DataFrame(per_model).sort_index()

### Models training

In [None]:
# Evaluate every baseline classifier on the training split with accuracy and macro-averaged F1.
models = create_baseline_classifiers()
summary = assess_models(
    X=X_train,
    y=y_train,
    models=models,
    metrics=("accuracy", "f1_macro"),
)
summary

Unnamed: 0,Dummy,RandomForest,KNN,NeuralNetwork,SupportVectorMachine,GradientBoosting,MultinomialNB
fit_time_mean,0.01207,6.066271,0.04946,9.410647,218.109081,59.386899,0.332783
fit_time_std,0.000399,0.467689,0.000491,1.205765,26.750227,3.315319,0.03497
score_time_mean,0.011875,0.613065,2.749094,0.017109,6.199454,0.25687,0.180541
score_time_std,0.00088,0.055037,0.228358,0.001353,0.242055,0.0188,0.013614
test_accuracy_mean,0.0,0.032314,0.038468,0.033168,0.012139,0.035391,0.008206
test_accuracy_std,0.0,0.002299,0.003578,0.007314,0.003795,0.003504,0.002057
test_f1_macro_mean,0.0,0.106273,0.10614,0.112676,0.022425,0.131963,0.096278
test_f1_macro_std,0.0,0.01495,0.01401,0.011316,0.003732,0.007767,0.004421


In [None]:
# to_csv writes comma-separated values by default; the tab separator is set explicitly so the
# content matches the .tab extension (the .tab input of this notebook is read with read_table).
summary.to_csv("results/MTP/MtP_201601_blasted-ML_asessment.tab", sep="\t")