In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import auc, accuracy_score, roc_curve, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

In [2]:
# Lineages for which a per-lineage dataframe is built below.
# Note: some of the source dataframes have no column literally named "depmapid"
# even though they contain DepMap IDs, so they need some cleaning first.
lineages = [
    "kidney",
    "breast",
    "blood",
    "liver",
    "urinary_tract",
    "colorectal",
    "ovary",
    "pancreas",
]
# Maps lineage name -> dataframe; filled in by later cells.
lineage_df_dict = dict()

In [3]:
# Feature tables that get read in and folded into the lineage dataframes.
# Each name is matched as a substring of a CSV file name when loading.
features = [
    "effect",
    "dependency",
    "expression",
    "gene_cn",
]
# Maps feature name -> dataframe; filled in by the loading cell.
feature_df_dict = dict()

In [4]:
# Sample metadata table. Its 'DepMap_ID' and 'lineage' columns are joined onto
# each feature table when the feature CSVs are loaded.
sample_info_df = pd.read_csv('data/sample_info.csv')

In [5]:
folder_path = "data/"

# Build one lineage-averaged table per feature: for every CSV in the data folder
# whose file name contains a feature name, load it, attach each row's lineage,
# and average all rows that share a lineage.
for file_name in os.listdir(folder_path):
    for feature in features:
        if not (file_name.endswith('.csv') and feature in file_name):
            continue

        file_path = os.path.join(folder_path, file_name)
        table = pd.read_csv(file_path)

        # Keep only the text before the first space in every header
        # (presumably headers look like "SYMBOL (id)" -- TODO confirm).
        table.columns = [header.split(' ')[0] for header in table.columns]

        # The first column holds the sample identifier; give it the name used
        # by sample_info_df so the two tables can be joined.
        id_column = table.columns[0]

        feature_df_dict[feature] = (
            table
            .rename(columns={id_column: "DepMap_ID"})
            .merge(sample_info_df[['DepMap_ID', 'lineage']], on='DepMap_ID', how='left')
            .drop(columns="DepMap_ID")
            # One row per lineage, each remaining column averaged within the lineage.
            .pivot_table(index='lineage', aggfunc='mean')
        )

In [6]:
# Assemble, for every lineage, a gene-level frame with one column per feature.
# The first feature seen for a lineage creates the frame (with 'gene' as a
# regular column); each later feature is merged in on 'gene'.
for feature in features:
    feature_table = feature_df_dict[feature]
    for lineage in lineages:
        # Select this lineage's row and flip it so genes run down the index.
        temp_df = feature_table.loc[feature_table.index == lineage].transpose()
        temp_df.index.name = 'gene'

        if lineage not in lineage_df_dict:
            seeded = pd.DataFrame({feature: temp_df[lineage]}, index=temp_df.index)
            seeded.index.name = 'gene'
            lineage_df_dict[lineage] = seeded.reset_index()
            continue

        # Name the value column after the feature, then join it on 'gene'.
        # The outer merge followed by dropna() keeps only genes that have a
        # non-missing value for every feature merged so far.
        temp_df = temp_df.rename(columns={lineage: feature})
        lineage_df_dict[lineage] = pd.merge(
            lineage_df_dict[lineage], temp_df, on="gene", how="outer"
        ).dropna()

In [7]:
training_dfs = {}
# Training-set name (as used in the CSV file names) -> lineage name.
training_lineages = {
    "kidney": "kidney",
    "breast": "breast",
    "leukemia": "blood",
    "liver": "liver",
    "bladder": "urinary_tract",
    "colon": "colorectal",
    "ovarian": "ovary",
    "pancreatic": "pancreas",
}

# For each training set: attach the lineage's feature columns to the labelled
# genes, then remove those labelled rows from the lineage frame so that it only
# holds the genes left to predict on.
for k, v in training_lineages.items():
    lineage_features = lineage_df_dict[v]

    df = pd.read_csv(f"training_sets_genes/{k}_training_genes_set.csv")
    # Labelled genes without feature values get 0 for every missing entry.
    df = df.merge(lineage_features, on="gene", how='left').fillna(0)

    # Anti-join: match lineage rows against the training rows on every lineage
    # column and keep only the rows that found no match ("left_only").
    flagged = pd.merge(
        lineage_features,
        df,
        indicator=True,
        how='left',
        on=list(lineage_features.columns),
    )
    unlabelled = flagged.query('_merge == "left_only"').drop(columns=['_merge', 'label'])

    lineage_df_dict[v] = unlabelled.set_index('gene')
    df = df.set_index('gene')
    training_dfs[k] = df

In [8]:
# Inverse lookup of training_lineages: lineage name -> training-set name.
rev_training_lineages = dict(zip(training_lineages.values(), training_lineages.keys()))

## Support Vector Machine (SVM)

In [9]:
# Training-set name -> (accuracy as str, hold-out predictions,
#                       hold-out probabilities for classes_[1], fitted best estimator).
svm_by_lineage = {}

# Single seed for every stochastic step in this cell so a Restart & Run All
# reproduces the same folds, the same hold-out split and the same models.
RANDOM_STATE = 42

param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
              {'kernel': ['linear'], 'C': [1, 10, 100]}]

# probability=True makes SVC fit an internal cross-validated calibration whose
# shuffling is controlled by random_state; seed it so predict_proba is stable.
svm = SVC(probability=True, random_state=RANDOM_STATE)
# shuffle=True without a random_state yields different folds on every run,
# which can change the selected hyper-parameters between runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Run on train data: one grid search per training set.
for k, v in training_dfs.items():
    X = v.drop(columns=['label'])
    y = v['label']

    # Hold out 20% of the labelled genes for the accuracy estimate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    grid_search_cv = GridSearchCV(estimator=svm, param_grid=param_grid, cv=cv)

    model = grid_search_cv.fit(X_train, y_train)

    y_pred = model.best_estimator_.predict(X_test)
    # Probability of the second class in best_estimator_.classes_.
    y_pred_rt = model.best_estimator_.predict_proba(X_test)[:, 1]

    # Stored as a string, as downstream cells may rely on that type.
    accuracy = str(accuracy_score(y_test, y_pred))

    svm_by_lineage[k] = (accuracy, y_pred, y_pred_rt, model.best_estimator_)

In [10]:
# Predict on unlabelled data: apply each lineage's best SVM to the genes that
# were not part of its training set.
svm_pred_labels = {}   # keyed by training-set name
svm_predictions = {}   # keyed by lineage name
for k, v in lineage_df_dict.items():
    rev_k = rev_training_lineages[k]
    # The fitted estimator is the last element of the stored tuple.
    *_, best_svm = svm_by_lineage[rev_k]
    predicted = best_svm.predict(v)
    svm_pred_labels[rev_k] = predicted
    svm_predictions[k] = pd.DataFrame({'gene': v.index, 'prediction': predicted})

In [11]:
# Report, per lineage, how many unlabelled genes were predicted "positive".
# The counter is computed fresh for every lineage; initialising it once before
# the loop would print a running total across lineages instead.
# NOTE(review): the recorded output is 0 for every lineage -- confirm that the
# 'label' column of the training CSVs really uses the string "positive".
for key in rev_training_lineages:
    pos_count = int((svm_predictions[key]["prediction"] == "positive").sum())

    print(f"{key} : {pos_count}")

kidney : 0
breast : 0
blood : 0
liver : 0
urinary_tract : 0
colorectal : 0
ovary : 0
pancreas : 0
