In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score 

os.chdir('/Users/erosmendoza/Downloads/qbio_490_erosmendoza/analysis_data')

import cptac
cptac.download(dataset='Ccrcc')
ccrcc = cptac.Ccrcc()
ccrcc.list_data()

Checking that ccrcc index is up-to-date...



Below are the dataframes contained in this dataset and their dimensions:

clinical
	194 rows
	171 columns
CNV
	110 rows
	19285 columns
followup
	352 rows
	27 columns
medical_history
	370 rows
	4 columns
methylation
	107 rows
	15885 columns
phosphoproteomics
	194 rows
	81550 columns
phosphoproteomics_gene
	194 rows
	6127 columns
proteomics
	194 rows
	11710 columns
somatic_mutation
	8350 rows
	3 columns
transcriptomics
	185 rows
	19275 columns


In [2]:
clinical_data = ccrcc.get_clinical()
protein_data = ccrcc.get_proteomics()
rna_data = ccrcc.get_transcriptomics()

In [3]:
stage_i_patients = clinical_data[clinical_data['tumor_stage_pathological'] == 'Stage I'].index
stage_iii_patients = clinical_data[clinical_data['tumor_stage_pathological'] == 'Stage III'].index

mean_expression_stage_i_protein = protein_data.loc[stage_i_patients].mean()
mean_expression_stage_iii_protein = protein_data.loc[stage_iii_patients].mean()

In [4]:
differential_expression_protein = mean_expression_stage_iii_protein - mean_expression_stage_i_protein
top_5_proteins = differential_expression_protein.abs().nlargest(5).index

rna_data_log = np.log2(rna_data + 1) 
mean_expression_stage_i_rna = rna_data_log.loc[stage_i_patients].mean()
mean_expression_stage_iii_rna = rna_data_log.loc[stage_iii_patients].mean()

In [5]:
differential_expression_rna = mean_expression_stage_iii_rna - mean_expression_stage_i_rna
top_5_rna = differential_expression_rna.abs().nlargest(5).index

selected_features = pd.concat([protein_data[top_5_proteins], rna_data_log[top_5_rna]], axis=1)
selected_features.columns = selected_features.columns.astype(str)
cancer_stages = clinical_data['tumor_stage_pathological']

In [6]:
scaler = StandardScaler()
selected_features = selected_features.iloc[:, 5:]
patient_mask = selected_features.iloc[:, :].isna().sum(axis=1)==0
selected_features = selected_features.loc[patient_mask, :]
cancer_stages = cancer_stages.loc[patient_mask]

cancer_mask = cancer_stages.isna()
cancer_stages = cancer_stages.dropna()
selected_features = selected_features.loc[~cancer_mask, :]

selected_features_scaled = scaler.fit_transform(selected_features)
encoder = LabelEncoder()
cancer_stages_encoded = encoder.fit_transform(cancer_stages)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(selected_features_scaled, cancer_stages_encoded, train_size=0.7)

X_train = pd.DataFrame(X_train, columns=selected_features.columns).dropna()
y_train = y_train[X_train.index] 

In [8]:
import warnings
warnings.filterwarnings('ignore')

models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "MLPClassifier": MLPClassifier(),
    "GaussianNB": GaussianNB()
}

num_runs = 10
accuracies = {model_name: [] for model_name in models.keys()}

for _ in range(num_runs):
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies[model_name].append(accuracy)

mean_accuracies = {model_name: np.mean(accuracy_list) for model_name, accuracy_list in accuracies.items()}
best_model = max(mean_accuracies, key=mean_accuracies.get)

print("Mean accuracies:", mean_accuracies)
print("Best model:", best_model)

Mean accuracies: {'KNeighborsClassifier': 0.4545454545454545, 'DecisionTreeClassifier': 0.4424242424242425, 'MLPClassifier': 0.4606060606060606, 'GaussianNB': 0.4545454545454545}
Best model: MLPClassifier
