In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, normalized_mutual_info_score, adjusted_rand_score
from sklearn.ensemble import GradientBoostingClassifier 

In [8]:
cancer = 'Pan-cancer'

In [4]:
omics_file_1 = pd.read_csv(f'{cancer}_mRNA_top.csv', index_col=0)  # mRNA data
omics_file_2 = pd.read_csv(f'{cancer}_miRNA_top.csv', index_col=0)  # miRNA data
omics_file_3 = pd.read_csv(f'{cancer}_CNV_top.csv', index_col=0)  # CNV data
omics_file_4 = pd.read_csv(f'{cancer}_Methy_top.csv', index_col=0)  # Methylation data
labels = pd.read_csv(f'{cancer}_label_num.csv', header=0)
labels = labels.squeeze()  # Convert to 1D array

omics_data_1 = omics_file_1.T  # mRNA
omics_data_2 = omics_file_2.T  # miRNA
omics_data_3 = omics_file_3.T  # CNV
omics_data_4 = omics_file_4.T  # Methylation

omics_data_1 = omics_data_1.add_suffix('_mRNA')
omics_data_2 = omics_data_2.add_suffix('_miRNA')
omics_data_3 = omics_data_3.add_suffix('_CNV')
omics_data_4 = omics_data_4.add_suffix('_Methy')

omics_data_combined = omics_data_1.join(omics_data_2, how='inner')
omics_data_combined = omics_data_combined.join(omics_data_3, how='inner')
omics_data_combined = omics_data_combined.join(omics_data_4, how='inner')
omics_data_combined = omics_data_combined.fillna(omics_data_combined.mean())

X_train, X_test, y_train, y_test = train_test_split(omics_data_combined, labels, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
xgb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
svm_model = SVC(kernel='linear', random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Function to compute Precision, NMI, and ARI
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Precision
    prec = precision_score(y_test, y_pred, average='weighted')
    
    # NMI (Normalized Mutual Information)
    nmi = normalized_mutual_info_score(y_test, y_pred)
    
    # ARI (Adjusted Rand Index)
    ari = adjusted_rand_score(y_test, y_pred)
    
    return prec, nmi, ari

# Evaluate each model and report results
models = {
    "XGBoost": xgb_model,
    "SVM": svm_model,
    "Random Forest": rf_model,
    "Logistic Regression": lr_model
}

results = {}

for model_name, model in models.items():
    prec, nmi, ari = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[model_name] = (prec, nmi, ari)

# Print results
print(f"{'Method':<20} {'PREC':<10} {'NMI':<10} {'ARI':<10}")
for model_name, (prec, nmi, ari) in results.items():
    print(f"{model_name:<20} {prec:<10.3f} {nmi:<10.3f} {ari:<10.3f}")

Method               PREC       NMI        ARI       
XGBoost              0.848      0.629      0.618     
SVM                  0.792      0.511      0.493     
Random Forest        0.824      0.585      0.531     
Logistic Regression  0.825      0.590      0.543     
