In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA

In [2]:
# Load the gene-expression table from the CSV in the working directory.
data_gene_ex = pd.read_csv(filepath_or_buffer="data-TCGA.csv")

In [3]:
# Expose the loaded expression table under the name used by later cells.
df_gene = pd.DataFrame(data=data_gene_ex)

In [4]:
# Load the per-sample class labels from the CSV in the working directory.
data_label = pd.read_csv(filepath_or_buffer="labels-TCGA.csv")

In [5]:
# Expose the loaded label table under the name used by later cells.
df_label = pd.DataFrame(data=data_label)

In [6]:
# Remove the unnamed leading column (presumably the row index saved with the
# CSV). Rebinding instead of inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the column had already been dropped.
df_label = df_label.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Encode the cancer-type labels as integers.
# The original called .replace(..., inplace=True) on df_label['Class'], i.e. on
# a temporary Series: that chained in-place pattern is deprecated in pandas and
# leaves df_label unchanged once copy-on-write is active. Assigning the mapped
# column back is explicit and works on every pandas version.
CLASS_CODES = {'BRCA':0, 'KIRC':1, "LUAD":2, "PRAD":3, "COAD":4}
# .get(label, label) leaves any value that is not a key untouched (same as
# replace), so re-running the cell on already-encoded labels is a no-op.
df_label['Class'] = df_label['Class'].map(lambda label: CLASS_CODES.get(label, label))

In [8]:
# Expression values without the unnamed leading column. drop() returns a new
# frame, so df_gene itself is left as loaded.
df_gene1 = df_gene.drop(columns=['Unnamed: 0'])

In [9]:
# Remove duplicated gene columns: transpose so genes become rows, drop the
# duplicated rows, then transpose back to samples x genes.
df_gene2 = (
    df_gene1
    .transpose()
    .drop_duplicates()
    .transpose()
)

In [10]:
# Feature matrix: the de-duplicated expression frame.
x = df_gene2
# Target: label values arranged as a single column.
y = np.reshape(df_label.values, (-1, 1))

In [12]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Feature scaling: learn mean/variance on the training split only, then apply
# the same fitted transformation to every matrix.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Full data set, scaled with the training-split statistics.
x_scaled = scaler.transform(x)

# Removing Low Variance Features

In [14]:
# Per-gene variance measured on the training split, ordered from most to
# least variable.
per_gene_variance = x_train.var(axis=0)
gene_variances_sorted = per_gene_variance.sort_values(ascending=False)

In [15]:
# Restrict every matrix to the N most variable genes (ranking comes from the
# training split).
N = 2000
top_genes = gene_variances_sorted.index[:N]

x_train_top_var = x_train[top_genes]
x_test_top_var = x_test[top_genes]
x_new = x.copy()[top_genes]

# Report the reduced shapes: training split first, then test split.
for split_frame in (x_train_top_var, x_test_top_var):
    print(f"Shape after keeping top {N} genes:", split_frame.shape)

Shape after keeping top 2000 genes: (640, 2000)
Shape after keeping top 2000 genes: (161, 2000)


# ANOVA

In [16]:
# Feature selection: keep the 1000 genes with the highest ANOVA F-score,
# fitted on the training split only and then applied to the other matrices.
selector = SelectKBest(f_classif, k=1000)
# y_train is a column vector; ravel() hands scikit-learn the 1-D target it
# expects and avoids the DataConversionWarning raised for column-vector y.
x_train_sel = selector.fit_transform(x_train_top_var, y_train.ravel())
x_test_sel = selector.transform(x_test_top_var)
x_sel = selector.transform(x_new)

  y = column_or_1d(y, warn=True)


In [19]:
# Get the selected feature names
selected_feature_names = x_train_top_var.columns[selector.get_support()]

# Create DataFrames with selected features and their actual values.
# Every frame keeps the row index of the split it came from; the training
# frame previously fell back to a fresh 0..n-1 index, unlike the other two.
x_train_selected_df = pd.DataFrame(x_train_sel, columns=selected_feature_names, index=x_train.index)
x_test_selected_df = pd.DataFrame(x_test_sel, columns=selected_feature_names, index=x_test.index)
x_selected_df = pd.DataFrame(x_sel, columns=selected_feature_names, index=x.index)


# PCA

In [20]:
# Project the selected genes onto 100 principal components. The projection is
# learned from the training split and reused unchanged for the test split.
# random_state is pinned because, for a matrix of this size, PCA's automatic
# solver choice can fall on the randomized SVD, which would otherwise give
# slightly different components on every run.
pca = PCA(n_components=100, random_state=42)
x_train_final = pca.fit_transform(x_train_selected_df)
x_test_final = pca.transform(x_test_selected_df)

In [21]:
# svm model with different kernels and degree = 3 and gamma= 0.7


def svm(x_tr, x_te):
    """Train one SVC per kernel and print its test accuracy and fit time.

    For each kernel in ('linear', 'rbf', 'poly') an ``SVC`` with ``degree=3``
    and ``gamma=0.7`` is fitted on ``x_tr`` against the notebook-level
    ``y_train`` and scored on ``x_te`` against the notebook-level ``y_test``.

    Parameters
    ----------
    x_tr : feature matrix passed to ``SVC.fit``.
    x_te : feature matrix passed to ``SVC.predict``.

    Returns
    -------
    None. One summary line per kernel is printed.
    """
    for k in ('linear', 'rbf', 'poly'):
        svm_model = SVC(kernel=k, degree=3, gamma=0.7)

        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump with wall-clock
        # adjustments and has coarse resolution on some platforms.
        start = time.perf_counter()
        svm_model.fit(x_tr, y_train.ravel())
        train_time = time.perf_counter() - start

        y_svm_pred = svm_model.predict(x_te)
        acc = accuracy_score(y_test, y_svm_pred)

        print(f"{k}, accuracy = {acc:.4f}, training_time = {train_time:.4f} sec")

In [22]:
# Compare the three kernels on the PCA-reduced train/test matrices.
svm(x_tr=x_train_final, x_te=x_test_final)

linear, accuracy = 1.0000, training_time = 0.2494 sec
rbf, accuracy = 0.3789, training_time = 0.1099 sec
poly, accuracy = 1.0000, training_time = 0.0100 sec


In [23]:
# Cubic polynomial-kernel SVM with gamma fixed at 0.7, trained on the
# PCA-reduced training data (fit() returns the fitted estimator itself).
svm_model1 = SVC(kernel='poly', degree=3, gamma=0.7).fit(x_train_final, y_train.ravel())

# Predictions for the held-out samples.
y_svm_pred1 = svm_model1.predict(x_test_final)

In [24]:
# Report of svm with kernel = poly and degree=3 and gamma=0.7
poly_fixed_report = classification_report(y_test, svm_model1.predict(x_test_final))
print(poly_fixed_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        17

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [25]:
# Report of svm with kernel = linear and degree=3 and gamma=0.7
# (degree and gamma are kept only to mirror the kernel-comparison loop; the
# linear kernel does not use them.)

# The cell is labelled "linear" but previously built kernel='poly', which just
# repeated the polynomial model from the cell above.
svm_model2 = SVC(kernel='linear', degree=3, gamma=0.7)

svm_model2.fit(x_train_final, y_train.ravel())

y_svm_pred2 = svm_model2.predict(x_test_final)


print(classification_report(y_test, svm_model2.predict(x_test_final)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        17

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [26]:
# Report of svm with kernel = rbf and gamma=0.7
# (degree=3 is passed as well but only matters for the polynomial kernel)

svm_model3 = SVC(kernel='rbf', degree=3, gamma=0.7).fit(x_train_final, y_train.ravel())

# Predict once and reuse the result for the printed report.
y_svm_pred3 = svm_model3.predict(x_test_final)

rbf_fixed_report = classification_report(y_test, y_svm_pred3)
print(rbf_fixed_report)

              precision    recall  f1-score   support

           0       0.38      1.00      0.55        61
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        29
           3       0.00      0.00      0.00        29
           4       0.00      0.00      0.00        17

    accuracy                           0.38       161
   macro avg       0.08      0.20      0.11       161
weighted avg       0.14      0.38      0.21       161



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
def svm(x_tr, x_te):
    """Train one default-parameter SVC per kernel and print accuracy and fit time.

    For each kernel in ('linear', 'rbf', 'poly') an ``SVC`` is built with only
    ``kernel`` set (all other hyper-parameters at their defaults), fitted on
    ``x_tr`` against the notebook-level ``y_train`` and scored on ``x_te``
    against the notebook-level ``y_test``.

    Note: this rebinds the name ``svm``, replacing the earlier helper of the
    same name that fixed ``degree=3`` and ``gamma=0.7``.

    Parameters
    ----------
    x_tr : feature matrix passed to ``SVC.fit``.
    x_te : feature matrix passed to ``SVC.predict``.

    Returns
    -------
    None. One summary line per kernel is printed.
    """
    for k in ('linear', 'rbf', 'poly'):
        svm_model = SVC(kernel=k)

        # perf_counter() is the monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump with wall-clock
        # adjustments and has coarse resolution on some platforms.
        start = time.perf_counter()
        svm_model.fit(x_tr, y_train.ravel())
        train_time = time.perf_counter() - start

        y_svm_pred = svm_model.predict(x_te)
        acc = accuracy_score(y_test, y_svm_pred)

        print(f"{k}, accuracy = {acc:.4f}, training_time = {train_time:.4f} sec")

        

# Re-run the kernel comparison, this time with default SVC hyper-parameters.
svm(x_tr=x_train_final, x_te=x_test_final)

linear, accuracy = 1.0000, training_time = 0.0148 sec
rbf, accuracy = 1.0000, training_time = 0.0152 sec
poly, accuracy = 0.9938, training_time = 0.0130 sec


In [28]:
# Per-class report for the linear-kernel SVM with default hyper-parameters

svm_model4 = SVC(kernel='linear').fit(x_train_final, y_train.ravel())

# Predict once and reuse the result for the printed report.
y_svm_pred4 = svm_model4.predict(x_test_final)

linear_report = classification_report(y_test, y_svm_pred4)
print(linear_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        17

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [29]:
# Per-class report for the RBF-kernel SVM with default hyper-parameters

svm_model5 = SVC(kernel='rbf').fit(x_train_final, y_train.ravel())

# Predict once and reuse the result for the printed report.
y_svm_pred5 = svm_model5.predict(x_test_final)

rbf_report = classification_report(y_test, y_svm_pred5)
print(rbf_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        17

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [30]:
# Per-class report for the polynomial-kernel SVM with default hyper-parameters

svm_model6 = SVC(kernel='poly').fit(x_train_final, y_train.ravel())

# Predict once and reuse the result for the printed report.
y_svm_pred6 = svm_model6.predict(x_test_final)

poly_report = classification_report(y_test, y_svm_pred6)
print(poly_report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        61
           1       1.00      0.96      0.98        25
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00        17

    accuracy                           0.99       161
   macro avg       1.00      0.99      0.99       161
weighted avg       0.99      0.99      0.99       161



We can see that, compared with LDA, PCA gives far better performance
and higher accuracy in most of the models when classifying the different cancer types.