# Import Libraries

In [1]:
import numpy as np
# pandas library used to load data
import pandas as pd
# libraries from scikit-learn used to perform Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# libraries from scikit-learn used to perform Support Vector Machine
from sklearn.svm import SVC
# libraries from scikit-learn used to perform Cross Validation
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score

  from collections import Sequence


# Load LUAD and LUSC data

In [2]:
# load 2 data from processed dataset
data_LUAD = pd.read_csv('data_LUAD.csv', index_col=['Unnamed: 0'])
data_LUSC = pd.read_csv('data_LUSC.csv', index_col=['Unnamed: 0'])
# join 2 data into one
data = pd.concat([data_LUAD, data_LUSC], join='outer', ignore_index = False, sort=False)
# fill NA values by 0
data = data.fillna(0)
# seperate class labels from the data
target = data['target']
data.drop(columns=['target'], inplace=True)

# Perform Cross-Validation

* **Feature selection method:** Univariate feature selection - select 300 highest scoring features - using chi-square score

In [3]:
skb = SelectKBest(chi2, k=300)
skb

SelectKBest(k=300, score_func=<function chi2 at 0x0000022F251E0378>)

* **Classification method:** Support Vector Machine

In [4]:
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

* **Cross validation method:** 10-fold cross-validation (**_running for 10 times_**)

In [5]:
pipe = Pipeline([
        ("feature_selection", skb), ("classification", svc)
    ])
cv_score = []
for i in range(10):
    score = cross_val_score(pipe, data, target, cv=10)
    cv_score.append(score)
cv_score

[array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.85849057, 0.79245283, 0.77358491,
        0.83018868, 0.73584906, 0.81904762, 0.76190476, 0.83809524]),
 array([0.8317757 , 0.81308411, 0.

* **Average accuracy:**

In [6]:
print("Avarage accuarcy: %f" % (np.mean(cv_score)))

Avarage accuarcy: 0.805447
