Data: https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq

In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [6]:
dfdata = pd.read_csv('your/pathway')
dflabels = pd.read_csv('your/pathway')
dfdata = dfdata.rename(columns={'Unnamed: 0': 'sample_id'})
dflabels = dflabels.rename(columns={'Unnamed: 0': 'sample_id'})

# 'sample_id' sütununa göre df ve dflabels verileri birleştirildi
merged_df = pd.merge(dfdata, dflabels, on='sample_id')

In [7]:
dfdata.shape

(801, 20532)

In [8]:
#bağımsız değişkenler -> X, Bağımlı değişken -> Y
X = merged_df.iloc[:,1:-1] #sample_id hariç
y = merged_df.iloc[:,-1]

In [9]:
#class encoding
label_encoder=LabelEncoder()
label_encoder.fit(y)
y_encoded=label_encoder.transform(y)
labels=label_encoder.classes_
classes=np.unique(y_encoded)

In [10]:
print(labels, classes)

['BRCA' 'COAD' 'KIRC' 'LUAD' 'PRAD'] [0 1 2 3 4]


In [11]:
# verinin %80'i eğitim %20'si test için ayırıldı
X_train,X_test,y_train,y_test=train_test_split(X,y_encoded,test_size=0.2,random_state=42,stratify=y_encoded)

In [12]:
# Featurelar normalize edildi
min_max_scaler=MinMaxScaler()
X_train_normalized=min_max_scaler.fit_transform(X_train)
X_test_normalized=min_max_scaler.fit_transform(X_test)

In [13]:
# Random Forest Sınıflandırma
RF=OneVsRestClassifier(RandomForestClassifier(max_features=0.2))
RF.fit(X_train_normalized,y_train)
y_pred_RF =RF.predict(X_test_normalized)

In [14]:
#k-fold cross validation
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)


In [15]:
# 10-fold cross-validation'ın Random Forest (eğitim verisi) için uygulanması
RF_scores = cross_val_score(RF, X_train_normalized, y_train, cv=kf, scoring='balanced_accuracy')
print(f'Random Forest Cross-Validation Scores for train data: {RF_scores}')
print(f'Random Forest Mean Cross-Validation Score for train data: {RF_scores.mean()}')

Random Forest Cross-Validation Scores for train data: [1.         1.         1.         1.         0.98823529 1.
 1.         1.         1.         1.        ]
Random Forest Mean Cross-Validation Score for train data: 0.9988235294117647


In [16]:
# 10-fold cross-validation'ın Random Forest (test verisi) için uygulanması
RF_scores_test= cross_val_score(RF, X_test_normalized, y_test, cv=kf, scoring='balanced_accuracy')
print(f'Random Forest Cross-Validation Scores for test data: {RF_scores_test}')
print(f'Random Forest Mean Cross-Validation Score for test data: {RF_scores_test.mean()}')

Random Forest Cross-Validation Scores for test data: [1.   1.   1.   1.   0.96 1.   1.   1.   1.   1.  ]
Random Forest Mean Cross-Validation Score for test data: 0.9960000000000001


In [17]:
#Random Forest için accuracy değeri
accuracy_of_RF=balanced_accuracy_score(y_test,y_pred_RF)
print(f'Random Forest Accuracy : {accuracy_of_RF}')

#Random Forest için confusion matrix
cm_RF=confusion_matrix(y_test,y_pred_RF)
cm_RF_df=pd.DataFrame(cm_RF,index=labels,columns=labels)
cm_RF_df

Random Forest Accuracy : 0.9857142857142858


Unnamed: 0,BRCA,COAD,KIRC,LUAD,PRAD
BRCA,60,0,0,0,0
COAD,0,16,0,0,0
KIRC,0,0,30,0,0
LUAD,2,0,0,26,0
PRAD,0,0,0,0,27


In [18]:
# SVM Sınıflandırma
SVM = OneVsRestClassifier(SVC())
SVM.fit(X_train_normalized, y_train)
y_pred_svm = SVM.predict(X_test_normalized)

In [19]:
# 10-fold cross-validation'ın Random Forest (eğitim verisi) için uygulanması
SVM_scores = cross_val_score(SVM, X_train_normalized, y_train, cv=kf, scoring='balanced_accuracy')
print(f'SVM Cross-Validation Scores for train data: {SVM_scores}')
print(f'SVM Mean Cross-Validation Score for train data: {SVM_scores.mean()}')

SVM Cross-Validation Scores for train data: [1.         1.         1.         1.         0.98823529 1.
 1.         1.         1.         1.        ]
SVM Mean Cross-Validation Score for train data: 0.9988235294117647


In [20]:
# 10-fold cross-validation'ın Random Forest (test verisi) için uygulanması
SVM_scores_test = cross_val_score(SVM, X_test_normalized, y_test, cv=kf, scoring='balanced_accuracy')
print(f'SVM Cross-Validation Scores for test data: {SVM_scores_test}')
print(f'SVM Mean Cross-Validation Score for test data: {SVM_scores_test.mean()}')

SVM Cross-Validation Scores for test data: [1.         1.         1.         1.         1.         1.
 0.93333333 1.         1.         1.        ]
SVM Mean Cross-Validation Score for test data: 0.9933333333333334


In [21]:
#SVM için accuracy değeri
accuracy_svm = balanced_accuracy_score(y_test, y_pred_svm)
print(f'SVM accuracy : {accuracy_svm}')

#SVM için Confussion matrix
cm_svm = confusion_matrix(y_test, y_pred_svm)
cm_df_svm = pd.DataFrame(cm_svm, index=labels, columns=labels)
print(cm_df_svm)

SVM accuracy : 0.9928571428571429
      BRCA  COAD  KIRC  LUAD  PRAD
BRCA    60     0     0     0     0
COAD     0    16     0     0     0
KIRC     0     0    30     0     0
LUAD     1     0     0    27     0
PRAD     0     0     0     0    27
