In [2]:
from sklearn.inspection import permutation_importance
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# SVM's with Feature Selection

## Lipids Only

### Linear

In [2]:
DATASET = 'Dataset2.0.csv'
dataset = pd.read_csv(DATASET)
dataset['Sex'] = dataset['Sex'].replace({'M': 1, 'F': 0})
lipids_only = dataset.iloc[:, 54:837]
columns_to_drop = ['LHC_ICV', 'RHC_ICV']
X = lipids_only.drop(columns=columns_to_drop, axis=1)
y = lipids_only['LHC_ICV']

svm = SVR(kernel='linear')
svm.fit(X, y)

perm_importance = permutation_importance(svm, X, y)

# Making the sum of feature importance being equal to 1.0,
# so feature importance can be understood as percentage
perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()

# Feature's name (considering your X a DataFrame)
feature_names = X.columns
features = np.array(feature_names)

# Sort to plot in order of importance
sorted_idx = perm_importance_normalized.argsort()

top_features = features[sorted_idx][-5:]
top_scores = perm_importance_normalized[sorted_idx][-5:]

for feature, score in zip(top_features, top_scores):
    print(f"{feature}: {score:.4f}")

PC.P.17.0.20.4...A.: nan
PC.P.17.0.20.4...B.: nan
PC.P.18.0.18.2.: nan
PC.O.34.1.: nan
PC.36.4....OH.: nan


  perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()


### Polynomial

In [3]:
DATASET = 'Dataset2.0.csv'
dataset = pd.read_csv(DATASET)
dataset['Sex'] = dataset['Sex'].replace({'M': 1, 'F': 0})
lipids_only = dataset.iloc[:, 54:837]
columns_to_drop = ['LHC_ICV', 'RHC_ICV']
X = lipids_only.drop(columns=columns_to_drop, axis=1)
y = lipids_only['LHC_ICV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVR(kernel='poly')
svm.fit(X_train, y_train)

perm_importance = permutation_importance(svm, X_test, y_test)

# Making the sum of feature importance being equal to 1.0,
# so feature importance can be understood as percentage
perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()

# Feature's name (considering your X a DataFrame)
feature_names = X.columns
features = np.array(feature_names)

# Sort to plot in order of importance
sorted_idx = perm_importance_normalized.argsort()

top_features = features[sorted_idx][-5:]
top_scores = perm_importance_normalized[sorted_idx][-5:]

for feature, score in zip(top_features, top_scores):
    print(f"{feature}: {score:.4f}")

Features with Importance above 0.20:
Original Mean Squared Error: 0.00


  perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()


### RBF (Gaussian)

In [None]:
DATASET = 'Dataset2.0.csv'
dataset = pd.read_csv(DATASET)
dataset['Sex'] = dataset['Sex'].replace({'M': 1, 'F': 0})
lipids_only = dataset.iloc[:, 54:837]
columns_to_drop = ['LHC_ICV', 'RHC_ICV']
X = lipids_only.drop(columns=columns_to_drop, axis=1)
y = lipids_only['LHC_ICV']

svm = SVR(kernel='rbf', gamma='auto')
svm.fit(X, y)

perm_importance = permutation_importance(svm, X, y)

# Making the sum of feature importance being equal to 1.0,
# so feature importance can be understood as percentage
perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()

# Feature's name (considering your X a DataFrame)
feature_names = X.columns
features = np.array(feature_names)

# Sort to plot in order of importance
sorted_idx = perm_importance_normalized.argsort()

top_features = features[sorted_idx][-5:]
top_scores = perm_importance_normalized[sorted_idx][-5:]

for feature, score in zip(top_features, top_scores):
    print(f"{feature}: {score:.4f}")

### Sigmoid

In [None]:
DATASET = 'Dataset2.0.csv'
dataset = pd.read_csv(DATASET)
dataset['Sex'] = dataset['Sex'].replace({'M': 1, 'F': 0})
lipids_only = dataset.iloc[:, 54:837]
columns_to_drop = ['LHC_ICV', 'RHC_ICV']
X = lipids_only.drop(columns=columns_to_drop, axis=1)
y = lipids_only['LHC_ICV']

svm = SVR(kernel='sigmoid', gamma='auto')
svm.fit(X, y)

perm_importance = permutation_importance(svm, X, y)

# Making the sum of feature importance being equal to 1.0,
# so feature importance can be understood as percentage
perm_importance_normalized = perm_importance.importances_mean/perm_importance.importances_mean.sum()

# Feature's name (considering your X a DataFrame)
feature_names = X.columns
features = np.array(feature_names)

# Sort to plot in order of importance
sorted_idx = perm_importance_normalized.argsort()

top_features = features[sorted_idx][-5:]
top_scores = perm_importance_normalized[sorted_idx][-5:]

for feature, score in zip(top_features, top_scores):
    print(f"{feature}: {score:.4f}")