In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import os
import kagglehub

This notebook uses the Iris dataset:
https://www.kaggle.com/datasets/himanshunakrani/iris-dataset

In [None]:
# Download the dataset through kagglehub; `path` is used below as the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "himanshunakrani/iris-dataset"
)

# Show what was downloaded so the CSV file name can be checked.
downloaded_files = os.listdir(path)
print("Files in the directory:", downloaded_files)

In [None]:
# Load the CSV found in the download directory into a DataFrame.
csv_path = os.path.join(path, "iris.csv")
data = pd.read_csv(csv_path)

In [None]:
# Labels of the columns whose dtype is `object`.
# NOTE(review): `categorical_cols` is not referenced again in the visible cells.
object_columns_frame = data.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns

In [None]:
# Target vector: the `species` column. Feature matrix: every other column.
y = data['species']
X = data.drop(columns='species')

Number of features in the dataset: 4

In [None]:
# Number of feature columns (second entry of the frame's shape).
X.shape[1]

In [None]:
# Linear-kernel SVC; `generate()` passes it to RFE as the base estimator.
svm_model = SVC(kernel='linear')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

For each candidate number of features, I keep it as the new best only when it gives a strictly better accuracy than the best one found so far.

We can see that the only useful feature is the first one: petal_length.

In [None]:
def generate():
    """Sweep the number of RFE-selected features and report the best SVM accuracy.

    Reads the notebook-level globals ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and ``svm_model``. For every feature count from 1 up to the
    number of training columns, it runs RFE with ``svm_model`` as the base
    estimator and scores the resulting model on the test split. The best
    count is the first one that reaches the highest accuracy, so ties go to
    the smaller count.

    Prints the best feature count, the names of the features selected for
    that count, and the best accuracy, then plots accuracy against the
    number of selected features.

    Returns:
        None. Results are only printed and plotted.

    Note:
        The feature count is chosen by looking at test-set accuracy, so the
        reported best accuracy is an optimistic estimate.
    """
    # Use the training matrix itself for the column count so the sweep always
    # matches the data that is actually fitted.
    feature_counts = list(range(1, X_train.shape[1] + 1))

    best_accuracy = 0
    best_n_features = 0
    best_features = []
    accuracies = []

    for n_selected in feature_counts:
        rfe = RFE(estimator=svm_model, n_features_to_select=n_selected)
        rfe.fit(X_train, y_train)

        # RFE fits its own clone of the estimator on the selected columns, so
        # predicting through `rfe` uses the same model as refitting `svm_model`
        # on the transformed data, without a second fit and without mutating
        # the shared global estimator.
        y_pred = rfe.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        # Strict '>' keeps the smallest feature count among equal accuracies.
        if accuracy > best_accuracy:
            best_n_features = n_selected
            best_accuracy = accuracy
            # Remember which features produced the best score so the
            # conclusions drawn from this sweep can be checked in the output.
            best_features = [str(name) for name in rfe.get_feature_names_out()]

    print(f"Numero ottimale di feature: {best_n_features}")
    print(f"Feature selezionate: {', '.join(best_features)}")
    print(f"Accuratezza migliore: {best_accuracy * 100:.2f}%")

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(feature_counts, accuracies, marker='o', color='b')
    ax.set_xlabel("Numero di feature selezionate")
    ax.set_ylabel("Accuratezza")
    ax.set_title("Accuratezza del modello SVM in funzione del numero di feature selezionate con RFE")
    # Feature counts are integers; avoid fractional tick labels.
    ax.set_xticks(feature_counts)
    ax.grid()
    plt.show()

# Run the RFE sweep on the train/test split currently held in the notebook globals.
generate()

We can see that if we remove petal_length, the number of features needed is 2.

In [None]:
# Rebuild the reduced feature matrix from `data` rather than dropping the column
# from `X` in place: dropping from `X` makes this cell fail with a KeyError when
# it is run a second time, because 'petal_length' is already gone.
X = data.drop(columns=['species', 'petal_length'])

# Same split settings as before so the two experiments are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fresh, unfitted linear SVC for the second sweep.
svm_model = SVC(kernel='linear')

In [None]:
# Re-run the RFE sweep, now on the split built without the petal_length column.
generate()