In [1]:
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import LinearSVC


In [2]:
frames: list[pd.DataFrame] = []

for i in range(10):
    df = pd.read_csv(f"../../Datasets/Crop/crop{i}.csv")
    frames.append(df)

df = pd.concat(frames)


In [3]:
for column in df.columns[1:]:
    if df[column].dtype != "float64":
        df[column] = df[column].astype("float")

df["label"] = df["label"].astype("category")

df.head()


Unnamed: 0,label,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174
0,1,-13.559,-21.407,-11.404,-15.248,-11.923,-15.291,-2.1548,-7.8474,-10.002,...,0.18519,0.72602,5.3333,6.0,0.29489,9.7778,2.4444,1.677,0.20988,0.65422
1,1,-12.802,-20.335,-10.399,-14.132,-11.096,-14.361,-2.4039,-7.533,-9.9369,...,0.33333,-0.48751,2.1111,0.098765,0.83333,0.33333,0.33333,0.84869,0.50617,-0.18898
2,1,-12.431,-19.902,-10.074,-13.598,-10.829,-14.048,-2.3566,-7.4717,-9.8283,...,0.25926,0.25298,2.2222,0.17284,0.68889,0.88889,0.66667,1.273,0.30864,0.10483
3,1,-12.689,-19.529,-10.028,-13.35,-11.056,-14.014,-2.6611,-6.8396,-9.5006,...,0.16049,0.4375,4.1111,0.32099,0.83333,0.33333,0.33333,1.1491,0.38272,0.41603
4,1,-12.686,-19.278,-9.8185,-13.108,-10.932,-13.939,-2.8675,-6.5919,-9.4594,...,0.18519,0.35,4.0,0.44444,0.68889,0.88889,0.66667,1.5811,0.20988,0.5


In [4]:
X = df.iloc[:, 1:]
y = df.iloc[:, :1]


In [5]:
import warnings
warnings.filterwarnings("ignore")

linear_svc = LinearSVC()
forward_selector = SequentialFeatureSelector(
    linear_svc, scoring="accuracy", tol=0.01, n_jobs=-1, n_features_to_select="auto")
forward_selector.fit(X, y)

selected_features_mask = forward_selector.get_support()
selected_feature_indices = [0] + [i for (i, x) in enumerate(selected_features_mask) if x]

print("Before Forward feature selection:", df.shape)
print("After Forward feature selection:", df.iloc[:, selected_feature_indices].shape)

df.iloc[:, selected_feature_indices].head()


Before Forward feature selection: (325830, 175)
After Forward feature selection: (325830, 4)


Unnamed: 0,label,f4,f57,f107
0,1,-15.248,-10.873,0.50488
1,1,-14.132,-11.172,0.42041
2,1,-13.598,-11.526,0.40217
3,1,-13.35,-11.61,0.32957
4,1,-13.108,-11.805,0.32678


In [6]:
df_new = df.iloc[:, selected_feature_indices]
df_new.to_csv("../../DatasetsFeatureSelection/Crop/crop.csv", index=False)

df_new.head()


Unnamed: 0,label,f4,f57,f107
0,1,-15.248,-10.873,0.50488
1,1,-14.132,-11.172,0.42041
2,1,-13.598,-11.526,0.40217
3,1,-13.35,-11.61,0.32957
4,1,-13.108,-11.805,0.32678


In [21]:
splits = 10
split_size = df.shape[0] // splits
dfs: list[pd.DataFrame] = [df.iloc[i:(i + split_size), :].reset_index(drop=True)
                           for i in range(0, df.shape[0], split_size)]

for i in range(splits):
    dfs[i].to_csv(f"../../datasets/Crop/crop{i}.csv", index=False)
