In [1]:
import sys
import gc  # Garbage collector interface


from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pickle

# Balance the dataset
from imblearn.over_sampling import SMOTE



In [2]:
# Base names (without the '.pkl' extension) of the pickled dataset shards,
# loaded and concatenated in exactly this order.
svm_data_list = ['svm_dataset_0_5', 'svm_dataset_0', 'svm_dataset_1', 'svm_dataset_1_5', 'svm_dataset_2', 'svm_dataset_3', 'svm_dataset_4', 'svm_dataset_5' ]

# Accumulates the records of every shard into one flat list.
svm_data = []

for shard_name in svm_data_list:
    shard_path = shard_name + '.pkl'
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # shard files that come from a trusted source.
    with open(shard_path, 'rb') as shard_file:
        shard_records = pickle.load(shard_file)
    svm_data.extend(shard_records)

# sys.getsizeof is shallow: the figure below is the size of the list object
# itself, not of the records it references.
list_size_mb = sys.getsizeof(svm_data) / 1024 / 1024
print('Size of svm_dataset:', list_size_mb, 'MB')


Size of svm_dataset: 7.071983337402344 MB


In [3]:
# Total number of records collected from all dataset shards (displayed as the cell output).
len(svm_data)

823935

In [4]:
# Separate the (feature, label) records into a list of flattened features and
# a parallel tuple of labels.
#
# This is done in a single pass rather than with zip(*svm_data): star-unpacking
# would pass every record as a positional argument and build an intermediate
# tuple of the unflattened features, and it fails with an opaque unpacking
# error when svm_data is empty.
features = []
label_list = []
for feature, label in svm_data:
    # flatten() turns each feature into a 1-D vector, the per-sample layout
    # consumed by the scaler and the SVMs in the later cells.
    features.append(feature.flatten())
    label_list.append(label)

# Kept as a tuple, matching what zip(*svm_data) used to produce.
labels = tuple(label_list)
del label_list

# Unique class labels; one classifier per class is trained later on.
classes = set(labels)

# Scaler used by the batch-scaling cell below (nothing is fitted here).
scaler = StandardScaler()

# The raw record list is no longer needed once features/labels are extracted.
del svm_data

# Manually trigger garbage collection so the freed memory is reclaimed promptly.
gc.collect()

print("Done")


Done


In [5]:
# sys.getsizeof is shallow: this reports the size of the `features` list object
# itself, not of the flattened elements it references.
print('Size of features:', sys.getsizeof(features) / 1024 / 1024, 'MB')

Size of features: 6.366233825683594 MB


In [None]:
# Standardize the features in batches, as memory is limited.
#
# Calling scaler.fit_transform() on every batch would re-estimate the mean and
# variance from that batch alone, so each block of rows would be scaled
# differently and `scaler` would only remember the last batch. Instead:
#   pass 1 - accumulate the global mean/variance with partial_fit();
#   pass 2 - transform every batch with those shared statistics.
SCALING_BATCH_SIZE = 1000

# Start from a fresh scaler so statistics from any earlier fit do not leak
# into the running estimates accumulated by partial_fit().
scaler = StandardScaler()

# Pass 1: incrementally fit the scaler on the whole dataset.
for i in range(0, len(features), SCALING_BATCH_SIZE):
    scaler.partial_fit(features[i:i + SCALING_BATCH_SIZE])

# Pass 2: scale each batch with the global statistics.
# NOTE(review): the scaler is fitted on all samples, including those that later
# end up in the test split - confirm this is acceptable for the evaluation.
features_scaled = []
for i in range(0, len(features), SCALING_BATCH_SIZE):
    print(i)  # progress: index of the first row in the current batch
    features_scaled.extend(scaler.transform(features[i:i + SCALING_BATCH_SIZE]))

# The unscaled feature list is no longer needed.
del features

# Manually trigger garbage collection so the freed memory is reclaimed promptly.
gc.collect()

print("Done")


In [7]:
# Train one binary (one-vs-rest) SVM per class and keep the fitted grid search
# for each class label.
svm_classifiers = {}

# Hyper-parameter grid searched for every per-class classifier.
parameters = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}

# Split once, outside the loop. With a fixed random_state and no
# stratification the train/test partition depends only on the number of
# samples, so it is the same for every class; splitting the original labels
# alongside the features avoids re-partitioning the full dataset per class.
X_train, X_test, labels_train, labels_test = train_test_split(
    features_scaled, list(labels), test_size=0.2, random_state=42
)

# Iterate in sorted order: the iteration order of a set of strings can differ
# between kernel runs, which would make the training log non-reproducible.
for class_label in sorted(classes):
    print(f"Training SVM for class: {class_label}")

    # Binary targets for the current class: 1 for this class, 0 for the rest.
    y_train = [1 if label == class_label else 0 for label in labels_train]
    y_test = [1 if label == class_label else 0 for label in labels_test]

    # class_weight='balanced' re-weights the classes to compensate for the
    # imbalance between the positive class and the rest.
    svc = svm.SVC(class_weight='balanced')
    clf = GridSearchCV(svc, parameters, cv=5)
    clf.fit(X_train, y_train)

    # Store the fitted grid search for this class.
    svm_classifiers[class_label] = clf

    # Output the best parameters and the held-out performance metrics.
    print(f"Best parameters for {class_label}: {clf.best_params_}")
    print("Classification Report:")
    print(classification_report(y_test, clf.predict(X_test)))




Training SVM for class: diningtable
