# Wdbc

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Data files: the clean data set followed by its three perturbed variants.
file_names = [
    'wdbc.data.txt',
    'wdbc_noisy_label_feature.txt',
    'wdbc_noisy_label_outlier.txt',
    'wdbc_both_noise_outlier.txt'
]

print('Model: Pin-FS-SVM')
results = []
max_time_per_file = 30 * 60  # 30 minutes. NOTE(review): never enforced in this cell.
# Feature budget B and pinball parameter tau, one entry per file above.
B_values = [5,14,5,10]
tau_values = [0.1,1,0.5,1]
# Scenario label of each file, in the same order as file_names.
noise_types = ['Not noise', 'Noise', 'Outlier', 'Noise + Outlier']
# Weights whose magnitude is at most this are treated as zero (solver round-off).
ZERO_TOL = 1e-8


def _solve_pin_fs_svm(X_train, y_train, C, tau, B, lower, upper):
    """Build and solve one Pin-FS-SVM mixed-integer program.

    Minimises ``sum_j z_j + C * sum_i xi_i`` subject to, for every sample i,
    ``y_i (w.x_i + b) >= 1 - xi_i`` and ``y_i (w.x_i + b) <= 1 + xi_i / tau``,
    and for every feature j, ``lower_j v_j <= w_j <= upper_j v_j``,
    ``-z_j <= w_j <= z_j``, with ``sum_j v_j <= B`` and ``v_j`` binary.

    Args:
        X_train: 2-D array of training samples (rows) by features (columns).
        y_train: labels aligned with the rows of ``X_train``.
        C: weight of the slack term in the objective.
        tau: pinball parameter used in the second margin constraint.
        B: maximum number of features whose indicator ``v_j`` may be 1.
        lower, upper: per-feature bounds applied to ``w_j`` when ``v_j`` is 1.

    Returns:
        ``(w, b, v)`` with ``w`` and ``v`` as numpy arrays and ``b`` a scalar,
        or ``None`` when the solver returns no solution.
    """
    m, n = X_train.shape

    # The with-block ends the model on exit, releasing the solver resources
    # of each of the many models built during cross-validation.
    with Model(name='Pin-FS-SVM') as mdl:
        # docplex continuous variables default to a lower bound of 0; w and b
        # must be free, otherwise negative weights/bias are impossible and the
        # negative entries of `lower` never take effect.
        free = -mdl.infinity
        w = mdl.continuous_var_list(n, lb=free, name='w')
        b = mdl.continuous_var(lb=free, name='b')
        v = mdl.binary_var_list(n, name='v')
        xi = mdl.continuous_var_list(m, lb=0, name='xi')
        z = mdl.continuous_var_list(n, lb=0, name='z')

        # Objective
        mdl.minimize(mdl.sum(z[j] for j in range(n)) + C * mdl.sum(xi[i] for i in range(m)))

        # Classification (margin) constraints
        for i in range(m):
            # Decision value w.x_i + b, built once and reused by both constraints.
            score = mdl.sum(w[j] * X_train[i, j] for j in range(n)) + b
            mdl.add_constraint(y_train[i] * score >= 1 - xi[i])
            mdl.add_constraint(y_train[i] * score <= 1 + xi[i] * (1/tau))

        # At most B features may be selected
        mdl.add_constraint(mdl.sum(v[j] for j in range(n)) <= B)

        # Link w_j to its indicator v_j and to z_j >= |w_j|
        for j in range(n):
            mdl.add_constraint(w[j] <= v[j] * upper[j])
            mdl.add_constraint(w[j] >= lower[j] * v[j])
            mdl.add_constraint(w[j] <= z[j])
            mdl.add_constraint(w[j] >= -z[j])

        solution = mdl.solve()
        if not solution:
            return None

        w_opt = np.array([solution.get_value(var) for var in w])
        b_opt = solution.get_value(b)
        v_opt = np.array([solution.get_value(var) for var in v])
        return w_opt, b_opt, v_opt


for file_name, B, tau, noise_type in zip(file_names, B_values, tau_values, noise_types):
    print(f"\nXử lý file: {file_name}")
    print(noise_type)

    df = pd.read_csv(file_name, header= None)

    # Split features and labels: column 1 is the label, columns 2.. the features
    X = df.iloc[:, 2:].values
    y = df.iloc[:, 1].values

    # Encode labels: 'M' -> +1, anything else -> -1
    y = np.where(y == 'M', 1, -1)

    # Re-fitted on the training part of every fold
    scaler = StandardScaler()

    # Model parameters
    n = X.shape[1]
    l = [-2] * n
    u = [2] * n
    C_values = [2**i for i in range(-3, 6)]

    # The seed is fixed, so every C is evaluated on the same 10 folds.
    folds = list(KFold(n_splits=10, shuffle=True, random_state=42).split(X))

    # Reset per file so that nothing leaks in from the previous data set.
    best_C = None
    best_w = best_b = best_v = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = 0

    # Rows of the per-C AUC log
    c_auc_data = []

    for C in C_values:
        cv_accuracies = []
        cv_AUC = []
        cv_times = []
        last_fit = None  # last solved fold for THIS C only

        for train_index, val_index in folds:
            start_time = time.time()
            y_cv_train, y_cv_val = y[train_index], y[val_index]

            # Standardise with statistics of the training part only
            X_cv_train = scaler.fit_transform(X[train_index])
            X_cv_val = scaler.transform(X[val_index])

            fit = _solve_pin_fs_svm(X_cv_train, y_cv_train, C, tau, B, l, u)
            cv_times.append(time.time() - start_time)

            if fit is None:
                print("Solution not found for fold.")
                continue

            last_fit = fit
            w_opt, b_opt, v_opt = fit
            y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
            cv_accuracies.append(accuracy_score(y_cv_val, y_cv_pred))
            # NOTE(review): AUC is computed from hard +/-1 predictions rather
            # than decision values; kept as-is so reported numbers stay comparable.
            cv_AUC.append(roc_auc_score(y_cv_val, y_cv_pred))

        # NaN (not a crash) when no fold could be solved for this C
        mean_cv_accuracy = np.mean(cv_accuracies) if cv_accuracies else float('nan')
        mean_cv_AUC = np.mean(cv_AUC) if cv_AUC else float('nan')
        mean_time_per_fold = np.mean(cv_times)

        # Log C and AUC values
        c_auc_data.append({
            'C': C,
            'AUC': mean_cv_AUC,
            'Dataset': file_name
        })

        if last_fit is not None and mean_cv_AUC > best_mean_cv_AUC:
            best_C = C
            best_mean_cv_accuracy = mean_cv_accuracy
            best_mean_cv_AUC = mean_cv_AUC
            best_mean_time_per_fold = mean_time_per_fold
            # The reported model is the last solved fold of the best C
            # (there is no refit on the full data set).
            best_w, best_b, best_v = last_fit

    # Save C-AUC data to CSV for this dataset
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'wdbc_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # Nothing to report; avoid NameError / stale values from another file.
        print(f'No C value produced a usable model for {file_name}; skipping.')
        print('-' * 100)
        continue

    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-negligible weight
    selected_features = [j + 1 for j in range(n) if abs(best_w[j]) > ZERO_TOL]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)
    
    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)

# Diabetes

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Data files: the clean data set followed by its three perturbed variants.
file_names = [
    'diabetes.csv',
    'diabetes_noise_label_feature.csv',
    'diabetes_outlier.csv',
    "diabetes_both_noise_outlier.csv"
]
print('Model: Pin-FS-SVM')
results = []
max_time_per_file = 30 * 60  # 30 minutes. NOTE(review): never enforced in this cell.
# Feature budget B and pinball parameter tau, one entry per file above.
B_values = [5,5,4,5]
tau_values = [1,0.5,0.1,0.1]
# Scenario label of each file, in the same order as file_names.
noise_types = ['Not noise', 'Noise', 'Outlier', 'Noise + Outlier']
# Weights whose magnitude is at most this are treated as zero (solver round-off).
ZERO_TOL = 1e-8


def _solve_pin_fs_svm(X_train, y_train, C, tau, B, lower, upper):
    """Build and solve one Pin-FS-SVM mixed-integer program.

    Minimises ``sum_j z_j + C * sum_i xi_i`` subject to, for every sample i,
    ``y_i (w.x_i + b) >= 1 - xi_i`` and ``y_i (w.x_i + b) <= 1 + xi_i / tau``,
    and for every feature j, ``lower_j v_j <= w_j <= upper_j v_j``,
    ``-z_j <= w_j <= z_j``, with ``sum_j v_j <= B`` and ``v_j`` binary.

    Args:
        X_train: 2-D array of training samples (rows) by features (columns).
        y_train: labels aligned with the rows of ``X_train``.
        C: weight of the slack term in the objective.
        tau: pinball parameter used in the second margin constraint.
        B: maximum number of features whose indicator ``v_j`` may be 1.
        lower, upper: per-feature bounds applied to ``w_j`` when ``v_j`` is 1.

    Returns:
        ``(w, b, v)`` with ``w`` and ``v`` as numpy arrays and ``b`` a scalar,
        or ``None`` when the solver returns no solution.
    """
    m, n = X_train.shape

    # The with-block ends the model on exit, releasing the solver resources
    # of each of the many models built during cross-validation.
    with Model(name='Pin-FS-SVM') as mdl:
        # docplex continuous variables default to a lower bound of 0; w and b
        # must be free, otherwise negative weights/bias are impossible and the
        # negative entries of `lower` never take effect.
        free = -mdl.infinity
        w = mdl.continuous_var_list(n, lb=free, name='w')
        b = mdl.continuous_var(lb=free, name='b')
        v = mdl.binary_var_list(n, name='v')
        xi = mdl.continuous_var_list(m, lb=0, name='xi')
        z = mdl.continuous_var_list(n, lb=0, name='z')

        # Objective
        mdl.minimize(mdl.sum(z[j] for j in range(n)) + C * mdl.sum(xi[i] for i in range(m)))

        # Classification (margin) constraints
        for i in range(m):
            # Decision value w.x_i + b, built once and reused by both constraints.
            score = mdl.sum(w[j] * X_train[i, j] for j in range(n)) + b
            mdl.add_constraint(y_train[i] * score >= 1 - xi[i])
            mdl.add_constraint(y_train[i] * score <= 1 + xi[i] * (1/tau))

        # At most B features may be selected
        mdl.add_constraint(mdl.sum(v[j] for j in range(n)) <= B)

        # Link w_j to its indicator v_j and to z_j >= |w_j|
        for j in range(n):
            mdl.add_constraint(w[j] <= v[j] * upper[j])
            mdl.add_constraint(w[j] >= lower[j] * v[j])
            mdl.add_constraint(w[j] <= z[j])
            mdl.add_constraint(w[j] >= -z[j])

        solution = mdl.solve()
        if not solution:
            return None

        w_opt = np.array([solution.get_value(var) for var in w])
        b_opt = solution.get_value(b)
        v_opt = np.array([solution.get_value(var) for var in v])
        return w_opt, b_opt, v_opt


for file_name, B, tau, noise_type in zip(file_names, B_values, tau_values, noise_types):
    print(f"\nXử lý file: {file_name}")
    print(noise_type)

    df = pd.read_csv(file_name)

    # Last column is the label, all preceding columns are features
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    # Encode labels: 0 -> -1, anything else -> +1
    y = np.where(y == 0, -1, 1)

    # Re-fitted on the training part of every fold
    scaler = StandardScaler()

    # Model parameters
    n = X.shape[1]
    l = [-2] * n
    u = [2] * n
    C_values = [2**i for i in range(-3, 6)]

    # The seed is fixed, so every C is evaluated on the same 10 folds.
    folds = list(KFold(n_splits=10, shuffle=True, random_state=42).split(X))

    # Reset per file so that nothing leaks in from the previous data set.
    best_C = None
    best_w = best_b = best_v = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = 0

    # Rows of the per-C AUC log
    c_auc_data = []

    for C in C_values:
        cv_accuracies = []
        cv_AUC = []
        cv_times = []
        last_fit = None  # last solved fold for THIS C only

        for train_index, val_index in folds:
            start_time = time.time()
            y_cv_train, y_cv_val = y[train_index], y[val_index]

            # Standardise with statistics of the training part only
            X_cv_train = scaler.fit_transform(X[train_index])
            X_cv_val = scaler.transform(X[val_index])

            fit = _solve_pin_fs_svm(X_cv_train, y_cv_train, C, tau, B, l, u)
            cv_times.append(time.time() - start_time)

            if fit is None:
                print("Solution not found for fold.")
                continue

            last_fit = fit
            w_opt, b_opt, v_opt = fit
            y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
            cv_accuracies.append(accuracy_score(y_cv_val, y_cv_pred))
            # NOTE(review): AUC is computed from hard +/-1 predictions rather
            # than decision values; kept as-is so reported numbers stay comparable.
            cv_AUC.append(roc_auc_score(y_cv_val, y_cv_pred))

        # NaN (not a crash) when no fold could be solved for this C
        mean_cv_accuracy = np.mean(cv_accuracies) if cv_accuracies else float('nan')
        mean_cv_AUC = np.mean(cv_AUC) if cv_AUC else float('nan')
        mean_time_per_fold = np.mean(cv_times)

        # Log C and AUC values
        c_auc_data.append({
            'C': C,
            'AUC': mean_cv_AUC,
            'Dataset': file_name
        })

        if last_fit is not None and mean_cv_AUC > best_mean_cv_AUC:
            best_C = C
            best_mean_cv_accuracy = mean_cv_accuracy
            best_mean_cv_AUC = mean_cv_AUC
            best_mean_time_per_fold = mean_time_per_fold
            # The reported model is the last solved fold of the best C
            # (there is no refit on the full data set).
            best_w, best_b, best_v = last_fit

    # Save C-AUC data to CSV for this dataset
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'diabetes_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # Nothing to report; avoid NameError / stale values from another file.
        print(f'No C value produced a usable model for {file_name}; skipping.')
        print('-' * 100)
        continue

    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-negligible weight
    selected_features = [j + 1 for j in range(n) if abs(best_w[j]) > ZERO_TOL]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)
    
    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)

# Ionosphere

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Data files: the clean data set followed by its three perturbed variants.
file_names = [
    'ionosphere.data',
    'ionosphere_noise_label_feature.txt',
    'ionosphere_outlier.txt',
    "ionosphere_both_noise_outlier.txt"
]
print('Model: Pin-FS-SVM')
results = []
max_time_per_file = 30 * 60  # 30 minutes. NOTE(review): never enforced in this cell.
# Feature budget B and pinball parameter tau, one entry per file above.
B_values = [12,14,12,2]
tau_values = [2,1,2,0.5]
# Scenario label of each file, in the same order as file_names.
noise_types = ['Not noise', 'Noise', 'Outlier', 'Noise + Outlier']
# Weights whose magnitude is at most this are treated as zero (solver round-off).
ZERO_TOL = 1e-8


def _solve_pin_fs_svm(X_train, y_train, C, tau, B, lower, upper):
    """Build and solve one Pin-FS-SVM mixed-integer program.

    Minimises ``sum_j z_j + C * sum_i xi_i`` subject to, for every sample i,
    ``y_i (w.x_i + b) >= 1 - xi_i`` and ``y_i (w.x_i + b) <= 1 + xi_i / tau``,
    and for every feature j, ``lower_j v_j <= w_j <= upper_j v_j``,
    ``-z_j <= w_j <= z_j``, with ``sum_j v_j <= B`` and ``v_j`` binary.

    Args:
        X_train: 2-D array of training samples (rows) by features (columns).
        y_train: labels aligned with the rows of ``X_train``.
        C: weight of the slack term in the objective.
        tau: pinball parameter used in the second margin constraint.
        B: maximum number of features whose indicator ``v_j`` may be 1.
        lower, upper: per-feature bounds applied to ``w_j`` when ``v_j`` is 1.

    Returns:
        ``(w, b, v)`` with ``w`` and ``v`` as numpy arrays and ``b`` a scalar,
        or ``None`` when the solver returns no solution.
    """
    m, n = X_train.shape

    # The with-block ends the model on exit, releasing the solver resources
    # of each of the many models built during cross-validation.
    with Model(name='Pin-FS-SVM') as mdl:
        # docplex continuous variables default to a lower bound of 0; w and b
        # must be free, otherwise negative weights/bias are impossible and the
        # negative entries of `lower` never take effect.
        free = -mdl.infinity
        w = mdl.continuous_var_list(n, lb=free, name='w')
        b = mdl.continuous_var(lb=free, name='b')
        v = mdl.binary_var_list(n, name='v')
        xi = mdl.continuous_var_list(m, lb=0, name='xi')
        z = mdl.continuous_var_list(n, lb=0, name='z')

        # Objective
        mdl.minimize(mdl.sum(z[j] for j in range(n)) + C * mdl.sum(xi[i] for i in range(m)))

        # Classification (margin) constraints
        for i in range(m):
            # Decision value w.x_i + b, built once and reused by both constraints.
            score = mdl.sum(w[j] * X_train[i, j] for j in range(n)) + b
            mdl.add_constraint(y_train[i] * score >= 1 - xi[i])
            mdl.add_constraint(y_train[i] * score <= 1 + xi[i] * (1/tau))

        # At most B features may be selected
        mdl.add_constraint(mdl.sum(v[j] for j in range(n)) <= B)

        # Link w_j to its indicator v_j and to z_j >= |w_j|
        for j in range(n):
            mdl.add_constraint(w[j] <= v[j] * upper[j])
            mdl.add_constraint(w[j] >= lower[j] * v[j])
            mdl.add_constraint(w[j] <= z[j])
            mdl.add_constraint(w[j] >= -z[j])

        solution = mdl.solve()
        if not solution:
            return None

        w_opt = np.array([solution.get_value(var) for var in w])
        b_opt = solution.get_value(b)
        v_opt = np.array([solution.get_value(var) for var in v])
        return w_opt, b_opt, v_opt


for file_name, B, tau, noise_type in zip(file_names, B_values, tau_values, noise_types):
    print(f"\nXử lý file: {file_name}")
    print(noise_type)

    df = pd.read_csv(file_name, header=None)

    # Split features and labels: last column is the label
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    # Encode labels: 'g' -> +1, anything else -> -1
    y = np.where(y == 'g', 1, -1)

    # Re-fitted on the training part of every fold
    scaler = StandardScaler()

    # Model parameters
    n = X.shape[1]
    l = [-2] * n
    u = [2] * n
    C_values = [2**i for i in range(-3, 6)]

    # The seed is fixed, so every C is evaluated on the same 10 folds.
    folds = list(KFold(n_splits=10, shuffle=True, random_state=42).split(X))

    # Reset per file so that nothing leaks in from the previous data set.
    best_C = None
    best_w = best_b = best_v = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = 0

    # Rows of the per-C AUC log
    c_auc_data = []

    for C in C_values:
        cv_accuracies = []
        cv_AUC = []
        cv_times = []
        last_fit = None  # last solved fold for THIS C only

        for train_index, val_index in folds:
            start_time = time.time()
            y_cv_train, y_cv_val = y[train_index], y[val_index]

            # Standardise with statistics of the training part only
            X_cv_train = scaler.fit_transform(X[train_index])
            X_cv_val = scaler.transform(X[val_index])

            fit = _solve_pin_fs_svm(X_cv_train, y_cv_train, C, tau, B, l, u)
            cv_times.append(time.time() - start_time)

            if fit is None:
                print("Solution not found for fold.")
                continue

            last_fit = fit
            w_opt, b_opt, v_opt = fit
            y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
            cv_accuracies.append(accuracy_score(y_cv_val, y_cv_pred))
            # NOTE(review): AUC is computed from hard +/-1 predictions rather
            # than decision values; kept as-is so reported numbers stay comparable.
            cv_AUC.append(roc_auc_score(y_cv_val, y_cv_pred))

        # NaN (not a crash) when no fold could be solved for this C
        mean_cv_accuracy = np.mean(cv_accuracies) if cv_accuracies else float('nan')
        mean_cv_AUC = np.mean(cv_AUC) if cv_AUC else float('nan')
        mean_time_per_fold = np.mean(cv_times)

        # Log C and AUC values
        c_auc_data.append({
            'C': C,
            'AUC': mean_cv_AUC,
            'Dataset': file_name
        })

        if last_fit is not None and mean_cv_AUC > best_mean_cv_AUC:
            best_C = C
            best_mean_cv_accuracy = mean_cv_accuracy
            best_mean_cv_AUC = mean_cv_AUC
            best_mean_time_per_fold = mean_time_per_fold
            # The reported model is the last solved fold of the best C
            # (there is no refit on the full data set).
            best_w, best_b, best_v = last_fit

    # Save C-AUC data to CSV for this dataset
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'ionosphere_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # Nothing to report; avoid NameError / stale values from another file.
        print(f'No C value produced a usable model for {file_name}; skipping.')
        print('-' * 100)
        continue

    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-negligible weight
    selected_features = [j + 1 for j in range(n) if abs(best_w[j]) > ZERO_TOL]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)
    
    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)

# Sonar

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Data files: the clean data set followed by its three perturbed variants.
file_names = [
    'sonar.txt',
    'sonar_noise_label_feature.txt',
    'sonar_outlier.txt',
    'sonar_both_noise_outlier.txt'
]
print('Model: Pin-FS-SVM')
results = []
max_time_per_file = 30 * 60  # 30 minutes. NOTE(review): never enforced in this cell.
# Feature budget B and pinball parameter tau, one entry per file above.
B_values = [8,5,2,7]
tau_values = [0.5,0.5,0.1,0.1]
# Scenario label of each file, in the same order as file_names.
noise_types = ['Not noise', 'Noise', 'Outlier', 'Noise + Outlier']
# Weights whose magnitude is at most this are treated as zero (solver round-off).
ZERO_TOL = 1e-8


def _solve_pin_fs_svm(X_train, y_train, C, tau, B, lower, upper):
    """Build and solve one Pin-FS-SVM mixed-integer program.

    Minimises ``sum_j z_j + C * sum_i xi_i`` subject to, for every sample i,
    ``y_i (w.x_i + b) >= 1 - xi_i`` and ``y_i (w.x_i + b) <= 1 + xi_i / tau``,
    and for every feature j, ``lower_j v_j <= w_j <= upper_j v_j``,
    ``-z_j <= w_j <= z_j``, with ``sum_j v_j <= B`` and ``v_j`` binary.

    Args:
        X_train: 2-D array of training samples (rows) by features (columns).
        y_train: labels aligned with the rows of ``X_train``.
        C: weight of the slack term in the objective.
        tau: pinball parameter used in the second margin constraint.
        B: maximum number of features whose indicator ``v_j`` may be 1.
        lower, upper: per-feature bounds applied to ``w_j`` when ``v_j`` is 1.

    Returns:
        ``(w, b, v)`` with ``w`` and ``v`` as numpy arrays and ``b`` a scalar,
        or ``None`` when the solver returns no solution.
    """
    m, n = X_train.shape

    # The with-block ends the model on exit, releasing the solver resources
    # of each of the many models built during cross-validation.
    with Model(name='Pin-FS-SVM') as mdl:
        # docplex continuous variables default to a lower bound of 0; w and b
        # must be free, otherwise negative weights/bias are impossible and the
        # negative entries of `lower` never take effect.
        free = -mdl.infinity
        w = mdl.continuous_var_list(n, lb=free, name='w')
        b = mdl.continuous_var(lb=free, name='b')
        v = mdl.binary_var_list(n, name='v')
        xi = mdl.continuous_var_list(m, lb=0, name='xi')
        z = mdl.continuous_var_list(n, lb=0, name='z')

        # Objective
        mdl.minimize(mdl.sum(z[j] for j in range(n)) + C * mdl.sum(xi[i] for i in range(m)))

        # Classification (margin) constraints
        for i in range(m):
            # Decision value w.x_i + b, built once and reused by both constraints.
            score = mdl.sum(w[j] * X_train[i, j] for j in range(n)) + b
            mdl.add_constraint(y_train[i] * score >= 1 - xi[i])
            mdl.add_constraint(y_train[i] * score <= 1 + xi[i] * (1/tau))

        # At most B features may be selected
        mdl.add_constraint(mdl.sum(v[j] for j in range(n)) <= B)

        # Link w_j to its indicator v_j and to z_j >= |w_j|
        for j in range(n):
            mdl.add_constraint(w[j] <= v[j] * upper[j])
            mdl.add_constraint(w[j] >= lower[j] * v[j])
            mdl.add_constraint(w[j] <= z[j])
            mdl.add_constraint(w[j] >= -z[j])

        solution = mdl.solve()
        if not solution:
            return None

        w_opt = np.array([solution.get_value(var) for var in w])
        b_opt = solution.get_value(b)
        v_opt = np.array([solution.get_value(var) for var in v])
        return w_opt, b_opt, v_opt


for file_name, B, tau, noise_type in zip(file_names, B_values, tau_values, noise_types):
    print(f"\nXử lý file: {file_name}")
    print(noise_type)

    df = pd.read_csv(file_name, header=None)

    # Columns 0..59 are the features, column 60 the label
    X = df.iloc[:, 0:60].values
    y = df .iloc[:,60].values
    # Encode labels: 'M' -> +1, anything else -> -1
    y = np.where(y == 'M', 1, -1)

    # Re-fitted on the training part of every fold
    scaler = StandardScaler()

    # Model parameters
    n = X.shape[1]
    l = [-2] * n
    u = [2] * n
    C_values = [2**i for i in range(-3, 6)]

    # The seed is fixed, so every C is evaluated on the same 10 folds.
    folds = list(KFold(n_splits=10, shuffle=True, random_state=42).split(X))

    # Reset per file so that nothing leaks in from the previous data set.
    best_C = None
    best_w = best_b = best_v = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = 0

    # Rows of the per-C AUC log
    c_auc_data = []

    for C in C_values:
        cv_accuracies = []
        cv_AUC = []
        cv_times = []
        last_fit = None  # last solved fold for THIS C only

        for train_index, val_index in folds:
            start_time = time.time()
            y_cv_train, y_cv_val = y[train_index], y[val_index]

            # Standardise with statistics of the training part only
            X_cv_train = scaler.fit_transform(X[train_index])
            X_cv_val = scaler.transform(X[val_index])

            fit = _solve_pin_fs_svm(X_cv_train, y_cv_train, C, tau, B, l, u)
            cv_times.append(time.time() - start_time)

            if fit is None:
                print("Solution not found for fold.")
                continue

            last_fit = fit
            w_opt, b_opt, v_opt = fit
            y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
            cv_accuracies.append(accuracy_score(y_cv_val, y_cv_pred))
            # NOTE(review): AUC is computed from hard +/-1 predictions rather
            # than decision values; kept as-is so reported numbers stay comparable.
            cv_AUC.append(roc_auc_score(y_cv_val, y_cv_pred))

        # NaN (not a crash) when no fold could be solved for this C
        mean_cv_accuracy = np.mean(cv_accuracies) if cv_accuracies else float('nan')
        mean_cv_AUC = np.mean(cv_AUC) if cv_AUC else float('nan')
        mean_time_per_fold = np.mean(cv_times)

        # Log C and AUC values
        c_auc_data.append({
            'C': C,
            'AUC': mean_cv_AUC,
            'Dataset': file_name
        })

        if last_fit is not None and mean_cv_AUC > best_mean_cv_AUC:
            best_C = C
            best_mean_cv_accuracy = mean_cv_accuracy
            best_mean_cv_AUC = mean_cv_AUC
            best_mean_time_per_fold = mean_time_per_fold
            # The reported model is the last solved fold of the best C
            # (there is no refit on the full data set).
            best_w, best_b, best_v = last_fit

    # Save C-AUC data to CSV for this dataset
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'sonar_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # Nothing to report; avoid NameError / stale values from another file.
        print(f'No C value produced a usable model for {file_name}; skipping.')
        print('-' * 100)
        continue

    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-negligible weight
    selected_features = [j + 1 for j in range(n) if abs(best_w[j]) > ZERO_TOL]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)
    
    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)

# Australia

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Data files: the clean data set followed by its three perturbed variants.
file_names = [
    "australia.txt",
    "australia_noise_label_feature.txt",
    "australia_outlier.txt",
    'australia_both_noise_outlier.txt'
]
print('Model: Pin-FS-SVM')
results = []
max_time_per_file = 30 * 60  # 30 minutes. NOTE(review): never enforced in this cell.
# Feature budget B and pinball parameter tau, one entry per file above.
B_values = [1,1,1,1]
tau_values = [0.1,0.1,0.1,0.1]
# Scenario label of each file, in the same order as file_names.
noise_types = ['Not noise', 'Noise', 'Outlier', 'Noise + Outlier']
# Field separator of each file: only the first file is read with sep=' ',
# the others use the pandas default ','.
separators = [' ', ',', ',', ',']
# Weights whose magnitude is at most this are treated as zero (solver round-off).
ZERO_TOL = 1e-8


def _solve_pin_fs_svm(X_train, y_train, C, tau, B, lower, upper):
    """Build and solve one Pin-FS-SVM mixed-integer program.

    Minimises ``sum_j z_j + C * sum_i xi_i`` subject to, for every sample i,
    ``y_i (w.x_i + b) >= 1 - xi_i`` and ``y_i (w.x_i + b) <= 1 + xi_i / tau``,
    and for every feature j, ``lower_j v_j <= w_j <= upper_j v_j``,
    ``-z_j <= w_j <= z_j``, with ``sum_j v_j <= B`` and ``v_j`` binary.

    Args:
        X_train: 2-D array of training samples (rows) by features (columns).
        y_train: labels aligned with the rows of ``X_train``.
        C: weight of the slack term in the objective.
        tau: pinball parameter used in the second margin constraint.
        B: maximum number of features whose indicator ``v_j`` may be 1.
        lower, upper: per-feature bounds applied to ``w_j`` when ``v_j`` is 1.

    Returns:
        ``(w, b, v)`` with ``w`` and ``v`` as numpy arrays and ``b`` a scalar,
        or ``None`` when the solver returns no solution.
    """
    m, n = X_train.shape

    # The with-block ends the model on exit, releasing the solver resources
    # of each of the many models built during cross-validation.
    with Model(name='Pin-FS-SVM') as mdl:
        # docplex continuous variables default to a lower bound of 0; w and b
        # must be free, otherwise negative weights/bias are impossible and the
        # negative entries of `lower` never take effect.
        free = -mdl.infinity
        w = mdl.continuous_var_list(n, lb=free, name='w')
        b = mdl.continuous_var(lb=free, name='b')
        v = mdl.binary_var_list(n, name='v')
        xi = mdl.continuous_var_list(m, lb=0, name='xi')
        z = mdl.continuous_var_list(n, lb=0, name='z')

        # Objective
        mdl.minimize(mdl.sum(z[j] for j in range(n)) + C * mdl.sum(xi[i] for i in range(m)))

        # Classification (margin) constraints
        for i in range(m):
            # Decision value w.x_i + b, built once and reused by both constraints.
            score = mdl.sum(w[j] * X_train[i, j] for j in range(n)) + b
            mdl.add_constraint(y_train[i] * score >= 1 - xi[i])
            mdl.add_constraint(y_train[i] * score <= 1 + xi[i] * (1/tau))

        # At most B features may be selected
        mdl.add_constraint(mdl.sum(v[j] for j in range(n)) <= B)

        # Link w_j to its indicator v_j and to z_j >= |w_j|
        for j in range(n):
            mdl.add_constraint(w[j] <= v[j] * upper[j])
            mdl.add_constraint(w[j] >= lower[j] * v[j])
            mdl.add_constraint(w[j] <= z[j])
            mdl.add_constraint(w[j] >= -z[j])

        solution = mdl.solve()
        if not solution:
            return None

        w_opt = np.array([solution.get_value(var) for var in w])
        b_opt = solution.get_value(b)
        v_opt = np.array([solution.get_value(var) for var in v])
        return w_opt, b_opt, v_opt


for file_name, B, tau, noise_type, sep in zip(file_names, B_values, tau_values, noise_types, separators):
    print(f"\nXử lý file: {file_name}")
    print(noise_type)

    df = pd.read_csv(file_name, header= None, sep=sep)

    # Columns 0..13 are the features, column 14 the label
    X = df.iloc[:,:14].values
    y = df.iloc[:,14].values
    # Encode labels: 0 -> -1, every other value is kept unchanged
    y = np.where(y == 0, -1, y)

    # Re-fitted on the training part of every fold
    scaler = StandardScaler()

    # Model parameters
    n = X.shape[1]
    l = [-2] * n
    u = [2] * n
    C_values = [2**i for i in range(-3, 6)]

    # The seed is fixed, so every C is evaluated on the same 10 folds.
    folds = list(KFold(n_splits=10, shuffle=True, random_state=42).split(X))

    # Reset per file so that nothing leaks in from the previous data set.
    best_C = None
    best_w = best_b = best_v = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = 0

    # Rows of the per-C AUC log
    c_auc_data = []

    for C in C_values:
        cv_accuracies = []
        cv_AUC = []
        cv_times = []
        last_fit = None  # last solved fold for THIS C only

        for train_index, val_index in folds:
            start_time = time.time()
            y_cv_train, y_cv_val = y[train_index], y[val_index]

            # Standardise with statistics of the training part only
            X_cv_train = scaler.fit_transform(X[train_index])
            X_cv_val = scaler.transform(X[val_index])

            fit = _solve_pin_fs_svm(X_cv_train, y_cv_train, C, tau, B, l, u)
            cv_times.append(time.time() - start_time)

            if fit is None:
                print("Solution not found for fold.")
                continue

            last_fit = fit
            w_opt, b_opt, v_opt = fit
            y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
            cv_accuracies.append(accuracy_score(y_cv_val, y_cv_pred))
            # NOTE(review): AUC is computed from hard +/-1 predictions rather
            # than decision values; kept as-is so reported numbers stay comparable.
            cv_AUC.append(roc_auc_score(y_cv_val, y_cv_pred))

        # NaN (not a crash) when no fold could be solved for this C
        mean_cv_accuracy = np.mean(cv_accuracies) if cv_accuracies else float('nan')
        mean_cv_AUC = np.mean(cv_AUC) if cv_AUC else float('nan')
        mean_time_per_fold = np.mean(cv_times)

        # Log C and AUC values
        c_auc_data.append({
            'C': C,
            'AUC': mean_cv_AUC,
            'Dataset': file_name
        })

        if last_fit is not None and mean_cv_AUC > best_mean_cv_AUC:
            best_C = C
            best_mean_cv_accuracy = mean_cv_accuracy
            best_mean_cv_AUC = mean_cv_AUC
            best_mean_time_per_fold = mean_time_per_fold
            # The reported model is the last solved fold of the best C
            # (there is no refit on the full data set).
            best_w, best_b, best_v = last_fit

    # Save C-AUC data to CSV for this dataset
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'aus_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # Nothing to report; avoid NameError / stale values from another file.
        print(f'No C value produced a usable model for {file_name}; skipping.')
        print('-' * 100)
        continue

    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-negligible weight
    selected_features = [j + 1 for j in range(n) if abs(best_w[j]) > ZERO_TOL]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)
    
    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)

# Cleveland

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from docplex.mp.model import Model
import time
import warnings
warnings.filterwarnings("ignore")
from joblib import parallel_backend
import multiprocessing

n_cores = multiprocessing.cpu_count()
import csv

# Input files for the Cleveland experiment. The loop below matches each
# position to a data condition and to the entries of B_values / tau_values:
# 0 = clean data, 1 = noise, 2 = outliers, 3 = noise + outliers.
file_names = [
    'Heart_disease_cleveland_new.csv',
    'clevaland_noise_label_feature.csv',
    'clevaland_outlier.csv',
    'cleveland_both_noise_outlier.csv',
]

# Feature budget B (upper bound on the number of selected features) per file.
B_values = [6, 4, 6, 9]

# Pinball-loss parameter tau per file (identical for all four files here).
tau_values = [0.1] * 4

# 30 minutes, presumably in seconds; not referenced by the visible code of this cell.
max_time_per_file = 60 * 30

print('Model: Pin-FS-SVM')

# One summary dict per processed file is appended here.
results = []

# Data-condition label for each position in file_names. Any position past the
# end of this tuple falls back to the last entry, as the original else-branch did.
NOISE_TYPES = ('Not noise', 'Noise', 'Outlier', 'Noise + Outlier')

# For every dataset: grid-search C with 10-fold cross-validation of the
# Pin-FS-SVM mixed-integer model, log the mean AUC per C to a CSV file, and
# append a summary of the best C to `results`.
for iteration, file_name in enumerate(file_names):
    print(f"\nXử lý file: {file_name}")

    setting = min(iteration, len(NOISE_TYPES) - 1)
    noise_type = NOISE_TYPES[setting]
    if setting < len(NOISE_TYPES) - 1:
        # Only the first three conditions echo their label (kept as in the original).
        print(noise_type)
    B = B_values[setting]
    tau = tau_values[setting]

    df = pd.read_csv(file_name, header=None)

    # The file is read without a header and row 0 is dropped -- presumably it
    # holds the column names. Columns 0..12 are the features, column 13 the label.
    X = df.iloc[1:, 0:13].values.astype(float)
    y = df.iloc[1:, 13].values.astype(float)
    # Recode label 0 as -1; every other label value is kept as is.
    y = np.where(y == 0, -1, y)

    # Standardisation is re-fitted on the training part of every fold below.
    scaler = StandardScaler()

    # Model parameters: lower/upper bounds on each weight and the C grid.
    l = [-4] * X.shape[1]
    u = [4] * X.shape[1]
    C_values = [2**i for i in range(-3, 6)]

    n = X.shape[1]

    # Reset the "best so far" state for THIS dataset. Without the reset, a
    # dataset for which no C yields a solved fold would either raise NameError
    # (first dataset) or silently report the previous dataset's model.
    best_C = None
    best_mean_cv_accuracy = 0
    best_mean_cv_AUC = 0
    best_mean_time_per_fold = None
    best_w = None
    best_b = None
    best_v = None

    # One {'C', 'AUC', 'Dataset'} record per value of C.
    c_auc_data = []

    # NOTE(review): no joblib-parallel call is visible inside this context;
    # confirm that the backend setting has any effect on the solver.
    with parallel_backend('loky', n_jobs=n_cores):
        for C in C_values:
            kf = KFold(n_splits=10, shuffle=True, random_state=42)
            cv_accuracies = []
            cv_AUC = []
            cv_times = []
            w_opt = None
            b_opt = None
            v_opt = None

            for train_index, val_index in kf.split(X):
                start_time = time.time()
                X_cv_train, X_cv_val = X[train_index], X[val_index]
                y_cv_train, y_cv_val = y[train_index], y[val_index]

                # Standardise using statistics of the training fold only.
                X_cv_train = scaler.fit_transform(X_cv_train)
                X_cv_val = scaler.transform(X_cv_val)

                # Build the optimisation model.
                opt_mod = Model(name='Pin-FS-SVM')
                m, n = X_cv_train.shape

                # Decision variables: weights w, bias b, binary selectors v,
                # non-negative slacks xi and non-negative bounds z on |w|.
                w = opt_mod.continuous_var_list(n, name='w')
                b = opt_mod.continuous_var(name='b')
                v = opt_mod.binary_var_list(n, name='v')
                xi = opt_mod.continuous_var_list(m, lb=0, name='xi')
                z = opt_mod.continuous_var_list(n, lb=0, name='z')

                # Objective: sum of z plus C times the sum of slacks.
                opt_mod.minimize(opt_mod.sum(z[j] for j in range(n)) + C * opt_mod.sum(xi[i] for i in range(m)))

                # Classification constraints: the signed margin is bounded
                # below by 1 - xi and above by 1 + xi / tau. The margin
                # expression is built once and used in both constraints.
                for i in range(m):
                    margin = y_cv_train[i] * (opt_mod.sum(w[j] * X_cv_train[i, j] for j in range(n)) + b)
                    opt_mod.add_constraint(margin >= 1 - xi[i])
                    opt_mod.add_constraint(margin <= 1 + xi[i] * (1/tau))

                # At most B features may be selected.
                opt_mod.add_constraint(opt_mod.sum(v[j] for j in range(n)) <= B)

                # Link w to v (w[j] is forced to 0 when v[j] = 0, otherwise
                # it lies in [l[j], u[j]]) and make z[j] an upper bound on |w[j]|.
                for j in range(n):
                    opt_mod.add_constraint(w[j] <= v[j] * u[j])
                    opt_mod.add_constraint(w[j] >= l[j] * v[j])
                    opt_mod.add_constraint(w[j] <= z[j])
                    opt_mod.add_constraint(w[j] >= -z[j])

                # Solve the model.
                solution = opt_mod.solve()

                # The recorded time covers scaling, model construction and solving.
                end_time = time.time()
                fold_time = end_time - start_time
                cv_times.append(fold_time)

                if solution:
                    w_opt = np.array([solution.get_value(f'w_{j}') for j in range(n)])
                    b_opt = solution.get_value('b')
                    v_opt = np.array([solution.get_value(f'v_{j}') for j in range(n)])
                    y_cv_pred = np.sign(np.dot(X_cv_val, w_opt) + b_opt)
                    cv_accuracy = accuracy_score(y_cv_val, y_cv_pred)
                    # NOTE(review): AUC is computed from the hard sign
                    # predictions, not from the raw decision values; confirm
                    # this is intended.
                    cv_auc = roc_auc_score(y_cv_val, y_cv_pred)
                    cv_accuracies.append(cv_accuracy)
                    cv_AUC.append(cv_auc)
                else:
                    print("Solution not found for fold.")

            mean_time_per_fold = np.mean(cv_times)
            if cv_AUC:
                mean_cv_accuracy = np.mean(cv_accuracies)
                mean_cv_AUC = np.mean(cv_AUC)
            else:
                # No fold was solved for this C: log NaN explicitly instead of
                # averaging empty lists.
                mean_cv_accuracy = np.nan
                mean_cv_AUC = np.nan

            # Log C and AUC values.
            c_auc_data.append({
                'C': C,
                'AUC': mean_cv_AUC,
                'Dataset': file_name
            })

            # A NaN mean never compares greater, so such a C is never selected.
            if mean_cv_AUC > best_mean_cv_AUC:
                best_C = C
                best_mean_cv_accuracy = mean_cv_accuracy
                best_mean_cv_AUC = mean_cv_AUC
                best_mean_time_per_fold = mean_time_per_fold
                # These come from the last solved fold of this C, not from a
                # refit on the full dataset.
                best_w = w_opt
                best_b = b_opt
                best_v = v_opt

    # Save C-AUC data to CSV for this dataset.
    c_auc_df = pd.DataFrame(c_auc_data)
    csv_filename = f'cleveland_c_auc_log_{noise_type.lower().replace(" + ", "_")}.csv'
    c_auc_df.to_csv(csv_filename, index=False)
    print(f"C-AUC log saved to {csv_filename}")

    if best_C is None:
        # No value of C produced a mean AUC above 0, so there is no model to report.
        print(f"No usable model found for {file_name}; skipping result summary.")
        print('-' * 100)
        continue

    # Report the best configuration for this dataset.
    print(f'Best C: {best_C}')
    print(f'Best tau: {tau}')
    print(f'Best Accuracy on 10-CV: {best_mean_cv_accuracy}')
    print(f'Best AUC on 10-CV:  {best_mean_cv_AUC}')
    print(f'Average time per fold: {best_mean_time_per_fold:.2f} seconds')

    # 1-based indices of features with a non-zero weight.
    # NOTE(review): this is an exact comparison on solver output; consider a
    # tolerance or using best_v if tiny non-zero weights show up.
    selected_features = [j + 1 for j in range(n) if best_w[j] != 0]

    print(f'v: {best_v}')
    print(f'Features selected: {selected_features}')
    print(f'Number of selected features: {len(selected_features)}')
    print('-' * 100)

    result = {
        'Model': 'Pin-FS-SVM',
        'Type of model': noise_type,
        'Accuracy': best_mean_cv_accuracy,
        'AUC': best_mean_cv_AUC,
        'Time': best_mean_time_per_fold,
        'Features selected': ', '.join(map(str, selected_features)),
        'Number of features': len(selected_features),
        'C': best_C,
        'tau': tau
    }
    results.append(result)