In [6]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# Location of the kddcup.data file. Set the KDD_DATA_PATH environment variable
# to point at your own copy; when it is unset, the original machine-specific
# location is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get('KDD_DATA_PATH', '/Users/anchanghun/Downloads/kddcup.data')

# header=None: the first row is read as data and columns get positional labels,
# which are replaced by the names below.
df = pd.read_csv(DATA_PATH, header=None)

# Column names, in file order; the last column ("label") is the target.
feature_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

# Rename the columns of the DataFrame.
# Plain assignment raises if the number of names does not match the number of
# columns, which doubles as a sanity check on the file layout.
df.columns = feature_names

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [7]:
# Draw a 10,000-row sample (fixed random_state so the same rows are picked on
# every run). To work on the full dataset, slice `df` instead of `df_split`.
df_split = df.sample(n=10000, random_state=42)  # Sample for demonstration

# Features are every column but the last; the last column is the target.
# .copy() is required because the categorical columns of X are overwritten
# below: assigning into a bare slice of df_split triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous whether df_split is modified.
X = df_split.iloc[:, :-1].copy()
y = df_split.iloc[:, -1]

# Encode categorical features: one LabelEncoder per object-dtype column, kept
# in `encoders` so the integer codes can be mapped back to the original values.
categorical_cols = X.select_dtypes(include=['object']).columns
encoders = {col: LabelEncoder().fit(X[col]) for col in categorical_cols}
for col, encoder in encoders.items():
    X[col] = encoder.transform(X[col])

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split dataset (70% train / 30% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [8]:
def evaluate_solution(solution):
    """Score a binary feature mask by the test accuracy of a linear SVM.

    Parameters
    ----------
    solution : sequence of int
        One entry per feature column; an entry equal to 1 selects the column
        at that position, any other value leaves it out.

    Returns
    -------
    int or float
        0 when no column is selected (there is nothing to fit on); otherwise
        the accuracy on ``X_test``/``y_test`` of an ``SVC(kernel='linear')``
        fitted on the selected columns of ``X_train``/``y_train``.
    """
    chosen = [idx for idx, bit in enumerate(solution) if bit == 1]
    if not chosen:
        return 0

    # Train and score on the same column subset of the module-level split.
    model = SVC(kernel='linear')
    model.fit(X_train.iloc[:, chosen], y_train)
    return accuracy_score(y_test, model.predict(X_test.iloc[:, chosen]))


In [None]:
import random

# PSO Parameters
w = 0.5    # inertia: weight of the previous velocity in the velocity update
c1 = 1.5   # weight of the pull toward the particle's own best position
c2 = 1.5   # weight of the pull toward the global best position
num_particles = 30
num_dimensions = X_train.shape[1]  # one bit per feature column
max_iter = 50

# GA Parameters
mutation_rate = 0.1    # per-bit flip probability used by mutate()
crossover_rate = 0.7   # probability that crossover() recombines its parents

# Seed NumPy's global RNG so the swarm initialisation and every later
# np.random draw are reproducible; 42 matches the random_state used for the
# sampling and the train/test split.
np.random.seed(42)

# Initialize particles: random bit masks and velocities drawn from [0, 1)
particles = np.random.randint(2, size=(num_particles, num_dimensions))
velocities = np.random.rand(num_particles, num_dimensions)
pbest_positions = particles.copy()
# dtype=float: evaluate_solution returns the int 0 for an empty mask; without
# the explicit dtype an all-zero start would create an integer array and later
# float accuracies would be truncated on assignment.
pbest_scores = np.array([evaluate_solution(p) for p in particles], dtype=float)
# .copy(): indexing a single row returns a view, so without it gbest_position
# would silently change whenever that row of pbest_positions is overwritten.
gbest_position = pbest_positions[np.argmax(pbest_scores)].copy()
gbest_score = np.max(pbest_scores)

def crossover(parent1, parent2, rate=None):
    """Single-point crossover of two equal-length bit vectors.

    With probability ``rate`` a cut point is drawn uniformly from
    ``1 .. len(parent1) - 1`` and the parents' tails are swapped; otherwise
    the offspring are unchanged copies of the parents.

    Parameters
    ----------
    parent1, parent2 : array-like
        Parent vectors. They are expected to have the same length; the cut
        point is derived from ``parent1``.
    rate : float, optional
        Crossover probability. Defaults to the module-level ``crossover_rate``.

    Returns
    -------
    tuple of numpy.ndarray
        Two freshly allocated arrays. They never share memory with the
        parents, so mutating a child in place cannot alter a parent.
    """
    if rate is None:
        rate = crossover_rate

    first = np.asarray(parent1)
    second = np.asarray(parent2)
    n_genes = len(first)

    # Draw first so the RNG stream is consumed the same way on every call.
    recombine = np.random.rand() < rate

    # A cut needs at least one gene on each side, hence n_genes >= 2.
    if recombine and n_genes >= 2:
        # np.random.randint excludes the upper bound, so this yields
        # 1 .. n_genes - 1, i.e. every valid cut position.
        point = np.random.randint(1, n_genes)
        child1 = np.concatenate([first[:point], second[point:]])
        child2 = np.concatenate([second[:point], first[point:]])
        return child1, child2

    return first.copy(), second.copy()

def mutate(particle, rate=None):
    """Flip each bit of ``particle`` independently with probability ``rate``.

    Parameters
    ----------
    particle : mutable sequence of 0/1 values
        The bit vector to mutate. It is modified in place.
    rate : float, optional
        Per-bit flip probability. Defaults to the module-level
        ``mutation_rate``.

    Returns
    -------
    The same ``particle`` object, after mutation.
    """
    if rate is None:
        rate = mutation_rate

    # One uniform draw per gene, sized from the vector itself so the function
    # works for any length instead of relying on a global dimension count.
    flip_mask = np.random.rand(len(particle)) < rate
    for i in np.flatnonzero(flip_mask):
        particle[i] = 1 - particle[i]
    return particle

# Main loop: each iteration runs one PSO sweep over the swarm, refreshes the
# global best, then replaces the population with GA offspring.
for t in range(max_iter):
    # PSO update
    for i in range(num_particles):
        r1 = np.random.rand(num_dimensions)
        r2 = np.random.rand(num_dimensions)
        # Inertia term + pull toward the personal best + pull toward the global best
        velocities[i] = (w * velocities[i] +
                         c1 * r1 * (pbest_positions[i] - particles[i]) +
                         c2 * r2 * (gbest_position - particles[i]))

        # Update particles with sigmoid function for binary representation:
        # each bit is set to 1 with probability sigmoid(velocity), else 0.
        particles[i] = np.where(np.random.rand(num_dimensions) < 1 / (1 + np.exp(-velocities[i])), 1, 0)

        score = evaluate_solution(particles[i])
        if score > pbest_scores[i]:
            pbest_positions[i] = particles[i]
            pbest_scores[i] = score

    # Update global best
    best_particle_index = np.argmax(pbest_scores)
    if pbest_scores[best_particle_index] > gbest_score:
        gbest_score = pbest_scores[best_particle_index]
        # .copy(): a bare row index is a view, which would keep tracking later
        # writes to that row of pbest_positions.
        gbest_position = pbest_positions[best_particle_index].copy()

    # GA operations
    # NOTE(review): the offspring stored in `particles` below are only read as
    # the "current position" term of the next velocity update; they are
    # resampled from the sigmoid before evaluate_solution ever scores them,
    # and velocities/pbest stay attached by index, not by lineage. Confirm
    # this is the intended hybridisation.
    new_particles = []
    # Round the number of pairs up and trim afterwards, so an odd
    # num_particles does not shrink the population (which would make the next
    # PSO sweep index past the end of `particles`).
    for _ in range((num_particles + 1) // 2):
        # Fancy indexing returns a copy, so the parents drawn here are
        # independent of `particles` (the same row may be drawn twice).
        parent1, parent2 = particles[np.random.randint(0, num_particles, 2)]
        child1, child2 = crossover(parent1, parent2)
        new_particles.append(mutate(child1))
        new_particles.append(mutate(child2))

    particles = np.array(new_particles[:num_particles])

    print(f'Iteration {t+1}/{max_iter}, Best Score: {gbest_score}')

print(f'Global Best Position: {gbest_position}')
print(f'Global Best Score: {gbest_score}')


시그모이드 함수는 입자의 연속적인 속도 값을 각 특징의 선택 여부를 결정하는 확률로 변환하는 데 사용됩니다.

속도는 입자의 각 비트가 현재 상태를 바꿀(0에서 1로 또는 1에서 0으로) 가능성을 좌우합니다. 속도가 클수록 해당 비트가 1(특징 선택)이 될 확률이 높아지고, 작을수록 0(특징 미선택)이 될 확률이 높아집니다.

시그모이드 함수는 BPSO(이진 입자 군집 최적화)에서 연속 속도 값을 확률로 변환하는 데 사용되며, 이 확률이 입자의 각 차원의 이진 상태(0 또는 1)를 결정합니다. 이를 통해 PSO가 이진 탐색 공간에서 동작할 수 있으므로, 각 특징을 선택하거나 선택하지 않는 특징 선택 작업에 적합합니다.