In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier
from deap import base, creator, tools, algorithms

In [2]:
# Load dataset (Ganti 'dataset.csv' dengan nama file yang sesuai)
df = pd.read_csv("Lung Cancer Dataset.csv")

df['PULMONARY_DISEASE'] = df['PULMONARY_DISEASE'].map({'YES': 1, 'NO': 0})  # Encoding target variable

# Pisahkan fitur dan target
X = df.drop(columns=['PULMONARY_DISEASE'])
y = df['PULMONARY_DISEASE']

# Konversi fitur kategori ke numerik (jika ada)
X = pd.get_dummies(X)

In [3]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:

# === Genetic Algorithm for Feature Selection ===
POPULATION_SIZE = 20  # Ukuran populasi
N_GENERATIONS = 50  # Jumlah iterasi generasi
MUTATION_RATE = 0.2  # Peluang mutasi

In [5]:
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

In [6]:

# GA for Random Forest
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


In [7]:

def evaluate_rf(individual):
    selected_features = [col for i, col in enumerate(X.columns) if individual[i] == 1]
    if len(selected_features) == 0:
        return 0,
    X_selected = X[selected_features]
    X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    scores = accuracy_score(y_test_sel, model.fit(X_train_sel, y_train_sel).predict(X_test_sel))
    return scores,

toolbox.register("evaluate", evaluate_rf)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=MUTATION_RATE)
toolbox.register("select", tools.selTournament, tournsize=3)


In [8]:

population = toolbox.population(n=POPULATION_SIZE)
hof = tools.HallOfFame(1)
algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=N_GENERATIONS, stats=None, halloffame=hof, verbose=True)
best_features_rf = [col for i, col in enumerate(X.columns) if hof[0][i] == 1]


gen	nevals
0  	20    
1  	10    
2  	16    
3  	10    
4  	9     
5  	13    
6  	13    
7  	11    
8  	15    
9  	9     
10 	8     
11 	12    
12 	14    
13 	15    
14 	14    
15 	9     
16 	10    
17 	5     
18 	16    
19 	9     
20 	14    
21 	14    
22 	8     
23 	13    
24 	14    
25 	13    
26 	11    
27 	13    
28 	15    
29 	15    
30 	5     
31 	9     
32 	13    
33 	13    
34 	12    
35 	12    
36 	13    
37 	11    
38 	10    
39 	12    
40 	10    
41 	16    
42 	8     
43 	6     
44 	12    
45 	14    
46 	8     
47 	18    
48 	6     
49 	13    
50 	11    


In [9]:

# GA for XGBoost
def evaluate_xgb(individual):
    selected_features = [col for i, col in enumerate(X.columns) if individual[i] == 1]
    if len(selected_features) == 0:
        return 0,
    X_selected = X[selected_features]
    X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = XGBClassifier(n_estimators=100, random_state=42,  eval_metric="logloss")
    scores = accuracy_score(y_test_sel, model.fit(X_train_sel, y_train_sel).predict(X_test_sel))
    return scores,


In [11]:

toolbox.register("evaluate", evaluate_xgb)
population = toolbox.population(n=POPULATION_SIZE)
hof = tools.HallOfFame(1)
algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=N_GENERATIONS, stats=None, halloffame=hof, verbose=True)
best_features_xgb = [col for i, col in enumerate(X.columns) if hof[0][i] == 1]


gen	nevals
0  	20    
1  	14    
2  	11    
3  	8     
4  	12    
5  	14    
6  	12    
7  	14    
8  	10    
9  	9     
10 	9     
11 	10    
12 	13    
13 	10    
14 	14    
15 	16    
16 	11    
17 	15    
18 	10    
19 	9     
20 	9     
21 	14    
22 	10    
23 	13    
24 	9     
25 	8     
26 	11    
27 	13    
28 	13    
29 	12    
30 	12    
31 	10    
32 	11    
33 	18    
34 	15    
35 	15    
36 	12    
37 	12    
38 	10    
39 	14    
40 	12    
41 	14    
42 	6     
43 	12    
44 	6     
45 	9     
46 	13    
47 	14    
48 	16    
49 	15    
50 	18    


In [12]:
feature_comparison = pd.DataFrame({
    'Feature': X.columns,
    'RF': ['✓' if f in best_features_rf else '✗' for f in X.columns],
    'XGBoost': ['✓' if f in best_features_xgb else '✗' for f in X.columns]
})

print("\nFitur Terbaik yang Dipilih oleh Genetic Algorithm untuk Random Forest:")
print(best_features_rf)
print("\nFitur Terbaik yang Dipilih oleh Genetic Algorithm untuk XGBoost:")
print(best_features_xgb)
print("\nPerbandingan fitur yang dipilih:")
print(feature_comparison)



Fitur Terbaik yang Dipilih oleh Genetic Algorithm untuk Random Forest:
['AGE', 'SMOKING', 'EXPOSURE_TO_POLLUTION', 'ENERGY_LEVEL', 'BREATHING_ISSUE', 'ALCOHOL_CONSUMPTION', 'THROAT_DISCOMFORT', 'FAMILY_HISTORY', 'SMOKING_FAMILY_HISTORY', 'STRESS_IMMUNE']

Fitur Terbaik yang Dipilih oleh Genetic Algorithm untuk XGBoost:
['GENDER', 'SMOKING', 'EXPOSURE_TO_POLLUTION', 'ENERGY_LEVEL', 'BREATHING_ISSUE', 'ALCOHOL_CONSUMPTION', 'THROAT_DISCOMFORT', 'FAMILY_HISTORY', 'SMOKING_FAMILY_HISTORY', 'STRESS_IMMUNE']

Perbandingan fitur yang dipilih:
                   Feature RF XGBoost
0                      AGE  ✓       ✗
1                   GENDER  ✗       ✓
2                  SMOKING  ✓       ✓
3     FINGER_DISCOLORATION  ✗       ✗
4            MENTAL_STRESS  ✗       ✗
5    EXPOSURE_TO_POLLUTION  ✓       ✓
6        LONG_TERM_ILLNESS  ✗       ✗
7             ENERGY_LEVEL  ✓       ✓
8          IMMUNE_WEAKNESS  ✗       ✗
9          BREATHING_ISSUE  ✓       ✓
10     ALCOHOL_CONSUMPTION  ✓       ✓
1

In [13]:

# Evaluasi Model
def evaluate_model(model, X_train, X_test, y_train, y_test, desc):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    return [desc, acc, precision, recall, f1, auc]


In [18]:

results=[]

# Evaluasi Random Forest
results.append(evaluate_model(RandomForestClassifier(n_estimators=100, random_state=42), X_train, X_test, y_train, y_test, "Random Forest Sebelum GA"))
results.append(evaluate_model(RandomForestClassifier(n_estimators=100, random_state=42), X_train[best_features_rf], X_test[best_features_rf], y_train, y_test, "Random Forest Setelah GA"))

# Evaluasi XGBoost
results.append(evaluate_model(XGBClassifier(n_estimators=100, random_state=42,  eval_metric="logloss"), X_train, X_test, y_train, y_test, "XGBoost Sebelum GA"))
results.append(evaluate_model(XGBClassifier(n_estimators=100, random_state=42,  eval_metric="logloss"), X_train[best_features_xgb], X_test[best_features_xgb], y_train, y_test, "XGBoost Setelah GA"))

# Menampilkan hasil evaluasi dalam bentuk tabel
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC-ROC"])
print(df_results)


                      Model  Accuracy  Precision    Recall  F1-score   AUC-ROC
0  Random Forest Sebelum GA     0.913   0.903302  0.892774  0.898007  0.910485
1  Random Forest Setelah GA     0.922   0.916865  0.899767  0.908235  0.919235
2        XGBoost Sebelum GA     0.903   0.880734  0.895105  0.887861  0.902018
3        XGBoost Setelah GA     0.911   0.889908  0.904429  0.897110  0.910183
