In [1]:
import pandas as pd
from imblearn.combine import SMOTETomek  # Hibrit yöntem
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

from src.evalution import evaluatelog_result
from src.logger import Logger

In [2]:
logger = Logger(log_filename="metrics.txt")

UNIQUE_LABELS = [0, 1, 2, 3]
FEATURES = ['project_name', 'project_version', 'label', 'code', 'code_comment',
            'code_no_comment', 'lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r',
            'e']
TRAIN_COLS = ['lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r', 'e']

In [3]:
train = pd.read_csv("../../datasets/data/train.csv")
x_train = train[TRAIN_COLS]
y_train = train["label"]

In [4]:
test = pd.read_csv("../../datasets/data/test.csv")
x_test = test[TRAIN_COLS]
y_test = test["label"]

In [5]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

k = 9
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Sampling Yöntemlerini Tanımla
sampling_methods = {
    "No Sampling": None,
    "Random Undersampling": RandomUnderSampler(random_state=42),
    "Random Oversampling": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "Hybrid (SMOTE + Undersampling)": SMOTETomek(random_state=42)  # Hibrit yöntem
}

results = []

for method_name, sampler in sampling_methods.items():
    # Sampling işlemi uygulanacaksa
    if sampler:
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_selected, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train_selected, y_train  # Orijinal veri

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob',
                          num_class=4, random_state=42)
    model.fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_selected)
    y_prob = model.predict_proba(X_test_selected)
    eval_results = evaluatelog_result(y_test, y_pred, f"XGBoost_{method_name}", logger, y_prob)
    eval_results["Sampling Method"] = method_name

    results.append(eval_results)

results_df = pd.DataFrame(results)
results_df.to_excel("xgboost_sampling_comparison_results.xlsx", index=False)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

