In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

from src.evalution import evaluate_result

In [2]:
UNIQUE_LABELS = [0, 1, 2, 3]
FEATURES = ['project_name', 'project_version', 'label', 'code', 'code_comment',
            'code_no_comment', 'lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r',
            'e']
TRAIN_COLS = ['lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r', 'e']

In [3]:
train = pd.read_csv("../../datasets/data/train.csv")
x_train = train[TRAIN_COLS]
y_train = train["label"]

In [4]:
test = pd.read_csv("../../datasets/data/test.csv")
x_test = test[TRAIN_COLS]
y_test = test["label"]

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures

# Parametreler
k = 9
degree = 3
n_splits = 5

results = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skf.split(x_train, y_train), 1):
    print(f"Fold {fold}/{n_splits} işleniyor...")

    X_train_fold, X_test_fold = x_train.iloc[train_idx], x_train.iloc[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(x_test)

    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train_fold)
    X_test_selected = selector.transform(X_test_scaled)

    poly = PolynomialFeatures(degree=degree, interaction_only=True)
    X_train_poly = poly.fit_transform(X_train_selected)
    X_test_poly = poly.transform(X_test_selected)

    model_poly = XGBClassifier(objective="multi:softprob",
                               num_class=4,
                               eval_metric="mlogloss",
                               num_parallel_tree=None,
                               n_estimators=200,
                               tree_method="hist",
                               random_state=42, )
    model_poly.fit(X_train_poly, y_train_fold)

    y_pred_poly = model_poly.predict(X_test_poly)
    y_prob_poly = model_poly.predict_proba(X_test_poly)

    eval_results_poly = evaluate_result(y_test, y_pred_poly, y_prob_poly)
    print(eval_results_poly)
    eval_results_poly["Fold"] = fold
    results.append(eval_results_poly)

results_df = pd.DataFrame(results)
results_df.to_excel("xgboost_results.xlsx", index=False)

Fold 1/5 işleniyor...
{'eval_f1': 0.552554579400366, 'eval_f1_class1': 0.30303030303030304, 'eval_f1_class2': 0.74235807860262, 'eval_f1_class3': 0.2857142857142857, 'eval_f1_class4': 0.27586206896551724, 'eval_acc': 0.5976095617529881, 'eval_precision': 0.5541157645997129, 'eval_recall': 0.5976095617529881, 'eval_ROC-UAC': 0.6424106054093096, 'eval_mcc': 0.22081759544303117, 'eval_cohen_kappa_score': 0.20332484837057296, 'eval_gmean': 0.5909605026939689}
Fold 2/5 işleniyor...
{'eval_f1': 0.5372404942137208, 'eval_f1_class1': 0.3283582089552239, 'eval_f1_class2': 0.7244318181818182, 'eval_f1_class3': 0.2153846153846154, 'eval_f1_class4': 0.27380952380952384, 'eval_acc': 0.5896414342629482, 'eval_precision': 0.5492488244491708, 'eval_recall': 0.5896414342629482, 'eval_ROC-UAC': 0.6501214592627739, 'eval_mcc': 0.18415145485270376, 'eval_cohen_kappa_score': 0.16260840695424028, 'eval_gmean': 0.5648481605756623}
Fold 3/5 işleniyor...
{'eval_f1': 0.5559040941125115, 'eval_f1_class1': 0.2857