In [26]:
import pandas as pd
from src.evalution import evaluate_result

In [27]:
# Label values listed for the dataset (four classes: 0 through 3).
UNIQUE_LABELS = list(range(4))

# Numeric metric columns that are fed to the model as inputs.
TRAIN_COLS = ["lc", "pi", "ma", "nbd", "ml", "d", "mi", "fo", "r", "e"]

# Full column list: identifying/metadata columns followed by the model inputs.
FEATURES = [
    "project_name",
    "project_version",
    "label",
    "code",
    "code_comment",
    "code_no_comment",
    *TRAIN_COLS,
]

In [28]:
# Read the training split, then separate the model inputs from the target column.
train = pd.read_csv("../../datasets/data/train.csv")
x_train, y_train = train[TRAIN_COLS], train["label"]

In [29]:
# Read the hold-out split, then separate the model inputs from the target column.
test = pd.read_csv("../../datasets/data/test.csv")
x_test, y_test = test[TRAIN_COLS], test["label"]

In [30]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from mrmr import mrmr_classif
from xgboost import XGBClassifier

# Parameters
k = 5         # number of features requested from mRMR in each fold (passed as K)
degree = 3    # maximum degree of the generated interaction terms
n_splits = 5  # number of stratified folds

results = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Each iteration fits the whole pipeline (robust scaling -> mRMR feature
# selection -> interaction features -> XGBoost) on one training fold and scores
# it on the fixed hold-out set (x_test / y_test). The validation indices that
# StratifiedKFold yields for the fold are not used.
# NOTE(review): if per-fold validation scores were intended instead, evaluate on
# x_train.iloc[_val_idx] / y_train.iloc[_val_idx] -- confirm with the author.
for fold, (train_idx, _val_idx) in enumerate(skf.split(x_train, y_train), 1):
    print(f"Fold {fold}/{n_splits} işleniyor...")

    X_train_fold = x_train.iloc[train_idx]
    # Positional 0..n-1 index so the target lines up row-by-row with the
    # re-indexed scaled frame built below.
    y_train_fold = y_train.iloc[train_idx].reset_index(drop=True)

    # Fit the scaler on the training fold only, then apply it to the hold-out set.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(x_test)

    # Wrap the scaled values back into DataFrames so columns can be selected by name.
    X_train_scaled_df = pd.DataFrame(
        X_train_scaled, columns=x_train.columns, index=X_train_fold.index
    ).reset_index(drop=True)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=x_train.columns, index=x_test.index)

    # Feature selection uses the training fold only; the chosen column names are
    # then applied unchanged to the hold-out frame.
    selected_features = mrmr_classif(X_train_scaled_df, y_train_fold, K=k)

    X_train_selected = X_train_scaled_df[selected_features]
    X_test_selected = X_test_scaled_df[selected_features]

    # interaction_only=True: products of distinct features only, no pure powers.
    poly = PolynomialFeatures(degree=degree, interaction_only=True)
    X_train_poly = poly.fit_transform(X_train_selected)
    X_test_poly = poly.transform(X_test_selected)

    model_poly = XGBClassifier(objective="multi:softprob",
                               num_class=len(UNIQUE_LABELS),  # 4 classes, kept in sync with the label list
                               eval_metric="mlogloss",
                               n_estimators=200,
                               tree_method="hist",
                               random_state=42)
    model_poly.fit(X_train_poly, y_train_fold)

    y_pred_poly = model_poly.predict(X_test_poly)
    y_prob_poly = model_poly.predict_proba(X_test_poly)

    # evaluate_result is a project helper; presumably it returns a dict of
    # metrics (it must support item assignment) -- verify against src.evalution.
    eval_results_poly = evaluate_result(y_test, y_pred_poly, y_prob_poly)
    eval_results_poly["Fold"] = fold
    results.append(eval_results_poly)

# One row per fold, written next to the notebook.
results_df = pd.DataFrame(results)
results_df.to_excel("mrmr.xlsx", index=False)

Fold 1/5 işleniyor...


100%|██████████| 5/5 [00:00<00:00, 12.81it/s]


Fold 2/5 işleniyor...


100%|██████████| 5/5 [00:00<00:00,  5.56it/s]


Fold 3/5 işleniyor...


100%|██████████| 5/5 [00:00<00:00,  6.86it/s]


Fold 4/5 işleniyor...


100%|██████████| 5/5 [00:00<00:00,  7.56it/s]


Fold 5/5 işleniyor...


100%|██████████| 5/5 [00:00<00:00,  7.48it/s]
