In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE, VarianceThreshold
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from xgboost import XGBClassifier

from src.evalution import evaluatelog_result
from src.logger import Logger

In [2]:
logger = Logger(log_filename="metrics.txt")

UNIQUE_LABELS = [0, 1, 2, 3]
FEATURES = ['project_name', 'project_version', 'label', 'code', 'code_comment',
            'code_no_comment', 'lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r',
            'e']
TRAIN_COLS = ['lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r', 'e']

In [3]:
train = pd.read_csv("../../datasets/data/train.csv")
x_train = train[TRAIN_COLS]
y_train = train["label"]

In [4]:
test = pd.read_csv("../../datasets/data/test.csv")
x_test = test[TRAIN_COLS]
y_test = test["label"]

In [5]:
max_features = min(x_train.shape[1], 9)


def correlation_feature_selection(X, y, threshold=0.8):
    corr_matrix = np.corrcoef(X, rowvar=False)
    selected_features = []
    for i in range(len(corr_matrix)):
        if not any(abs(corr_matrix[i][j]) > threshold for j in selected_features):
            selected_features.append(i)
    return np.array(selected_features[:max_features])

In [6]:
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "MaxAbsScaler": MaxAbsScaler(),
    "RobustScaler": RobustScaler()
}

feature_selectors = {
    "SelectKBest_f_classif": SelectKBest(score_func=f_classif, k=max_features),
    "SelectKBest_mutual_info": SelectKBest(score_func=mutual_info_classif, k=max_features),
    "TreeBased_RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "RFE_LogisticRegression": RFE(estimator=LogisticRegression(max_iter=1000, random_state=42),
                                  n_features_to_select=max_features),
    "VarianceThreshold": VarianceThreshold(threshold=0.01),
    "L1Based_Logistic": SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False, max_iter=1000, random_state=42)),
    "CorrelationBased": correlation_feature_selection
}

classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial',
                                             class_weight='balanced', random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, decision_function_shape='ovo', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7, weights='distance'),
    "NaiveBayes": GaussianNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=4,
                             random_state=42),
    "PassiveAggressive": PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, random_state=42),
    "SGDClassifier": SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)
}


In [7]:
results = []

for scaler_name, scaler in scalers.items():
    # Veriyi ölçeklendir
    X_train_scaled = scaler.fit_transform(x_train)
    X_test_scaled = scaler.transform(x_test)

    for selector_name, selector in feature_selectors.items():
        if isinstance(selector, (SelectKBest, RFE, VarianceThreshold, SelectFromModel)):
            X_train_selected = selector.fit_transform(X_train_scaled, y_train)
            X_test_selected = selector.transform(X_test_scaled)
        elif isinstance(selector, RandomForestClassifier):
            selector.fit(X_train_scaled, y_train)
            feature_importances = selector.feature_importances_
            top_features = np.argsort(feature_importances)[-max_features:]
            X_train_selected = X_train_scaled[:, top_features]
            X_test_selected = X_test_scaled[:, top_features]
        elif selector_name == "CorrelationBased":
            selected_features = selector(X_train_scaled, y_train)
            X_train_selected = X_train_scaled[:, selected_features]
            X_test_selected = X_test_scaled[:, selected_features]

        for clf_name, clf in classifiers.items():
            clf.fit(X_train_selected, y_train)

            y_pred = clf.predict(X_test_selected)
            y_prob = clf.predict_proba(X_test_selected) if hasattr(clf, "predict_proba") else None

            eval_result = evaluatelog_result(y_test, y_pred, clf_name, logger, y_prob)
            eval_result["Scaler"] = scaler_name
            eval_result["FeatureSelection"] = selector_name
            eval_result["Classifier"] = clf_name
            results.append(eval_result)

results_df = pd.DataFrame(results)

results_df.to_excel("comparison.xlsx", index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.cap