In [1]:

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

from src.evalution import evaluatelog_result
from src.logger import Logger

In [2]:
logger = Logger(log_filename="metrics.txt")

UNIQUE_LABELS = [0, 1, 2, 3]
FEATURES = ['project_name', 'project_version', 'label', 'code', 'code_comment',
            'code_no_comment', 'lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r',
            'e']
TRAIN_COLS = ['lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r', 'e']

In [3]:
train = pd.read_csv("../../datasets/data/train.csv")
x_train = train[TRAIN_COLS]
y_train = train["label"]

In [4]:
test = pd.read_csv("../../datasets/data/test.csv")
x_test = test[TRAIN_COLS]
y_test = test["label"]

In [8]:
from sklearn.decomposition import PCA

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

k = 9
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

model_no_pca = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=4,
                             random_state=42)
model_no_pca.fit(X_train_selected, y_train)

y_pred_no_pca = model_no_pca.predict(X_test_selected)
y_prob_no_pca = model_no_pca.predict_proba(X_test_selected) 
eval_results_no_pca = evaluatelog_result(y_test, y_pred_no_pca, "XGBoost_No_PCA", logger, y_prob_no_pca)

n_components = 5
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

model_pca = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=4,
                          random_state=42)
model_pca.fit(X_train_pca, y_train)

y_pred_pca = model_pca.predict(X_test_pca)
y_prob_pca = model_pca.predict_proba(X_test_pca)
eval_results_pca = evaluatelog_result(y_test, y_pred_pca, "XGBoost_PCA", logger, y_prob_pca)

eval_results_no_pca["PCA"] = "No"
eval_results_pca["PCA"] = "Yes"
results_df = pd.DataFrame([eval_results_no_pca, eval_results_pca])
results_df.to_excel("xgboost_pca_comparison_results.xlsx", index=False)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

