In [1]:
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, cohen_kappa_score



from utils.nilc_nlp import *
from utils import dataset_setup
from utils.save_results import *
from utils.read_results import *

In [2]:
def filter_elements_by_text(dataset_A, dataset_B, text_column='essay_text'):
    """
    Filtra os elementos do dataset A que possuem o mesmo `essay_text` presente no dataset B.
    
    Parameters:
        dataset_A (pd.DataFrame): Dataset maior com a coluna `essay_text`.
        dataset_B (pd.DataFrame): Dataset menor com a coluna `essay_text`.
        text_column (str): Nome da coluna que contém os textos nos dois datasets.
    
    Returns:
        pd.DataFrame: Subconjunto do dataset A com os textos encontrados no dataset B.
    """
    # Converte os textos do dataset B para um conjunto para busca rápida
    texts_in_B = set(dataset_B[text_column])
    
    # Filtra o dataset A onde o texto está presente no conjunto de textos do B
    filtered_A = dataset_A[dataset_A[text_column].isin(texts_in_B)]
    
    return filtered_A

In [36]:
def decode_predictions(predictions):
    """
    Decode normalized predictions (0-5) to the original format (0, 40, 80, 120, 160, 200).
    
    Parameters:
        predictions (list or numpy array): A list or array of normalized predictions (values 0-5).
    
    Returns:
        numpy array: Decoded predictions in the original format (values 0, 40, 80, 120, 160, 200).
    """
    reverse_grade_mapping = {0: 0, 1: 40, 2: 80, 3: 120, 4: 160, 5: 200}
    return np.vectorize(reverse_grade_mapping.get)(predictions)

In [20]:
# Sample Data Preprocessing
def process_y(y):
    # Remove brackets, split by space, and take the first 5 values
    y_processed = [list(map(int, label.strip("[]").replace(",", " ").split()[:5])) for label in y]
    return np.array(y_processed)

# Encoding class labels to integers
def encode_classes(y):
    encoders = [LabelEncoder() for _ in range(y.shape[1])]
    y_encoded = np.array([encoders[i].fit_transform(y[:, i]) for i in range(y.shape[1])]).T
    return y_encoded, encoders

In [21]:
# Map grades to normalized values
grade_mapping = {0: 0, 40: 1, 80: 2, 120: 3, 160: 4, 200: 5}

# Normalize y labels
def normalize_y(y):
    return np.vectorize(grade_mapping.get)(y)

In [22]:
path_to = './data/'

In [23]:
"""
Setup Dataset
"""
dataset_name, dataset_code = dataset_setup.setup_dataset(3)
essays_dataset = dataset_setup.getDataset(path_to, dataset_name)

Dataset escolhido: propor2024
Propor2024 train size = 744
Propor2024 train and validation size = 960
Propor2024 total size = 1155


In [24]:
model_name = "nilc_metrix"
experiment_name = "exp0"

In [25]:
"""
Read Nilc-Metrix dataset 
"""

dataset_name = "propor2024"
path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "0")
dataset_propor = read_csv(path_to_save, filename_to_save)

dataset_name = "essaysFullGrade"
path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "0")
dataset_full_grade = read_csv(path_to_save, filename_to_save)

Folder './data/results/nilc_metrix' already exists.
Folder './data/results/nilc_metrix/exp0' already exists.
Folder './data/results/nilc_metrix/exp0/propor2024' already exists.
CSV with the answer will be saved in: propor2024-nilc_metrix-exp0-0
Folder './data/results/nilc_metrix' already exists.
Folder './data/results/nilc_metrix/exp0' already exists.
Folder './data/results/nilc_metrix/exp0/essaysFullGrade' already exists.
CSV with the answer will be saved in: essaysFullGrade-nilc_metrix-exp0-0


In [26]:
"""
Read Train, Test and Validation file
"""
dataset_base_path = "./data/Datasets/"

path_to_propor = dataset_base_path + "propor2024/"
train_dataset_propor = read_csv(path_to_propor, "train")
validation_dataset_propor = read_csv(path_to_propor, "validation")
test_dataset_propor = read_csv(path_to_propor, "test")

path_to_dataset_full_grade = dataset_base_path + "fullGradeEnemEssays2024/"
train_dataset_full_grade = read_csv(path_to_dataset_full_grade, "train")
validation_dataset_full_grade = read_csv(path_to_dataset_full_grade, "train")
test_dataset_full_grade = read_csv(path_to_dataset_full_grade, "test")

In [27]:
test_propor = filter_elements_by_text(dataset_propor, test_dataset_propor)
test_full_grade = filter_elements_by_text(dataset_full_grade, test_dataset_full_grade)
test_dataset = pd.concat([test_propor, test_full_grade], ignore_index=True)

validation_propor = filter_elements_by_text(dataset_propor, validation_dataset_propor)
validation_full_grade = filter_elements_by_text(dataset_full_grade, validation_dataset_full_grade)
train_propor = filter_elements_by_text(dataset_propor, train_dataset_propor)
train_full_grade = filter_elements_by_text(dataset_full_grade, train_dataset_full_grade)
train_dataset = pd.concat([validation_propor, validation_full_grade, train_propor, train_full_grade], ignore_index=True)

In [33]:
test_dataset = test_dataset.sample(frac=1, random_state=0).reset_index(drop=True)
train_dataset = train_dataset.sample(frac=1, random_state=0).reset_index(drop=True)

columns_to_drop = ['author', 'source']
X_train = train_dataset.iloc[:, 6:].drop(columns=columns_to_drop)
y_train = normalize_y(process_y(train_dataset["grades"]))
X_test = test_dataset.iloc[:, 6:].drop(columns=columns_to_drop)
y_test = normalize_y(process_y(test_dataset["grades"]))

In [34]:
X_train

Unnamed: 0,adjective_ratio,adverbs,content_words,flesch,function_words,sentences_per_paragraph,syllables_per_content_word,words_per_sentence,noun_ratio,paragraphs,...,lsa_all_std,lsa_paragraph_mean,lsa_paragraph_std,lsa_givenness_mean,lsa_givenness_std,lsa_span_mean,lsa_span_std,negative_words,positive_words,ratio_function_to_content_words
0,0.09677,0.03763,0.54839,26.86732,0.45161,1.25000,2.87255,37.20000,0.25269,4,...,0.00000,0.94768,0.00200,0.93859,0.03030,0.94074,0.03158,0.30392,0.51961,0.82353
1,0.07722,0.03475,0.56371,26.72788,0.43629,1.75000,2.90411,37.00000,0.30116,4,...,0.00475,0.94982,0.00888,0.92336,0.02623,0.92598,0.02786,0.39041,0.64384,0.77397
2,0.11422,0.03233,0.57974,16.10474,0.42026,4.00000,3.21933,29.00000,0.31034,4,...,0.02973,0.96210,0.01033,0.92901,0.02870,0.94098,0.03241,0.27881,0.40892,0.72491
3,0.06604,0.06604,0.59434,25.01083,0.40566,2.50000,2.96296,31.80000,0.31761,4,...,0.03573,0.94256,0.01270,0.92729,0.02018,0.93454,0.01985,0.38095,0.57672,0.68254
4,0.06949,0.03625,0.54381,11.44846,0.45619,1.75000,3.00556,47.28571,0.28399,4,...,0.02005,0.94850,0.01435,0.93875,0.02211,0.94432,0.02450,0.37778,0.63333,0.83889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,0.08782,0.02266,0.54958,25.60644,0.45042,4.00000,3.22165,22.06250,0.29745,4,...,0.04475,0.96526,0.00474,0.91248,0.03783,0.92664,0.03326,0.38660,0.53093,0.81959
1193,0.09314,0.04902,0.60784,21.02559,0.39216,2.33333,2.96774,29.14286,0.24020,3,...,0.07313,0.95407,0.01101,0.91885,0.06844,0.92214,0.06990,0.21774,0.58065,0.64516
1194,0.07527,0.04301,0.60215,10.67677,0.39785,1.40000,3.02976,39.85714,0.27957,5,...,0.00690,0.92241,0.00967,0.91082,0.04010,0.92014,0.04215,0.20833,0.44048,0.66071
1195,0.09677,0.03763,0.54839,26.86732,0.45161,1.25000,2.87255,37.20000,0.25269,4,...,0.00000,0.94768,0.00200,0.93859,0.03030,0.94074,0.03158,0.30392,0.51961,0.82353


In [35]:
# Create and train a separate model for each competency
models = {}
predictions = {}
accuracies = []

best_params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 50,
    'subsample': 0.6
}

for i in range(5):  # c1, c2, ..., c5
    print(f"Training model for c{i+1}...")
    
    # Extract labels for the current competency
    y_train_current = y_train[:, i]
    print(y_train_current)
    y_test_current = y_test[:, i]
    
    # Initialize XGBoost model
    model = xgb.XGBClassifier(
        objective='multi:softmax',  # Multiclass classification
        num_class=6,               # 6 classes: {0, 40, 80, 120, 160, 200}
        eval_metric='mlogloss',    # Multiclass log loss
        **best_params
    )
    
    # Train the model
    model.fit(X_train, y_train_current)
    
    # Store the model
    models[f"c{i+1}"] = model
    
    # Predict on the test set
    predictions[f"c{i+1}"] = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test_current, predictions[f"c{i+1}"])
    accuracies.append(accuracy)
    print(f"Accuracy for c{i+1}: {accuracy:.2f}")
    qwk = cohen_kappa_score(y_test_current, predictions[f"c{i+1}"],  weights="quadratic")
    print(f"QWK for c{i+1}: {qwk:.2f}")

# Combine all predictions
final_predictions = np.column_stack([predictions[f"c{i+1}"] for i in range(5)])
print("Final Predictions:\n", final_predictions)

Training model for c1...
[4 3 5 ... 4 3 4]
Accuracy for c1: 0.44
QWK for c1: 0.38
Training model for c2...
[3 3 5 ... 4 5 4]
Accuracy for c2: 0.42
QWK for c2: 0.48
Training model for c3...
[3 2 5 ... 4 3 5]
Accuracy for c3: 0.37
QWK for c3: 0.55
Training model for c4...
[3 2 5 ... 3 3 4]
Accuracy for c4: 0.51
QWK for c4: 0.45
Training model for c5...
[1 0 5 ... 2 2 1]
Accuracy for c5: 0.30
QWK for c5: 0.42
Final Predictions:
 [[3 3 4 4 3]
 [3 3 3 3 1]
 [3 3 1 3 0]
 ...
 [3 3 3 3 2]
 [3 3 2 3 1]
 [3 3 3 3 3]]


In [40]:
decodedPredictions = decode_predictions(final_predictions)

In [41]:
decodedPredictions

array([[120, 120, 160, 160, 120],
       [120, 120, 120, 120,  40],
       [120, 120,  40, 120,   0],
       ...,
       [120, 120, 120, 120,  80],
       [120, 120,  80, 120,  40],
       [120, 120, 120, 120, 120]])

In [50]:
model_name = "gxboost"
dataset_name = "extended2024"
experiment_name = "exp1"

In [51]:
final_predictions = []
for decodedPrediction in decodedPredictions:
    final_predictions.append(str(decodedPrediction))


test_dataset[model_name + "_grades"] = final_predictions

In [52]:
path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "1")

Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp1' created.
Folder './data/results/gxboost/exp1/extended2024' created.
CSV with the answer will be saved in: extended2024-gxboost-exp1-1


In [53]:
test_dataset.to_csv(path_to_save + "/" + filename_to_save + ".csv", index=False)