In [181]:
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, cohen_kappa_score



from utils.nilc_nlp import *
from utils import dataset_setup
from utils.save_results import *
from utils.read_results import *

In [182]:
def filter_elements_by_text(dataset_A, dataset_B, text_column='essay_text'):
    """
    Filtra os elementos do dataset A que possuem o mesmo `essay_text` presente no dataset B.
    
    Parameters:
        dataset_A (pd.DataFrame): Dataset maior com a coluna `essay_text`.
        dataset_B (pd.DataFrame): Dataset menor com a coluna `essay_text`.
        text_column (str): Nome da coluna que contém os textos nos dois datasets.
    
    Returns:
        pd.DataFrame: Subconjunto do dataset A com os textos encontrados no dataset B.
    """
    # Converte os textos do dataset B para um conjunto para busca rápida
    texts_in_B = set(dataset_B[text_column])
    
    # Filtra o dataset A onde o texto está presente no conjunto de textos do B
    filtered_A = dataset_A[dataset_A[text_column].isin(texts_in_B)]
    
    return filtered_A

In [183]:
def decode_predictions(predictions):
    """
    Decode normalized predictions (0-5) to the original format (0, 40, 80, 120, 160, 200).
    
    Parameters:
        predictions (list or numpy array): A list or array of normalized predictions (values 0-5).
    
    Returns:
        numpy array: Decoded predictions in the original format (values 0, 40, 80, 120, 160, 200).
    """
    reverse_grade_mapping = {0: 0, 1: 40, 2: 80, 3: 120, 4: 160, 5: 200}
    return np.vectorize(reverse_grade_mapping.get)(predictions)

In [184]:
# Sample Data Preprocessing
def process_y(y):
    # Remove brackets, split by space, and take the first 5 values
    y_processed = [list(map(int, label.strip("[]").replace(",", " ").split()[:5])) for label in y]
    return np.array(y_processed)

# Encoding class labels to integers
def encode_classes(y):
    encoders = [LabelEncoder() for _ in range(y.shape[1])]
    y_encoded = np.array([encoders[i].fit_transform(y[:, i]) for i in range(y.shape[1])]).T
    return y_encoded, encoders

In [185]:
# Map grades to normalized values
grade_mapping = {0: 0, 40: 1, 80: 2, 120: 3, 160: 4, 200: 5}

# Normalize y labels
def normalize_y(y):
    return np.vectorize(grade_mapping.get)(y)

In [186]:
path_to = './data/'

In [187]:
"""
Setup Dataset
"""
dataset_name, dataset_code = dataset_setup.setup_dataset(3)
essays_dataset = dataset_setup.getDataset(path_to, dataset_name)

Dataset escolhido: propor2024
Propor2024 train size = 744
Propor2024 train and validation size = 960
Propor2024 total size = 1155


In [188]:
model_name = "nilc_metrix"
experiment_name = "exp0"

In [190]:
"""
Read Nilc-Metrix dataset 
"""

dataset_name = "propor2024"
path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "0")
dataset_propor = read_csv(path_to_save, filename_to_save)

dataset_name = "essaysFullGrade"
path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "0")
dataset_full_grade = read_csv(path_to_save, filename_to_save)

Folder './data/results/nilc_metrix' already exists.
Folder './data/results/nilc_metrix/exp0' already exists.
Folder './data/results/nilc_metrix/exp0/propor2024' already exists.
CSV with the answer will be saved in: propor2024-nilc_metrix-exp0-0
Folder './data/results/nilc_metrix' already exists.
Folder './data/results/nilc_metrix/exp0' already exists.
Folder './data/results/nilc_metrix/exp0/essaysFullGrade' already exists.
CSV with the answer will be saved in: essaysFullGrade-nilc_metrix-exp0-0


In [191]:
"""
Read Train, Test and Validation file
"""
dataset_base_path = "./data/Datasets/"

path_to_propor = dataset_base_path + "propor2024/"
train_dataset_propor = read_csv(path_to_propor, "train")
validation_dataset_propor = read_csv(path_to_propor, "validation")
test_dataset_propor = read_csv(path_to_propor, "test")

path_to_dataset_full_grade = dataset_base_path + "fullGradeEnemEssays2024/"
train_dataset_full_grade = read_csv(path_to_dataset_full_grade, "train")
validation_dataset_full_grade = read_csv(path_to_dataset_full_grade, "train")
test_dataset_full_grade = read_csv(path_to_dataset_full_grade, "test")

In [192]:
test_propor = filter_elements_by_text(dataset_propor, test_dataset_propor)
test_full_grade = filter_elements_by_text(dataset_full_grade, test_dataset_full_grade)
test_dataset = pd.concat([test_propor, test_full_grade], ignore_index=True)

validation_propor = filter_elements_by_text(dataset_propor, validation_dataset_propor)
validation_full_grade = filter_elements_by_text(dataset_full_grade, validation_dataset_full_grade)
train_propor = filter_elements_by_text(dataset_propor, train_dataset_propor)
train_full_grade = filter_elements_by_text(dataset_full_grade, train_dataset_full_grade)
train_dataset = pd.concat([validation_propor, validation_full_grade, train_propor, train_full_grade], ignore_index=True)

In [193]:
test_dataset["id_control"] = range(1, len(test_dataset) + 1)
train_dataset["id_control"] = range(1, len(train_dataset) + 1)

In [165]:
def gxboost(X_train, y_train, X_test, y_test):
    # Create and train a separate model for each competency
    models = {}
    predictions = {}
    accuracies = []
    
    best_params = {
        'colsample_bytree': 1.0,
        'learning_rate': 0.01,
        'max_depth': 11,
        'n_estimators': 100,
        'subsample': 0.4,
        'gamma': 0,
        'max_delta_step': 0,
        'min_child_weight': 1
    }
    
    for i in range(5):  # c1, c2, ..., c5
        print(f"Training model for c{i+1}...")
        
        # Extract labels for the current competency
        y_train_current = y_train[:, i]
        print(y_train_current)
        y_test_current = y_test[:, i]
        
        # Initialize XGBoost model
        model = xgb.XGBClassifier(
            objective='multi:softmax',  # Multiclass classification
            num_class=6,               # 6 classes: {0, 40, 80, 120, 160, 200}
            eval_metric='mlogloss',    # Multiclass log loss
            **best_params
        )
        
        # Train the model
        model.fit(X_train, y_train_current)
        
        # Store the model
        models[f"c{i+1}"] = model
        
        # Predict on the test set
        predictions[f"c{i+1}"] = model.predict(X_test)
        
        # Evaluate accuracy
        accuracy = accuracy_score(y_test_current, predictions[f"c{i+1}"])
        accuracies.append(accuracy)
        print(f"Accuracy for c{i+1}: {accuracy:.2f}")
        qwk = cohen_kappa_score(y_test_current, predictions[f"c{i+1}"],  weights="quadratic")
        print(f"QWK for c{i+1}: {qwk:.2f}")
    
    # Combine all predictions
    final_predictions = np.column_stack([predictions[f"c{i+1}"] for i in range(5)])
    print("Final Predictions:\n", final_predictions)
    
    decodedPredictions = decode_predictions(final_predictions)
    
    return decodedPredictions

In [166]:
def save_results(decodedPredictions, test_dataset, experimento=1):
    model_name = "gxboost"
    dataset_name = "extended2024"
    experiment_name = "exp" + str(experimento)
    
    final_predictions = []
    for decodedPrediction in decodedPredictions:
        final_predictions.append(str(decodedPrediction))
    
    test_dataset[model_name + "_grades"] = final_predictions
    
    # Save Tests Dataset
    path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
    filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "1")
    test_dataset.to_csv(path_to_save + "/" + filename_to_save + ".csv", index=False)

## Experimento 1

In [163]:
columns_to_drop = ['author', 'source',  'id_control']
X_train = train_dataset.iloc[:, 6:].drop(columns=columns_to_drop)
y_train = normalize_y(process_y(train_dataset["grades"]))
X_test = test_dataset.iloc[:, 6:].drop(columns=columns_to_drop)
y_test = normalize_y(process_y(test_dataset["grades"]))

In [120]:
decodedPredictions = gxboost(X_train, y_train, X_test, y_test)

Training model for c1...
[0 2 3 ... 5 5 5]
Accuracy for c1: 0.46
QWK for c1: 0.45
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.44
QWK for c2: 0.48
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.38
QWK for c3: 0.52
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.47
QWK for c4: 0.40
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.31
QWK for c5: 0.50
Final Predictions:
 [[3 3 2 3 0]
 [3 3 3 3 1]
 [2 1 1 2 0]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]


In [121]:
save_results(decodedPredictions, test_dataset, 1)

Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp1' already exists.
Folder './data/results/gxboost/exp1/extended2024' already exists.
CSV with the answer will be saved in: extended2024-gxboost-exp1-1


## Experimento 2

In [176]:
model_name = "gemini-1.5-flash"

In [310]:
def execute_gxboost_with_additional_grades(experiment_id, experiment_names, model_names, train_dataset, test_dataset):
    dataset_names = ["propor2024", "extended_complete"]
    columns_to_drop = []
    dfs2 = []
    for model_name in model_names:
        
        for experiment_name in experiment_names:
            dfs = []
            for dataset_name in dataset_names:
                path_to_save = create_experiment_folder(path_to, model_name, experiment_name, dataset_name)
                filename_to_save = build_filename_to_save(model_name, experiment_name, dataset_name, "1")
                
                dataset = read_csv(path_to_save, filename_to_save)
                dfs.append(dataset)
        
            full_dataset = pd.concat(dfs, ignore_index=True)
            grade_key = model_name.split("-")[0] + "_grades"
            df = full_dataset[["essay_text", grade_key]]
            columns = []
            # Create new column names for expanded grades
            df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
            columns = [f"Competência {i} - {grade_key} - {experiment_name}" for i in range(1, 6)]
            columns.append(f"Total {i} - {grade_key} - {experiment_name}")
            
            grades_expanded = pd.DataFrame(df[grade_key].tolist(), columns=columns, index=df.index)
            df = pd.concat([df, grades_expanded], axis=1)
            dfs2.append(df)
        columns_to_drop.append(model_name.split("-")[0] + "_grades")
    df = pd.concat(dfs2, axis=1)
    df = df.loc[:, ~df.columns.duplicated()]
        
    test_dataset_with_model_grades =  pd.merge(test_dataset, df, on="essay_text", how="left").drop_duplicates(subset=["id_control", "essay_text"])
    train_dataset_with_model_grades =  pd.merge(train_dataset, df, on="essay_text", how="left").drop_duplicates(subset=["id_control", "essay_text"])
    #return train_dataset_with_model_grades
    cut_value = 6
    if (experiment_id >= 5):
        cut_value = 78
    
    columns_to_drop += ['author', 'source', 'id_control']
    X_train = train_dataset_with_model_grades.iloc[:, cut_value:].drop(columns=columns_to_drop)
    y_train = normalize_y(process_y(train_dataset_with_model_grades["grades"]))
    X_test = test_dataset_with_model_grades.iloc[:, cut_value:].drop(columns=columns_to_drop)
    y_test = normalize_y(process_y(test_dataset_with_model_grades["grades"]))
    
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            print(col)
            try:
                X_train[col] = X_train[col].astype(int)
                X_test[col] = X_test[col].astype(int)
            except ValueError:
                print(f"Coluna {col} não pode ser convertida para float.")
            
    decodedPredictions = gxboost(X_train, y_train, X_test, y_test)
    save_results(decodedPredictions, test_dataset_with_model_grades, experiment_id)
    #return X_train

In [313]:
experiment_ids = {
    2 : ["gemini-1.5-flash"],
    3 : ["llama-3.2-90b-text-preview"],
    4 : ["gemini-1.5-flash", "llama-3.2-90b-text-preview"],
    5 : ["gemini-1.5-flash"],
    6 : ["llama-3.2-90b-text-preview"],
    7 : ["gemini-1.5-flash", "llama-3.2-90b-text-preview"],
}

In [314]:
for experiment_id in experiment_ids.keys():
     X_train = execute_gxboost_with_additional_grades(experiment_id, ["exp5", "exp6"], experiment_ids[experiment_id], train_dataset, test_dataset)

Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp5' already exists.
Folder './data/results/gemini-1.5-flash/exp5/propor2024' already exists.
CSV with the answer will be saved in: propor2024-gemini-1.5-flash-exp5-1
Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp5' already exists.
Folder './data/results/gemini-1.5-flash/exp5/extended_complete' already exists.
CSV with the answer will be saved in: extended_complete-gemini-1.5-flash-exp5-1
Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp6' already exists.
Folder './data/results/gemini-1.5-flash/exp6/propor2024' already exists.
CSV with the answer will be saved in: propor2024-gemini-1.5-flash-exp6-1
Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp6' already exists.
Folder './data/results/gemini-1.5-flash/exp6/extended_complete' already 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list


Training model for c1...
[0 2 3 ... 5 5 5]
Accuracy for c1: 0.47
QWK for c1: 0.48
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.47
QWK for c2: 0.53
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.40
QWK for c3: 0.65
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.47
QWK for c4: 0.45
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.34
QWK for c5: 0.53
Final Predictions:
 [[3 3 2 3 0]
 [2 3 3 3 0]
 [2 3 3 2 0]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp2' already exists.
Folder './data/results/gxboost/exp2/extended2024' already exists.
CSV with the answer will be saved in: extended2024-gxboost-exp2-1
Folder './data/results/llama-3.2-90b-text-preview' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp5' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp5/propor2024' already exists.
CSV with the answer will be sav

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list


Accuracy for c1: 0.45
QWK for c1: 0.44
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.45
QWK for c2: 0.52
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.42
QWK for c3: 0.65
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.46
QWK for c4: 0.42
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.32
QWK for c5: 0.50
Final Predictions:
 [[3 3 1 3 0]
 [3 3 3 3 3]
 [2 1 3 2 0]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp3' already exists.
Folder './data/results/gxboost/exp3/extended2024' already exists.
CSV with the answer will be saved in: extended2024-gxboost-exp3-1
Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp5' already exists.
Folder './data/results/gemini-1.5-flash/exp5/propor2024' already exists.
CSV with the answer will be saved in: propor2024-gemini-1.5-flash-exp5-1
Folder './data/results/gemini-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert st

Folder './data/results/llama-3.2-90b-text-preview' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp6' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp6/extended_complete' already exists.
CSV with the answer will be saved in: extended_complete-llama-3.2-90b-text-preview-exp6-1
Competência 1 - llama_grades - exp5
Training model for c1...
[0 2 3 ... 5 5 5]
Accuracy for c1: 0.49
QWK for c1: 0.57
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.50
QWK for c2: 0.60
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.43
QWK for c3: 0.70
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.52
QWK for c4: 0.46
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.37
QWK for c5: 0.57
Final Predictions:
 [[3 3 2 3 0]
 [3 3 3 3 0]
 [2 3 3 2 0]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp4' already exists.
Folder './data/results/gxboost

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list


Training model for c1...
[0 2 3 ... 5 5 5]
Accuracy for c1: 0.50
QWK for c1: 0.61
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.44
QWK for c2: 0.57
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.40
QWK for c3: 0.68
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.51
QWK for c4: 0.47
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.33
QWK for c5: 0.55
Final Predictions:
 [[3 3 1 3 0]
 [2 3 3 3 0]
 [3 3 3 3 2]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp5' created.
Folder './data/results/gxboost/exp5/extended2024' created.
CSV with the answer will be saved in: extended2024-gxboost-exp5-1
Folder './data/results/llama-3.2-90b-text-preview' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp5' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp5/propor2024' already exists.
CSV with the answer will be saved in: propor2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list


Accuracy for c1: 0.50
QWK for c1: 0.49
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.45
QWK for c2: 0.49
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.37
QWK for c3: 0.55
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.44
QWK for c4: 0.45
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.31
QWK for c5: 0.54
Final Predictions:
 [[3 1 1 3 1]
 [3 3 3 3 1]
 [3 3 3 3 3]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp6' created.
Folder './data/results/gxboost/exp6/extended2024' created.
CSV with the answer will be saved in: extended2024-gxboost-exp6-1
Folder './data/results/gemini-1.5-flash' already exists.
Folder './data/results/gemini-1.5-flash/exp5' already exists.
Folder './data/results/gemini-1.5-flash/exp5/propor2024' already exists.
CSV with the answer will be saved in: propor2024-gemini-1.5-flash-exp5-1
Folder './data/results/gemini-1.5-flash' alre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert string to Python list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[grade_key] = df[grade_key].apply(eval)  # Convert st

Folder './data/results/llama-3.2-90b-text-preview' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp6' already exists.
Folder './data/results/llama-3.2-90b-text-preview/exp6/extended_complete' already exists.
CSV with the answer will be saved in: extended_complete-llama-3.2-90b-text-preview-exp6-1
Competência 1 - llama_grades - exp5
Training model for c1...
[0 2 3 ... 5 5 5]
Accuracy for c1: 0.50
QWK for c1: 0.58
Training model for c2...
[0 2 1 ... 5 5 5]
Accuracy for c2: 0.47
QWK for c2: 0.62
Training model for c3...
[0 2 1 ... 5 5 5]
Accuracy for c3: 0.41
QWK for c3: 0.63
Training model for c4...
[0 2 4 ... 5 5 5]
Accuracy for c4: 0.50
QWK for c4: 0.52
Training model for c5...
[0 2 0 ... 5 5 5]
Accuracy for c5: 0.36
QWK for c5: 0.62
Final Predictions:
 [[3 3 1 3 0]
 [3 3 3 3 0]
 [3 3 3 3 2]
 ...
 [5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
Folder './data/results/gxboost' already exists.
Folder './data/results/gxboost/exp7' created.
Folder './data/results/gxboost/exp7/e

In [296]:
print(X_train["Competência 1 - gemini_grades - exp5"].dtypes)

KeyError: 'Competência 1 - gemini_grades - exp5'

In [305]:
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        print(col)
        try:
            X_train[col] = X_train[col].astype(int)
        except ValueError:
            print(f"Coluna {col} não pode ser convertida para float.")

In [269]:
# Filter rows where column "A" does not contain integers
filtered_df = y_test[y_test["Competência 1 - gemini_grades - exp5"].apply(lambda x: isinstance(x, int))]

print(filtered_df)

           id                                          id_prompt  \
0      9.html                    carnaval-e-apropriacao-cultural   
3     14.html               a-fe-e-decisiva-para-uma-vida-melhor   
6      7.html               a-fe-e-decisiva-para-uma-vida-melhor   
9     10.html               a-fe-e-decisiva-para-uma-vida-melhor   
12    10.html                    carnaval-e-apropriacao-cultural   
...       ...                                                ...   
3061      103  Desafios para a valorização de comunidades e p...   
3062       45       Democratização do acesso ao cinema no Brasil   
3063      169          Publicidade infantil em questão no Brasil   
3064      207  Manipulação do comportamento do usuário pelo c...   
3065       18  Manipulação do comportamento do usuário pelo c...   

                                            essay_title  \
0          Apropriação cultural significa uma pessoa...   
3                                   Desejo pelo Sucesso   
6     

In [233]:
train_dataset

Unnamed: 0,id,id_prompt,essay_title,essay_text,grades,essay_year,adjective_ratio,adverbs,content_words,flesch,...,lsa_givenness_mean,lsa_givenness_std,lsa_span_mean,lsa_span_std,negative_words,positive_words,ratio_function_to_content_words,author,source,id_control
0,9.html,carnaval-e-apropriacao-cultural,Apropriação cultural significa uma pessoa...,Apropriação cultural significa uma pessoa pode...,[0 0 0 0 0 0],2020,0.03817,0.04580,0.54198,42.62073,...,0.91404,0.02360,0.92045,0.02386,0.16901,0.57746,0.84507,,,1
1,14.html,a-fe-e-decisiva-para-uma-vida-melhor,Desejo pelo Sucesso,Hoje em dia a maioria dos brasileiros veem a f...,[ 80 80 80 80 80 400],2019,0.07782,0.07004,0.55253,-8.62155,...,0.94540,0.00651,0.95438,0.00304,0.21831,0.60563,0.80986,,,2
2,7.html,a-fe-e-decisiva-para-uma-vida-melhor,“E essa é a vitória que vence o mundo: a nossa...,"O ""vencer na vida"" pode ser interpretado como ...",[120 40 40 160 0 360],2019,0.08202,0.05678,0.59621,37.05130,...,0.91939,0.02514,0.93469,0.02308,0.20635,0.70370,0.67725,,,3
3,10.html,a-fe-e-decisiva-para-uma-vida-melhor,Em uma pesquisa recente,Em uma pesquisa recente realizada pela ONG Oxf...,[160 160 120 120 120 680],2019,0.07296,0.04292,0.57082,31.20928,...,0.90544,0.06490,0.91865,0.06316,0.21805,0.51880,0.75188,,,4
4,10.html,carnaval-e-apropriacao-cultural,Dar voz para quem não a possui,A apropriação cultural acontece quando um indi...,[ 80 80 120 80 80 440],2020,0.03896,0.03896,0.55195,13.21456,...,0.94205,0.01725,0.94449,0.01753,0.27059,0.55294,0.81176,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,103,Desafios para a valorização de comunidades e p...,,"""Na série brasileira ""Cidade Invisível"", parte...","[200, 200, 200, 200, 200, 1000]",2022,0.09725,0.04651,0.57717,31.11437,...,0.79953,0.12819,0.88535,0.08990,0.40293,0.60073,0.73260,Fernanda Barbosa,https://www.lucasfelpi.com.br/redamil,1193
1193,45,Democratização do acesso ao cinema no Brasil,,"""No Artigo 215° da Constituição Federal Brasil...","[200, 200, 200, 200, 200, 1000]",2019,0.10181,0.03167,0.59050,37.55407,...,0.75792,0.18520,0.84216,0.14319,0.36015,0.43295,0.69349,Emmanuelle Gomes de Faria,https://www.lucasfelpi.com.br/redamil,1194
1194,169,Publicidade infantil em questão no Brasil,,A propaganda é a principal arma das grandes em...,"[200, 200, 200, 200, 200, 1000]",2014,0.09880,0.01796,0.59281,28.55825,...,0.86692,0.06177,0.88608,0.06165,0.22727,0.67677,0.68687,Giovana Lazzaretti Segat,https://g1.globo.com/,1195
1195,207,Manipulação do comportamento do usuário pelo c...,,A série britânica “Black Mirror” é caracteriza...,"[200, 200, 200, 200, 200, 1000]",2018,0.05349,0.03488,0.56279,24.81269,...,0.89137,0.04377,0.90983,0.04855,0.26446,0.35124,0.77686,Jamille Borges,https://g1.globo.com/,1196
