In [None]:
# NOTE: some of the Kaggle data sources used by this notebook are private.
# Run this cell first so kagglehub can authenticate before any data is imported.
import kagglehub

kagglehub.login()


In [None]:
# Run this cell to import the Kaggle data sources; afterwards it may be deleted.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.

llm_classification_finetuning_path = kagglehub.competition_download(
    'llm-classification-finetuning'
)

print('Data source import complete.')


In [None]:
# Kaggle starter cell. The environment is defined by the kaggle/python Docker
# image (https://github.com/kaggle/docker-python) and ships with many analytics
# libraries pre-installed; a few commonly used ones are loaded here.

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data lives in the read-only "../input/" directory.
# Walk it and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept beyond the current session.

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [None]:
import pandas as pd

# Locations of the competition CSV files inside the Kaggle environment
train_csv_path = '/kaggle/input/llm-classification-finetuning/train.csv'
test_csv_path = '/kaggle/input/llm-classification-finetuning/test.csv'
sample_submission_path = '/kaggle/input/llm-classification-finetuning/sample_submission.csv'

# Read the three CSVs (train, test, sample submission, in that order) into DataFrames
print("Carregando os dados...")
train_df, test_df, sample_submission_df = map(
    pd.read_csv,
    (train_csv_path, test_csv_path, sample_submission_path),
)
print("Dados carregados com sucesso!")

# Sanity check: confirm the data was loaded as expected
print("\nInformações do DataFrame de Treino:")
train_df.info()

print("\nPrimeiras 5 linhas do DataFrame de Teste:")
print(test_df.head())

Carregando os dados...
Dados carregados com sucesso!

Informações do DataFrame de Treino:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57477 entries, 0 to 57476
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              57477 non-null  int64 
 1   model_a         57477 non-null  object
 2   model_b         57477 non-null  object
 3   prompt          57477 non-null  object
 4   response_a      57477 non-null  object
 5   response_b      57477 non-null  object
 6   winner_model_a  57477 non-null  int64 
 7   winner_model_b  57477 non-null  int64 
 8   winner_tie      57477 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 3.9+ MB

Primeiras 5 linhas do DataFrame de Teste:
        id                                             prompt  \
0   136060  ["I have three oranges today, I ate an orange ...   
1   211333  ["You are a mediator in a heated political deb...   
2  1233961  ["How to initializ

In [None]:
def add_length_features(df: "pd.DataFrame") -> None:
    """Add text-length features to ``df`` in place.

    Creates ``prompt_len``, ``response_a_len`` and ``response_b_len`` (the
    ``.str.len()`` of the corresponding text column) and ``len_diff``
    (``response_a_len - response_b_len``), a direct comparison feature.

    Defined once and applied to both frames so the train and test features
    are always computed the same way.

    Returns None on purpose: as the last expression of a notebook cell a
    returned DataFrame would be rendered as cell output.
    """
    # Text length may be a proxy for verbosity
    for text_column in ('prompt', 'response_a', 'response_b'):
        df[f'{text_column}_len'] = df[text_column].str.len()

    # Length difference between the two responses
    df['len_diff'] = df['response_a_len'] - df['response_b_len']


add_length_features(train_df)
add_length_features(test_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from scipy.sparse import hstack, csr_matrix
import numpy as np

In [None]:
# Collapse the three winner indicator columns into a single integer target.
# target_map lists the columns in label order (0, 1, 2), so the argmax position
# over those columns is the class label.
target_map = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
winner_flags = train_df[list(target_map)].to_numpy()
train_df['target'] = winner_flags.argmax(axis=1)

In [None]:
# One TF-IDF vectorizer per text column.
# max_features caps the number of terms (features) to keep memory usage under control.
# ngram_range=(1, 2) uses single words and word pairs; (1, 3) also includes triples.
prompt_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
response_a_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
response_b_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# Text column -> vectorizer, in the order the matrices are produced below
text_vectorizers = {
    'prompt': prompt_vectorizer,
    'response_a': response_a_vectorizer,
    'response_b': response_b_vectorizer,
}

# Fit each vectorizer on the training text and transform it in the same step
print("Treinando vetorizadores...")
X_prompt, X_response_a, X_response_b = (
    vectorizer.fit_transform(train_df[text_column])
    for text_column, vectorizer in text_vectorizers.items()
)
print("Vetorizadores treinados.")

# Apply the already-fitted vectorizers to the test text
X_test_prompt, X_test_response_a, X_test_response_b = (
    vectorizer.transform(test_df[text_column])
    for text_column, vectorizer in text_vectorizers.items()
)

Treinando vetorizadores...
Vetorizadores treinados.


In [None]:
# Pull the length features out of both frames as plain arrays
length_columns = ['prompt_len', 'response_a_len', 'response_b_len', 'len_diff']
meta_features_train = train_df[length_columns].to_numpy()
meta_features_test = test_df[length_columns].to_numpy()

# Column-wise (horizontal) stack: prompt TF-IDF, response A TF-IDF,
# response B TF-IDF, then the length features wrapped as a sparse matrix
train_blocks = [X_prompt, X_response_a, X_response_b, csr_matrix(meta_features_train)]
test_blocks = [X_test_prompt, X_test_response_a, X_test_response_b, csr_matrix(meta_features_test)]

X_train_final = hstack(train_blocks)
X_test_final = hstack(test_blocks)

print(f"Shape final da matriz de treino: {X_train_final.shape}")

Shape final da matriz de treino: (57477, 25004)


In [None]:
# LightGBM hyper-parameters (a starting point; can be tuned further)
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,  # upper bound only: early stopping decides how many trees are kept
    'learning_rate': 0.02,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': 42,
    'n_jobs': -1,
    'verbose': -1,
}

# Initialise the model
model = lgb.LGBMClassifier(**params)

# Row indexing is needed to carve out a validation set; hstack() may return a
# COO matrix, which does not support it, so convert to CSR first.
X_train_csr = X_train_final.tocsr()
y_train = train_df['target'].to_numpy()

# Hold out one stratified fold (20% of the rows) for validation. Early stopping
# needs an eval_set; previously the callback was created but never passed to
# fit(), so all 2000 trees were always trained with no validation signal.
# NOTE: the model is now fitted on the remaining 80% of the training rows.
holdout_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=params['seed'])
fit_idx, valid_idx = next(holdout_splitter.split(X_train_csr, y_train))
X_valid, y_valid = X_train_csr[valid_idx], y_train[valid_idx]

# Stop when the validation log loss has not improved for 100 rounds
callbacks = [lgb.early_stopping(100, verbose=False)]

# Train the model
print("Treinando o modelo LightGBM...")
model.fit(
    X_train_csr[fit_idx],
    y_train[fit_idx],
    eval_set=[(X_valid, y_valid)],
    eval_metric='multi_logloss',
    callbacks=callbacks,
)
print("Modelo treinado.")

# Report the hold-out score so the quality of the submission is known up front
valid_log_loss = log_loss(y_valid, model.predict_proba(X_valid), labels=[0, 1, 2])
print(f"Log loss de validação: {valid_log_loss:.5f} (melhor iteração: {model.best_iteration_})")

# Predict class probabilities for the test set
print("Fazendo previsões...")
predictions_proba = model.predict_proba(X_test_final)

Treinando o modelo LightGBM...


In [None]:
# Build the submission frame: one probability column per outcome,
# with the test ids inserted as the first column
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = pd.DataFrame(predictions_proba, columns=winner_columns)
submission_df.insert(0, 'id', test_df['id'])

submission_df.to_csv('submission.csv', index=False)
print("Arquivo submission.csv criado com sucesso!")

In [None]:
# NOTE: this cell repeats the submission export already done in the preceding cell.
# Probabilities per outcome plus the test ids, with 'id' as the first column.
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = (
    pd.DataFrame(predictions_proba, columns=winner_columns)
    .assign(id=test_df['id'])
    .loc[:, ['id', *winner_columns]]
)
submission_df.to_csv('submission.csv', index=False)