In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load packages and basic definitions

In [1]:
import os
# Step up one directory before importing project code and reading data.
# Presumably the notebook lives one level below the project root, so that
# `src` and the relative 'data/...' paths used below resolve — confirm.
# NOTE(review): this call is not idempotent; re-running this cell moves the
# working directory up one more level each time.
os.chdir('../')
import sys
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from src import api_utils

In [2]:
# Override module-level settings on api_utils for this run; adjust batch_size
# or backup_training_file here if needed.
# NOTE(review): backup_training_file is the same path this notebook later
# checks in order to resume from existing labels — presumably api_utils writes
# its progress there; confirm in src/api_utils.
api_utils.batch_size = 5
api_utils.backup_training_file = 'data/processed/labeled_output.csv'

In [3]:
# Show full cell contents when displaying frames (no column-width truncation).
pd.options.display.max_colwidth = None

# Load data and basic preprocessing

In [4]:
# labeled_path: labeled output of a previous run; loaded below to resume when it exists.
# raw_path: raw training data, used below when no labeled output is found.
labeled_path = "data/processed/labeled_output.csv"
raw_path = "data/raw/dataset_train.csv"

In [5]:
# Resume from the labeled file when it exists; otherwise build the working
# frame from the raw training data.
resuming = os.path.exists(labeled_path)
if not resuming:
    print("No labeled data found. Starting from raw dataset...")
    # The raw file is pipe-separated. Drop its first column, rename the
    # 'input;' header to 'text', and strip trailing ';' from the text values.
    raw_train = pd.read_csv(raw_path, sep='|').iloc[:, 1:]
    api_utils.df = raw_train.rename(columns={'input;': 'text'}).assign(
        text=lambda frame: frame['text'].str.rstrip(';')
    )
else:
    print("Found existing labeled data. Resuming...")
    api_utils.df = pd.read_csv(labeled_path)

Found existing labeled data. Resuming...


In [6]:
# Read the pipe-separated validation file, drop its first column, and rename
# the 'input' column to 'text' so it matches the training frame.
valid_set = (
    pd.read_csv('data/raw/dataset_valid.csv', sep='|')
    .iloc[:, 1:]
    .rename(columns={'input': 'text'})
)

In [7]:
# Sanity check: count how many texts still end with ';' in each set. The raw
# training texts had trailing semicolons stripped at load time (the validation
# set doesn't include them — they were probably added by mistake in the
# original dataset), so both counts are expected to be all False.
for heading, texts in (('valid set', valid_set['text']), ('\ntrain set', api_utils.df['text'])):
    print(f'{heading}\n', texts.str.endswith(';').value_counts())

valid set
 text
False    199
Name: count, dtype: int64

train set
 text
False    651
Name: count, dtype: int64


# Create target in train dataset

In [11]:
# Names of the models to query: the keys of api_utils.model_dict, in order.
model_list = list(api_utils.model_dict)

In [12]:
# Make sure every model has a "rating_<model>" column; columns that already
# exist (e.g. when resuming from labeled data) are left untouched.
missing_rating_cols = [
    f"rating_{name}"
    for name in model_list
    if f"rating_{name}" not in api_utils.df.columns
]
for rating_col in missing_rating_cols:
    api_utils.df[rating_col] = pd.NA

In [1]:
# Run every model's labeling in parallel (one worker thread per model) and
# repeat for up to 10 passes, stopping early once no rating is missing.
for i in range(10):
    with ThreadPoolExecutor(max_workers=len(model_list)) as executor:
        # Submit explicitly and keep the futures: the iterator returned by
        # executor.map() only raises a worker's exception when its results are
        # consumed, so discarding it would hide every failure.
        futures = {
            model: executor.submit(api_utils.run_batches_for_model, model)
            for model in model_list
        }

    # Leaving the `with` block waits for all workers, so every future is done.
    # Report failures instead of raising, so one failing model does not abort
    # the retry loop for the others.
    for model, future in futures.items():
        error = future.exception()
        if error is not None:
            print(f"Iteration {i}: {model} failed with {error!r}")

    # Number of rows per model that still have no rating.
    remaining = {
        model: api_utils.df[f"rating_{model}"].isna().sum()
        for model in api_utils.model_dict
    }
    print(f"Iteration {i} remaining:", remaining)

    if all(v == 0 for v in remaining.values()):
        print("All models complete.")
        break
else:
    # Only reached when the loop ran out of passes without hitting `break`.
    print("Stopped after 10 iterations with ratings still missing:", remaining)