In [2]:
import pandas as pd
import numpy as np
import os
import csv
from zipfile import ZipFile
import torch
import krippendorff
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from catboost import CatBoostRegressor
import xgboost as xgb
from transformers import AutoTokenizer, XLMRobertaModel
from typing import Dict, List, Tuple, Optional
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
import xgboost as xgb
import numpy as np
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                           precision_recall_fscore_support)
from catboost import CatBoostClassifier
import json
import pickle
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [5]:
class DataLoader:
    def __init__(self):
        self.setup_paths()
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def setup_paths(self):
        self.path_dev = 'dev/'
        self.path_train = 'train/'
        self.path_test = 'test/'
        self.path_output = 'answer/'

        for path in [self.path_dev, self.path_train, self.path_test, self.path_output]:
            if not os.path.exists(path):
                os.makedirs(path)

    def extract_zip_files(self):
        # Extract dev.zip
        if not os.listdir(self.path_dev):
            with ZipFile('dev.zip', 'r') as dev:
                dev.extractall(self.path_dev)

        # Extract train.zip
        if not os.listdir(self.path_train):
            with ZipFile('train.zip', 'r') as train:
                train.extractall(self.path_train)

        # Extract test.zip
        if not os.listdir(self.path_test):
            with ZipFile('test.zip', 'r') as test:
                test.extractall(self.path_test)

    def load_tsv_files(self):
        languages = os.listdir(self.path_train)
        self.logger.info(f"Found languages: {languages}")

        # Initialize file paths
        label_file_paths_train = []
        uses_file_paths_train = []
        label_file_paths_dev = []
        uses_file_paths_dev = []
        instance_file_paths_test = []
        uses_file_paths_test = []

        for lang in languages:
            label_file_paths_train.append(f"{self.path_train}{lang}/labels.tsv")
            uses_file_paths_train.append(f"{self.path_train}{lang}/uses.tsv")
            label_file_paths_dev.append(f"{self.path_dev}{lang}/labels.tsv")
            uses_file_paths_dev.append(f"{self.path_dev}{lang}/uses.tsv")
            instance_file_paths_test.append(f"{self.path_test}{lang}/instances.tsv")
            uses_file_paths_test.append(f"{self.path_test}{lang}/uses.tsv")
        paths = {
            'train_labels_list': label_file_paths_train,
            'train_uses_list': uses_file_paths_train,
            'dev_labels_list': label_file_paths_dev,
            'dev_uses_list': uses_file_paths_dev,
            'test_uses_list': uses_file_paths_test,
            'test_instances_list': instance_file_paths_test
        }

        data_dict = {key: [] for key in paths.keys()}

        for save_path, path_list in paths.items():
            for path in path_list:
                with open(path, encoding='utf-8') as tsvfile:
                    language = path.split('/')[1]
                    reader = csv.DictReader(tsvfile, delimiter='\t',
                                         quoting=csv.QUOTE_MINIMAL, quotechar='"')
                    for row in reader:
                        row['language'] = language
                        data_dict[save_path].append(row)

        return data_dict

    def create_merged_dataframes(self, data_dict):
        def create_mappings(uses_list):
            id2context = {}
            id2idx = {}
            for row in uses_list:
                identifier = row['identifier']
                id2context[identifier] = row['context']
                id2idx[identifier] = row['indices_target_token']
            return id2context, id2idx

        # Create mappings
        train_id2context, train_id2idx = create_mappings(data_dict['train_uses_list'])
        dev_id2context, dev_id2idx = create_mappings(data_dict['dev_uses_list'])
        test_id2context, test_id2idx = create_mappings(data_dict['test_uses_list'])

        # Merge train data
        train_uses_merged = []
        for row in data_dict['train_labels_list']:
            identifier1_train = row['identifier1']
            identifier2_train = row['identifier2']

            data_row = {
                'context1': train_id2context.get(identifier1_train),
                'context2': train_id2context.get(identifier2_train),
                'index_target_token1': train_id2idx.get(identifier1_train),
                'index_target_token2': train_id2idx.get(identifier2_train),
                'identifier1': identifier1_train,
                'identifier2': identifier2_train,
                'lemma': row['lemma'],
                'median_cleaned': row['median_cleaned'],
                'judgments': row['judgments'],
                'language': row['language']
            }
            train_uses_merged.append(data_row)


        dev_uses_merged = []
        for row in data_dict['dev_labels_list']:
            identifier1_dev = row['identifier1']
            identifier2_dev = row['identifier2']

            data_row = {
                'context1': dev_id2context.get(identifier1_dev),
                'context2': dev_id2context.get(identifier2_dev),
                'index_target_token1': dev_id2idx.get(identifier1_dev),
                'index_target_token2': dev_id2idx.get(identifier2_dev),
                'identifier1': identifier1_dev,
                'identifier2': identifier2_dev,
                'lemma': row['lemma'],
                'median_cleaned': row['median_cleaned'],
                'judgments': row['judgments'],
                'language': row['language']
            }
            dev_uses_merged.append(data_row)

        # Merge test data
        test_uses_merged = []
        for row in data_dict['test_instances_list']:
            identifier1_test = row['identifier1']
            identifier2_test = row['identifier2']

            data_row = {
                'context1': test_id2context.get(identifier1_test),
                'context2': test_id2context.get(identifier2_test),
                'index_target_token1': test_id2idx.get(identifier1_test),
                'index_target_token2': test_id2idx.get(identifier2_test),
                'identifier1': identifier1_test,
                'identifier2': identifier2_test,
                'lemma': row['lemma'],
                'language': row['language']
            }
            test_uses_merged.append(data_row)

        return pd.DataFrame(train_uses_merged), pd.DataFrame(dev_uses_merged), pd.DataFrame(test_uses_merged)

In [7]:
class EmbeddingGenerator:
    def __init__(self, model_name="FacebookAI/xlm-roberta-base"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = XLMRobertaModel.from_pretrained(model_name)

    def truncation_indices(self, target_subword_indices, truncation_tokens_before_target=0.5):
        max_tokens = 512
        n_target_subtokens = target_subword_indices.count(True)
        tokens_before = int((max_tokens - n_target_subtokens) * truncation_tokens_before_target)
        tokens_after = max_tokens - tokens_before - n_target_subtokens

        lindex_target = target_subword_indices.index(True)
        rindex_target = lindex_target + n_target_subtokens
        lindex = max(lindex_target - tokens_before, 0)
        rindex = rindex_target + tokens_after

        return lindex, rindex

    def get_target_token_embedding(self, context, index):
        start_idx = int(str(index).strip().split(':')[0])
        end_idx = int(str(index).strip().split(':')[1])

        inputs = self.tokenizer(context, return_tensors="pt",
                              return_offsets_mapping=True, add_special_tokens=False)

        offset_mapping = inputs['offset_mapping'][0].tolist()
        input_ids = inputs['input_ids']
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])

        subwords_bool_mask = [
            (start <= start_idx < end) or (start < end_idx <= end)
            or (start_idx <= start and end <= end_idx)
            for start, end in offset_mapping
        ]

        if len(input_ids[0]) > 512:
            lindex, rindex = self.truncation_indices(subwords_bool_mask)
            tokens = tokens[lindex:rindex]
            input_ids = input_ids[:, lindex:rindex]
            subwords_bool_mask = subwords_bool_mask[lindex:rindex]
            inputs['input_ids'] = input_ids

        with torch.no_grad():
            outputs = self.model(inputs['input_ids'])

        target_embeddings = outputs.last_hidden_state[0][subwords_bool_mask]
        return target_embeddings.mean(dim=0).numpy()

    def generate_embeddings(self, df, file_name):
        id2embedding = {}

        for _, row in df.iterrows():
            identifier1 = row['identifier1']
            identifier2 = row['identifier2']

            if identifier1 not in id2embedding:
                embedding1 = self.get_target_token_embedding(row['context1'],
                                                          row['index_target_token1'])
                id2embedding[identifier1] = embedding1

            if identifier2 not in id2embedding:
                embedding2 = self.get_target_token_embedding(row['context2'],
                                                          row['index_target_token2'])
                id2embedding[identifier2] = embedding2

        np.savez(file_name, **id2embedding)
        return id2embedding

In [8]:
class ModelTrainer:
    def __init__(self, class_labels=None, model_dir='saved_models'):
        self.scaler = StandardScaler()
        self.trained_models = {}
        self.model_dir = model_dir
        self.class_labels = class_labels
        self.label_encoder = LabelEncoder()
        os.makedirs(model_dir, exist_ok=True)

    def prepare_data(self, embeddings_file, df):
        """
        Prepare feature vectors from embeddings and dataframe
        """
        loaded_embeddings = np.load(embeddings_file)
        features_list = []

        for _, row in df.iterrows():
            embedding1 = loaded_embeddings[row['identifier1']]
            embedding2 = loaded_embeddings[row['identifier2']]
            features = self.create_features(embedding1, embedding2)
            features_list.append(features)

        return np.array(features_list)

    def create_features(self, embedding1, embedding2):
        """
        Create feature vector from pair of embeddings
        """
        concatenated = np.concatenate([embedding1, embedding2])
        difference = embedding1 - embedding2
        product = embedding1 * embedding2

        cos_sim = np.dot(embedding1, embedding2) / (
            np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
        euclidean_dist = np.linalg.norm(embedding1 - embedding2)
        manhattan_dist = np.sum(np.abs(embedding1 - embedding2))

        return np.concatenate([
            concatenated,
            difference,
            product,
            [cos_sim, euclidean_dist, manhattan_dist]
        ])

    def fit_label_encoder(self, y):
        """
        Fit label encoder to the target values
        """
        self.label_encoder.fit(y)
        self.num_classes = len(self.label_encoder.classes_)
        return self.label_encoder.transform(y)

    def calculate_class_weights(self, y):
        """
        Calculate balanced class weights
        """
        counter = Counter(y)
        max_samples = max(counter.values())
        weights = {cls: max_samples/count for cls, count in counter.items()}
        return weights

    def calculate_metrics(self, y_true, y_pred, y_prob=None):
        """
        Calculate comprehensive metrics for imbalanced classification
        """
        metrics = {
            'accuracy': float(accuracy_score(y_true, y_pred)),
            'balanced_accuracy': float(balanced_accuracy_score(y_true, y_pred)),
            'macro_f1': float(f1_score(y_true, y_pred, average='macro')),
            'weighted_f1': float(f1_score(y_true, y_pred, average='weighted'))
        }

        # Calculate per-class precision, recall, and f1
        precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred)
        for i in range(len(precision)):
            metrics[f'class_{i}_precision'] = float(precision[i])
            metrics[f'class_{i}_recall'] = float(recall[i])
            metrics[f'class_{i}_f1'] = float(f1[i])
            metrics[f'class_{i}_support'] = int(support[i])

        return metrics

    def save_fold_model(self, model, model_name, fold_num, metrics):
        """
        Save a trained model and its metrics for a specific fold
        """
        fold_dir = os.path.join(self.model_dir, f'fold_{fold_num}')
        os.makedirs(fold_dir, exist_ok=True)

        model_path = os.path.join(fold_dir, f'{model_name}_model.pkl')
        if model_name == 'cat':
            model.save_model(model_path)
        else:  # xgb
            model.save_model(model_path)

        metrics_path = os.path.join(fold_dir, f'{model_name}_metrics.json')
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f, indent=4)

    def load_fold_model(self, model_name, fold_num):
        """
        Load a model and its metrics for a specific fold
        """
        fold_dir = os.path.join(self.model_dir, f'fold_{fold_num}')
        model_path = os.path.join(fold_dir, f'{model_name}_model.pkl')
        metrics_path = os.path.join(fold_dir, f'{model_name}_metrics.json')

        if model_name == 'cat':
            model = CatBoostClassifier()
            model.load_model(model_path)
        else:  # xgb
            model = xgb.XGBClassifier()
            model.load_model(model_path)

        with open(metrics_path, 'r') as f:
            metrics = json.load(f)

        return model, metrics

    def train_and_predict(self, X_train, y_train, X_dev, y_dev, X_test):
        """
        Train models and make predictions
        """
        # Transform labels to 0-based indices
        y_train_encoded = self.fit_label_encoder(y_train)
        y_dev_encoded = self.label_encoder.transform(y_dev)

        print("Original class distribution:", Counter(y_train))
        print("Encoded class distribution:", Counter(y_train_encoded))
        print("Original class distribution (Dev):", Counter(y_dev))
        print("Encoded class distribution (Dev):", Counter(y_dev_encoded))

        # Calculate class weights using encoded labels
        class_weights = self.calculate_class_weights(y_train_encoded)
        print("Calculated class weights:", class_weights)

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_dev_scaled = self.scaler.transform(X_dev)
        X_test_scaled = self.scaler.transform(X_test)

        # Save scaler and label encoder
        scaler_path = os.path.join(self.model_dir, 'scaler.pkl')
        encoder_path = os.path.join(self.model_dir, 'label_encoder.pkl')
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scaler, f)
        with open(encoder_path, 'wb') as f:
            pickle.dump(self.label_encoder, f)

        # Convert class weights to model format
        catboost_class_weights = [class_weights[i] for i in range(self.num_classes)]

        models = {
            'cat': CatBoostClassifier(
                iterations=500,
                learning_rate=0.05,
                depth=6,
                random_seed=42,
                verbose=0,
                loss_function='MultiClass',
                classes_count=self.num_classes,
                class_weights=catboost_class_weights
            ),
            'xgb': xgb.XGBClassifier(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='multi:softmax',
                num_class=self.num_classes,
                eval_metric='mlogloss',
                scale_pos_weight=class_weights[1]/class_weights[0] if self.num_classes == 2 else 1
            )
        }

        num_split = 3
        skf = StratifiedKFold(n_splits=num_split, shuffle=True, random_state=42)
        cv_scores = {name: {
            'accuracy': [],
            'balanced_accuracy': [],
            'macro_f1': [],
            'weighted_f1': []
        } for name in models.keys()}

        self.fold_models = {f'fold_{i+1}': {} for i in range(num_split)}

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train_encoded)):
            fold_num = fold + 1
            X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
            y_fold_train, y_fold_val = y_train_encoded[train_idx], y_train_encoded[val_idx]

            print(f"\nFold {fold_num}/{num_split}")
            print(f"Fold {fold_num} class distribution - Training:", Counter(y_fold_train))
            print(f"Fold {fold_num} class distribution - Validation:", Counter(y_fold_val))

            fold_dir = os.path.join(self.model_dir, f'fold_{fold_num}')
            os.makedirs(fold_dir, exist_ok=True)
            np.save(os.path.join(fold_dir, 'train_indices.npy'), train_idx)
            np.save(os.path.join(fold_dir, 'val_indices.npy'), val_idx)

            for name, model in models.items():
                print(f"Training {name}...")
                model.fit(X_fold_train, y_fold_train)
                y_pred = model.predict(X_fold_val)
                y_prob = model.predict_proba(X_fold_val)

                metrics = self.calculate_metrics(y_fold_val, y_pred, y_prob)

                for metric in ['accuracy', 'balanced_accuracy', 'macro_f1', 'weighted_f1']:
                    cv_scores[name][metric].append(metrics[metric])

                self.save_fold_model(model, name, fold_num, metrics)

                print(f"\n{name} metrics for fold {fold_num}:")
                for metric, value in metrics.items():
                    print(f"{metric}: {value:.4f}")

        # Save overall CV scores
        cv_results = {}
        print("\nAverage CV Scores:")
        for name, scores in cv_scores.items():
            model_results = {}
            for metric, values in scores.items():
                mean_value = np.mean(values)
                std_value = np.std(values)
                model_results[metric] = {
                    'mean': float(mean_value),
                    'std': float(std_value)
                }
                print(f"{name} {metric}: {mean_value:.4f} ± {std_value:.4f}")
            cv_results[name] = model_results

        cv_results_path = os.path.join(self.model_dir, 'cv_results.json')
        with open(cv_results_path, 'w') as f:
            json.dump(cv_results, f, indent=4)

        # Train final models
        predictions = {}
        weights = {'cat': 0.6, 'xgb': 0.4}

        final_models_dir = os.path.join(self.model_dir, 'final_models')
        os.makedirs(final_models_dir, exist_ok=True)

        for name, model in models.items():
            print(f"\nTraining final {name} model...")
            model.fit(X_train_scaled, y_train_encoded)

            model_path = os.path.join(final_models_dir, f'{name}_final_model.pkl')
            if name == 'cat':
                model.save_model(model_path)
            else:  # xgb
                model.save_model(model_path)

            predictions[name] = model.predict_proba(X_dev_scaled)

        weighted_probas = np.zeros_like(predictions['cat'])
        for name, pred_proba in predictions.items():
            weighted_probas += weights[name] * pred_proba

        final_predictions = np.argmax(weighted_probas, axis=1)

        # Transform predictions back to original labels
        final_predictions = self.label_encoder.inverse_transform(final_predictions)

        return final_predictions, cv_results

    def predict_with_fold_model(self, X_test, fold_num, model_name='ensemble'):
        """
        Make predictions using a specific fold's model
        """
        # Load encoders
        scaler_path = os.path.join(self.model_dir, 'scaler.pkl')
        encoder_path = os.path.join(self.model_dir, 'label_encoder.pkl')

        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        with open(encoder_path, 'rb') as f:
            self.label_encoder = pickle.load(f)

        X_test_scaled = scaler.transform(X_test)

        if model_name == 'ensemble':
            cat_model, _ = self.load_fold_model('cat', fold_num)
            xgb_model, _ = self.load_fold_model('xgb', fold_num)

            cat_pred = cat_model.predict_proba(X_test_scaled)
            xgb_pred = xgb_model.predict_proba(X_test_scaled)

            weights = {'cat': 0.6, 'xgb': 0.4}
            weighted_probas = weights['cat'] * cat_pred + weights['xgb'] * xgb_pred

            predictions = np.argmax(weighted_probas, axis=1)
        else:
            model, _ = self.load_fold_model(model_name, fold_num)
            predictions = model.predict(X_test_scaled)

        # Transform predictions back to original labels
        return self.label_encoder.inverse_transform(predictions)

    def predict_with_final_model(self, X_test):
        """
        Make predictions using the final trained models
        """
        # Load encoders
        scaler_path = os.path.join(self.model_dir, 'scaler.pkl')
        encoder_path = os.path.join(self.model_dir, 'label_encoder.pkl')

        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        with open(encoder_path, 'rb') as f:
            self.label_encoder = pickle.load(f)

        X_test_scaled = scaler.transform(X_test)

        predictions = {}
        weights = {'cat': 0.6, 'xgb': 0.4}
        final_models_dir = os.path.join(self.model_dir, 'final_models')

        for name in ['cat', 'xgb']:
            model_path = os.path.join(final_models_dir, f'{name}_final_model.pkl')
            if name == 'cat':
                model = CatBoostClassifier()
                model.load_model(model_path)
            else:  # xgb
                model = xgb.XGBClassifier()
                model.load_model(model_path)

            predictions[name] = model.predict_proba(X_test_scaled)

        weighted_probas = np.zeros_like(predictions['cat'])
        for name, pred_proba in predictions.items():
            weighted_probas += weights[name] * pred_proba

        predictions = np.argmax(weighted_probas, axis=1)

        # Transform predictions back to original labels
        return self.label_encoder.inverse_transform(predictions)

In [9]:
data_loader = DataLoader()
data_loader.extract_zip_files()
data_dict = data_loader.load_tsv_files()
df_train_uses_merged, df_dev_uses_merged, df_test_uses_merged  = data_loader.create_merged_dataframes(data_dict)

2024-11-17 01:09:26,375 - INFO - Found languages: ['norwegian', 'german', 'chinese', 'spanish', 'english', 'russian', 'swedish']


In [10]:
df_train_uses_merged['median_cleaned'].value_counts()

median_cleaned
4.0    30257
1.0     7099
3.0     5967
2.0     4510
Name: count, dtype: int64

In [11]:
df_train_uses_merged.shape

(47833, 10)

In [12]:
#embedding_generator = EmbeddingGenerator()
#train_embeddings = embedding_generator.generate_embeddings(df_train_uses_merged, 'subtask2_train_embeddings.npz')
#dev_embeddings = embedding_generator.generate_embeddings(df_dev_uses_merged, 'subtask2_dev_embeddings.npz')

In [18]:
model_trainer = ModelTrainer(class_labels=4, model_dir='saved_models')
X_train = model_trainer.prepare_data('subtask1_train_embeddings.npz', df_train_uses_merged)
X_dev = model_trainer.prepare_data('subtask1_dev_embeddings.npz', df_dev_uses_merged)
y_train = df_train_uses_merged['median_cleaned'].astype(float).values
y_dev = df_dev_uses_merged['median_cleaned'].astype(float).values
X_test = model_trainer.prepare_data('subtask1_test_embeddings.npz', df_test_uses_merged)

In [19]:
predictions, cv_results = model_trainer.train_and_predict(X_train, y_train, X_dev, y_dev, X_train)

Original class distribution: Counter({4.0: 30257, 1.0: 7099, 3.0: 5967, 2.0: 4510})
Encoded class distribution: Counter({3: 30257, 0: 7099, 2: 5967, 1: 4510})
Original class distribution (Dev): Counter({4.0: 5676, 1.0: 1055, 2.0: 817, 3.0: 739})
Encoded class distribution (Dev): Counter({3: 5676, 0: 1055, 1: 817, 2: 739})
Calculated class weights: {2: 5.070722306016424, 3: 1.0, 0: 4.262149598535005, 1: 6.708869179600887}

Fold 1/3
Fold 1 class distribution - Training: Counter({3: 20171, 0: 4733, 2: 3978, 1: 3006})
Fold 1 class distribution - Validation: Counter({3: 10086, 0: 2366, 2: 1989, 1: 1504})
Training cat...

cat metrics for fold 1:
accuracy: 0.6866
balanced_accuracy: 0.6885
macro_f1: 0.6149
weighted_f1: 0.7049
class_0_precision: 0.5362
class_0_recall: 0.8178
class_0_f1: 0.6477
class_0_support: 2366.0000
class_1_precision: 0.4504
class_1_recall: 0.6941
class_1_f1: 0.5463
class_1_support: 1504.0000
class_2_precision: 0.4167
class_2_recall: 0.5631
class_2_f1: 0.4789
class_2_suppor

Parameters: { "scale_pos_weight" } are not used.




xgb metrics for fold 1:
accuracy: 0.7985
balanced_accuracy: 0.6275
macro_f1: 0.6690
weighted_f1: 0.7799
class_0_precision: 0.7798
class_0_recall: 0.7126
class_0_f1: 0.7447
class_0_support: 2366.0000
class_1_precision: 0.7283
class_1_recall: 0.5080
class_1_f1: 0.5985
class_1_support: 1504.0000
class_2_precision: 0.6933
class_2_recall: 0.3363
class_2_f1: 0.4529
class_2_support: 1989.0000
class_3_precision: 0.8168
class_3_recall: 0.9531
class_3_f1: 0.8797
class_3_support: 10086.0000

Fold 2/3
Fold 2 class distribution - Training: Counter({3: 20171, 0: 4733, 2: 3978, 1: 3007})
Fold 2 class distribution - Validation: Counter({3: 10086, 0: 2366, 2: 1989, 1: 1503})
Training cat...

cat metrics for fold 2:
accuracy: 0.6861
balanced_accuracy: 0.6813
macro_f1: 0.6119
weighted_f1: 0.7043
class_0_precision: 0.5401
class_0_recall: 0.8052
class_0_f1: 0.6465
class_0_support: 2366.0000
class_1_precision: 0.4457
class_1_recall: 0.6693
class_1_f1: 0.5351
class_1_support: 1503.0000
class_2_precision: 0.

Parameters: { "scale_pos_weight" } are not used.




xgb metrics for fold 2:
accuracy: 0.7978
balanced_accuracy: 0.6266
macro_f1: 0.6690
weighted_f1: 0.7798
class_0_precision: 0.7974
class_0_recall: 0.7054
class_0_f1: 0.7486
class_0_support: 2366.0000
class_1_precision: 0.7072
class_1_recall: 0.4997
class_1_f1: 0.5856
class_1_support: 1503.0000
class_2_precision: 0.6885
class_2_recall: 0.3489
class_2_f1: 0.4631
class_2_support: 1989.0000
class_3_precision: 0.8154
class_3_recall: 0.9524
class_3_f1: 0.8786
class_3_support: 10086.0000

Fold 3/3
Fold 3 class distribution - Training: Counter({3: 20172, 0: 4732, 2: 3978, 1: 3007})
Fold 3 class distribution - Validation: Counter({3: 10085, 0: 2367, 2: 1989, 1: 1503})
Training cat...

cat metrics for fold 3:
accuracy: 0.6925
balanced_accuracy: 0.6802
macro_f1: 0.6159
weighted_f1: 0.7094
class_0_precision: 0.5496
class_0_recall: 0.8099
class_0_f1: 0.6548
class_0_support: 2367.0000
class_1_precision: 0.4640
class_1_recall: 0.6653
class_1_f1: 0.5467
class_1_support: 1503.0000
class_2_precision: 0.

Parameters: { "scale_pos_weight" } are not used.




xgb metrics for fold 3:
accuracy: 0.7968
balanced_accuracy: 0.6160
macro_f1: 0.6632
weighted_f1: 0.7769
class_0_precision: 0.8063
class_0_recall: 0.6929
class_0_f1: 0.7453
class_0_support: 2367.0000
class_1_precision: 0.7229
class_1_recall: 0.4704
class_1_f1: 0.5699
class_1_support: 1503.0000
class_2_precision: 0.7063
class_2_recall: 0.3409
class_2_f1: 0.4598
class_2_support: 1989.0000
class_3_precision: 0.8085
class_3_recall: 0.9597
class_3_f1: 0.8776
class_3_support: 10085.0000

Average CV Scores:
cat accuracy: 0.6884 ± 0.0029
cat balanced_accuracy: 0.6834 ± 0.0037
cat macro_f1: 0.6142 ± 0.0017
cat weighted_f1: 0.7062 ± 0.0023
xgb accuracy: 0.7977 ± 0.0007
xgb balanced_accuracy: 0.6234 ± 0.0052
xgb macro_f1: 0.6670 ± 0.0027
xgb weighted_f1: 0.7789 ± 0.0014

Training final cat model...

Training final xgb model...


Parameters: { "scale_pos_weight" } are not used.



In [20]:
fold_predictions = model_trainer.predict_with_fold_model(X_train, fold_num=1, model_name='ensemble')


In [21]:
#predictions, cv_results = model_trainer.train_and_predict(X_train, y_train, X_train)
# Make predictions using a specific fold's model
#fold_predictions = model_trainer.predict_with_fold_model(X_train, fold_num=1, model_name='ensemble')
# Make predictions using the final model
final_predictions = model_trainer.predict_with_final_model(X_train)

In [24]:
# Predict on test data by language
for language, group in df_test_uses_merged.groupby('language'):
    test_indices = group.index
    X_test_ = X_test[test_indices]  # Select test data for the specific language
    y_pred = model_trainer.predict_with_fold_model(X_test_, fold_num=1, model_name='ensemble')  
    df_test_uses_merged.loc[test_indices, 'prediction'] = y_pred  

In [25]:
# Create answer file in required format for codalab
out_dir = 'answer/'
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
answer_df = df_test_uses_merged[['identifier1', 'identifier2', 'prediction', 'language']]
answer_df = answer_df.reset_index(drop=True)
for i in list(answer_df["language"].value_counts().index):
    df_temp = answer_df[answer_df["language"]==i]
    df_temp = df_temp.drop('language', axis=1)
    df_temp.to_csv(f'{out_dir}{i}.tsv', index=False, sep='\t', quoting=csv.QUOTE_MINIMAL, quotechar='"')

with ZipFile('answer.zip', 'w') as zipf:
    for root, _, files in os.walk(out_dir):
        for file in files:
            zipf.write(os.path.join(root, file), arcname=file)

answer_df.to_csv("prediction_df.csv", index=False)