In [1]:
# LOADING

import pandas as pd
import os
from typing import List, Tuple, Union

DIR = '../input/twitter-datasets'


def _read_data(path: str) -> List[str]:
    with open(path, 'r') as f:
        return [x for x in f]


def _read_data_with_ids(path: str) -> Tuple[List[str], List[str]]:
    index = []
    rows = []
    with open(path, 'r') as f:
        for line in f:
            id, x = line.split(',', maxsplit=1)
            index.append(id)
            rows.append(x)
    return index, rows


def load_train(full=False, dir=DIR, eval_frac=None, x_col='x', y_col='y', neg_label=-1, pos_label=1) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Load the positive and negative training tweets into one shuffled frame.

    Reads ``train_pos[_full].txt`` and ``train_neg[_full].txt`` from `dir`
    (the ``_full`` variants when `full` is true), stores the lines in column
    `x_col` and the class label (`pos_label` / `neg_label`) in column `y_col`.

    The concatenated frame is shuffled without a fixed random state, so the
    order differs between runs. ``reset_index()`` is called without
    ``drop=True``, which keeps the pre-shuffle row position in an ``index``
    column.

    Returns the shuffled frame when `eval_frac` is None; otherwise returns
    ``(train, val)`` where `val` is a random `eval_frac` sample of the rows
    and `train` holds the remaining rows.
    """
    suffix = '_full' if full else ''

    labelled_frames = []
    # Positive file first, then negative, matching the concatenation order.
    for stem, label in (('train_pos', pos_label), ('train_neg', neg_label)):
        file_path = os.path.join(dir, stem + suffix + '.txt')
        frame = pd.DataFrame({x_col: _read_data(file_path)})
        frame[y_col] = label
        labelled_frames.append(frame)

    combined = pd.concat(labelled_frames, ignore_index=True)
    shuffled = combined.sample(frac=1).reset_index()

    if eval_frac is None:
        return shuffled

    val = shuffled.sample(frac=eval_frac)
    train = shuffled.drop(val.index)
    return train, val


def load_test(dir="../input/cil-text-classification-2022", x_col='x') -> pd.DataFrame:
    """
    Load ``test_data.txt`` from `dir` into a DataFrame.

    The ids parsed from each line become the frame's index (as strings) and
    the remaining text of each line is stored in column `x_col`.
    """
    test_path = os.path.join(dir, 'test_data.txt')
    ids, texts = _read_data_with_ids(test_path)
    return pd.DataFrame({x_col: texts}, index=ids)


# PREPROCESSING

from typing import Dict, Optional
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob
from tqdm import tqdm

# Download the NLTK resources needed below; these calls run every time the
# cell/module is executed.
nltk.download('stopwords')
nltk.download('omw-1.4')
# English stop words as a set, used for membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

# WordNet data is fetched before the lemmatizer instance is created.
nltk.download('wordnet')
# Shared module-level instances used by tokenize() and lemmatize().
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

# Registers tqdm's pandas integration, which provides the `progress_apply`
# method used by spelling_correction.
tqdm.pandas()


def to_lower(df: pd.DataFrame, x_col='x'):
    """
    Lower-case every sentence in column `x_col`, modifying `df` in place.

    Expects `x_col` to contain sentences (strings), not token lists.
    """
    def _lowered(sentence):
        return sentence.lower()

    df[x_col] = df[x_col].apply(_lowered)


def tokenize(df: pd.DataFrame, x_col='x'):
    """
    Replace every sentence in column `x_col` with its list of tokens, in place.

    Tokenization is delegated to the module-level `tokenizer` object.
    Expects `x_col` to contain sentences (strings).
    """
    def _split(sentence):
        return tokenizer.tokenize(sentence)

    df[x_col] = df[x_col].apply(_split)


def remove_tags(df: pd.DataFrame, x_col='x'):
    """
    Delete the '<user>' and '<url>' placeholders from every sentence, in place.

    Expects `x_col` to contain sentences (strings). After the placeholders
    are removed, leading and trailing whitespace is stripped.
    Deprecated in favour of remove_tag_tokens(df: pd.DataFrame)
    """
    def _without_tags(sentence):
        cleaned = sentence
        for tag in ('<user>', '<url>'):
            cleaned = cleaned.replace(tag, '')
        return cleaned.strip()

    df[x_col] = df[x_col].apply(_without_tags)


def remove_tag_tokens(df: pd.DataFrame, x_col='x'):
    """
    Drop the '<user>' and '<url>' placeholder tokens from every token list, in place.

    Expects `x_col` to contain lists of tokens. The placeholders are the same
    strings that replace_user_handles / replace_urls insert and that
    remove_tags strips from raw sentences, so the ordinary word 'user' is
    kept.
    """
    # Previously the filter compared against 'user' (without angle brackets),
    # which left '<user>' tokens in place and removed the plain word instead.
    tag_tokens = ('<user>', '<url>')
    df[x_col] = df[x_col].apply(
        lambda tokens: [w for w in tokens if w not in tag_tokens])


def remove_stopwords(df: pd.DataFrame, x_col='x'):
    """
    Filter out tokens found in the module-level `stop_words` collection, in place.

    Expects `x_col` to contain lists of tokens.
    """
    def _keep_content_words(tokens):
        kept = []
        for token in tokens:
            if token in stop_words:
                continue
            kept.append(token)
        return kept

    df[x_col] = df[x_col].apply(_keep_content_words)


def lemmatize(df: pd.DataFrame, x_col='x'):
    """
    Replace each token with the result of the module-level `lemmatizer`, in place.

    Expects `x_col` to contain lists of tokens.
    """
    def _lemmas(tokens):
        return list(map(lemmatizer.lemmatize, tokens))

    df[x_col] = df[x_col].apply(_lemmas)


def remove_single_symbols(df: pd.DataFrame, x_col='x'):
    """
    Discard every token of length 0 or 1 from each token list, in place.

    Expects `x_col` to contain lists of tokens.
    """
    def _longer_than_one(tokens):
        return [token for token in tokens if not len(token) <= 1]

    df[x_col] = df[x_col].apply(_longer_than_one)


def spelling_correction(df: pd.DataFrame, x_col='x'):
    """
    Run TextBlob's `Word.correct()` on every token, in place, with a progress bar.

    Expects `x_col` to contain lists of tokens. Uses `progress_apply`, which
    is only available after `tqdm.pandas()` has been called.
    """
    def _corrected(tokens):
        return [Word(token).correct() for token in tokens]

    df[x_col] = df[x_col].progress_apply(_corrected)


def replace_user_handles(df: pd.DataFrame, x_col='x'):
    """
    Substitute '<user>' for every token that looks like an @-handle, in place.

    A token counts as a handle when it starts with '@' and has at least one
    more character; a lone '@' is left unchanged.
    Expects `x_col` to contain lists of tokens.
    """
    def _mask_handles(tokens):
        return ["<user>" if (w.startswith("@") and len(w) > 1) else w
                for w in tokens]

    df[x_col] = df[x_col].apply(_mask_handles)


def replace_urls(df: pd.DataFrame, x_col='x'):
    """
    Substitute '<url>' for every token that starts like a web address, in place.

    A token is treated as a URL when it begins with "http://", "https://"
    or "www.". Expects `x_col` to contain lists of tokens.
    """
    url_prefixes = ("http://", "https://", "www.")

    def _mask_urls(tokens):
        return ["<url>" if w.startswith(url_prefixes) else w for w in tokens]

    df[x_col] = df[x_col].apply(_mask_urls)


def untokenize(df: pd.DataFrame, x_col='x'):
    """
    Join each token list back into a single space-separated string, in place.

    Expects `x_col` to contain lists of string tokens.
    """
    separator = " "
    df[x_col] = df[x_col].apply(separator.join)


def _whitespace_tokenize(df: pd.DataFrame, x_col='x'):
    """Split every sentence in column `x_col` on whitespace, in place."""
    splitter = WhitespaceTokenizer()
    df[x_col] = df[x_col].apply(splitter.tokenize)


def preprocess(df: pd.DataFrame, flags: Optional[Dict[str, bool]], x_col='x'):
    """
    Apply the preprocessing steps enabled in `flags` to column `x_col`, in place.

    `flags` maps step names to booleans; missing keys count as disabled and
    ``None`` disables everything. Enabled steps always run in this fixed
    order: to_lower, tokenize, replace_user_handles, replace_urls,
    remove_tags, remove_tag_tokens, remove_stopwords, lemmatize,
    remove_single_symbols, spelling_correction.

    Some steps expect sentences and others expect token lists, so the caller
    is responsible for enabling a consistent combination.
    """
    if flags is None:
        return

    if flags.get('to_lower', False):
        to_lower(df, x_col=x_col)
    if flags.get('tokenize', False):
        # This file later redefines `tokenize` with the signature
        # (ds, tokenizer, path, force) and rebinds the global `tokenizer`,
        # so calling `tokenize(df, x_col=...)` here would raise a TypeError.
        # The private helper keeps this step independent of both names.
        _whitespace_tokenize(df, x_col=x_col)
    if flags.get('replace_user_handles', False):
        replace_user_handles(df, x_col=x_col)
    if flags.get('replace_urls', False):
        replace_urls(df, x_col=x_col)
    if flags.get('remove_tags', False):
        remove_tags(df, x_col=x_col)
    if flags.get('remove_tag_tokens', False):
        remove_tag_tokens(df, x_col=x_col)
    if flags.get('remove_stopwords', False):
        remove_stopwords(df, x_col=x_col)
    if flags.get('lemmatize', False):
        lemmatize(df, x_col=x_col)
    if flags.get('remove_single_symbols', False):
        remove_single_symbols(df, x_col=x_col)
    if flags.get('spelling_correction', False):
        spelling_correction(df, x_col=x_col)

            
# SUBMISSION

import pandas as pd
import numpy as np
from typing import Callable

from loading import load_test


def prepare_model_submission(model: Callable[[pd.DataFrame], np.array], file='submission.csv'):
    """
    Predict on the test set with `model` and write a submission file.

    `model` is called once with the frame returned by `load_test()`; its
    return value is handed to `prepare_submission` together with `file`.
    """
    test_frame = load_test()
    predictions = model(test_frame)
    prepare_submission(predictions, file)


def prepare_submission(y_pred: np.ndarray, file='submission.csv'):
    """
    Write predictions to `file` as CSV with columns ``Id`` and ``Prediction``.

    Ids are the row positions shifted by one, so the first prediction gets
    Id 1.
    """
    submission = pd.DataFrame(y_pred, columns=['Prediction'])
    # Shift the default 0-based index so ids start at 1.
    submission.index = submission.index + 1
    submission.to_csv(file, index_label='Id')


# BERT INIT

from pathlib import Path
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel
from scipy.special import softmax

from evaluation import evaluate, evaluate_prob
from loading import load_train
from preprocessing import preprocess


def load(df_train, df_val, preprocessing=None):
    """
    Preprocess two frames and convert them to `datasets.Dataset` objects.

    Both frames are preprocessed in place on their 'text' column using the
    `preprocessing` flags, then converted with `Dataset.from_pandas`. The
    'label' feature of both datasets is cast to a `ClassLabel` with the
    names '0' and '1'; the feature schema is taken from the training set.

    Returns ``(dataset_train, dataset_val)``.
    """
    for frame in (df_train, df_val):
        preprocess(frame, flags=preprocessing, x_col='text')

    dataset_train = Dataset.from_pandas(df_train)
    dataset_val = Dataset.from_pandas(df_val)

    # Same schema for both splits, with 'label' as a two-class ClassLabel.
    features = dataset_train.features.copy()
    features['label'] = ClassLabel(names=['0', '1'])

    return dataset_train.cast(features), dataset_val.cast(features)


def tokenize(ds, tokenizer, path=None, force=True):
    """
    Tokenize the 'text' column of dataset `ds`, optionally using a disk cache.

    When `force` is false and `path` points to an existing location, the
    dataset stored there is loaded and returned instead of tokenizing.
    Otherwise `ds` is mapped in batches through `tokenizer` (with padding
    and truncation enabled) and, if `path` is given, the result is saved
    to `path` before being returned.
    """
    cache_usable = (not force) and path is not None and Path(path).exists()
    if cache_usable:
        return Dataset.load_from_disk(path)

    def _encode_batch(batch):
        return tokenizer(batch['text'], padding=True, truncation=True)

    encoded = ds.map(_encode_batch, batched=True)
    if path is not None:
        encoded.save_to_disk(path)
    return encoded


def get_BERT(model_name, device, num_labels=4):
    """
    Load a sequence-classification model and move it to `device`.

    Parameters
    ----------
    model_name : name or path passed to
        `AutoModelForSequenceClassification.from_pretrained`.
    device : device passed to the model's `.to(...)`.
    num_labels : size of the classification head. Defaults to 4, the value
        that was previously hard-coded. `ignore_mismatched_sizes=True` is
        passed so a checkpoint with a different head size can still load.

    Returns the loaded model.

    Side effect: the model is saved with `save_pretrained(model_name)`,
    i.e. to a local path equal to the model identifier.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, ignore_mismatched_sizes=True).to(device)
    # NOTE(review): this writes to a directory named like the model id;
    # presumably intended as a local cache — confirm it is wanted.
    model.save_pretrained(model_name)
    return model


def compute_metrics(eval_pred):
    """
    Compute evaluation metrics from a ``(predictions, labels)`` pair.

    `predictions` is treated as a 2-D array of per-class scores (one row per
    example); `labels` holds the true class of each example.

    The returned dict merges:
    - the result of `evaluate(labels, argmax predictions)`,
    - the result of `evaluate_prob(labels, softmax probability of class 1)`,
    - mean and standard deviation of the model's confidence (the highest
      softmax probability per example) over all examples, over correctly
      classified examples and over misclassified examples.

    If there are no correct (or no incorrect) examples, the corresponding
    mean/std are NaN, as numpy returns for an empty selection.
    """
    predictions, labels = eval_pred

    point_estimates = np.argmax(predictions, axis=1)
    point_estimate_eval = evaluate(labels, point_estimates)

    # Keep the full probability matrix. Confidence must be taken across the
    # class axis; taking np.max(..., axis=1) of the 1-D class-1 column raised
    # an AxisError.
    probabilities = softmax(predictions, axis=1)
    prob_estimates = probabilities[:, 1]
    prob_estimates_eval = evaluate_prob(labels, prob_estimates)

    confidence = np.max(probabilities, axis=1)
    is_correct = labels == point_estimates
    is_incorrect = labels != point_estimates

    return {
        **point_estimate_eval,
        **prob_estimates_eval,
        'confidence': confidence.mean(),
        'confidence_std': confidence.std(),
        'correct_confidence': confidence[is_correct].mean(),
        'correct_confidence_std': confidence[is_correct].std(),
        'incorrect_confidence': confidence[is_incorrect].mean(),
        'incorrect_confidence_std': confidence[is_incorrect].std(),
    }


def train(model_name, tokenizer_name, device, df_train, df_val, preprocessing=None, batch_size=32, epochs=1, force_tokenize=True):
    """
    Fine-tune a sequence-classification model on `df_train`, evaluating on `df_val`.

    Steps: preprocess and convert both frames with `load`, tokenize them
    with the tokenizer named `tokenizer_name`, load the model with
    `get_BERT`, and run a `Trainer` for `epochs` epochs with a per-device
    training batch size of `batch_size`. Checkpoints are saved and
    evaluation is run once per epoch, and the best checkpoint is loaded at
    the end. Trainer output goes to 'bert_data/test_trainer'.

    Returns the model object only (no metrics).
    """
    dataset_train, dataset_val = load(
        df_train, df_val, preprocessing=preprocessing)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # No cache path is passed, so tokenize() neither reads nor writes a
    # disk cache here regardless of `force_tokenize`.
    train_tokenized, val_tokenized = (
        tokenize(split, tokenizer, force=force_tokenize)
        for split in (dataset_train, dataset_val)
    )

    model = get_BERT(model_name, device)

    training_args = TrainingArguments(
        output_dir='bert_data/test_trainer',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_strategy='epoch',
        evaluation_strategy='epoch',
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    return model


def objective(args, model_name, tokenizer_name, device, full=False):
    """
    Objective for hyper-parameter search: negative accuracy of one training run.

    Prints `args`, forwards them as keyword arguments to `train` and returns
    minus the 'accuracy' entry of the metrics.

    NOTE(review): as written this does not match `train` in this file:
    `train` has no `full` parameter and returns only the model, while this
    function passes `full=` and unpacks a ``(_, metrics)`` pair. Confirm
    which side should change before using it.
    """
    print(args)
    result = train(model_name, tokenizer_name, device, full=full, **args)
    _, metrics = result
    accuracy = metrics['accuracy']
    return -accuracy


# TRAIN TEST SPLIT

import pandas as pd

# Frames read once when this cell/module runs; the select_* helpers below
# slice these module-level objects.
df_train = pd.read_csv('train_test/train.csv')
# The evaluation frame is read from 'test.csv' of the same split directory.
df_eval = pd.read_csv('train_test/test.csv')


def select_train(size=160_000):
    """
    Return the first `size` rows of the module-level `df_train`.

    The 'Unnamed: 0' column is dropped from the result; `df_train` itself
    is not modified.
    """
    head = df_train.iloc[:size]
    return head.drop(columns=['Unnamed: 0'])


def select_train_with_cluster(df_cluster_map: pd.DataFrame, cluster: int, size=160_000):
    """
    Return up to `size` training rows that belong to `cluster`.

    `df_cluster_map` is merged with the module-level `df_train` on the
    'index' column and must provide a 'cluster' column. Rows whose cluster
    equals `cluster` are kept, truncated to the first `size`, and returned
    without the 'Unnamed: 0' column.
    """
    merged = pd.merge(df_train, df_cluster_map, on='index')
    in_cluster = merged[merged['cluster'] == cluster]
    return in_cluster.iloc[:size].drop(columns=['Unnamed: 0'])


def select_eval(size=40_000):
    """
    Return the first `size` rows of the module-level `df_eval`.

    The 'Unnamed: 0' column is dropped from the result; `df_eval` itself
    is not modified.
    """
    head = df_eval.iloc[:size]
    return head.drop(columns=['Unnamed: 0'])


def select_eval_with_cluster(df_cluster_map: pd.DataFrame, cluster: int, size=40_000):
    """
    Return up to `size` evaluation rows that belong to `cluster`.

    `df_cluster_map` is merged with the module-level `df_eval` on the
    'index' column and must provide a 'cluster' column. Rows whose cluster
    equals `cluster` are kept, truncated to the first `size`, and returned
    without the 'Unnamed: 0' column.
    """
    merged = pd.merge(df_eval, df_cluster_map, on='index')
    in_cluster = merged[merged['cluster'] == cluster]
    return in_cluster.iloc[:size].drop(columns=['Unnamed: 0'])

[nltk_data] Downloading package stopwords to /Users/franz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/franz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/franz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, Trainer
from datasets import Dataset

# Configuration for labelling the tweets with a pretrained emotion classifier.
FULL=True

task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
TOKENIZER = MODEL

EPOCHS = 1
BATCH_SIZE = 32
PREPROCESSING = None

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# `model_name` was never defined in this notebook; the model identifier
# lives in MODEL.
model = get_BERT(MODEL, device)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
# The Trainer is used for batched prediction only; no training happens here.
trainer = Trainer(model, tokenizer=tokenizer)

# TEST DATA
print("load test data")
df_test = load_test(x_col='text')
dataset_test = Dataset.from_pandas(df_test)
print("tokenize test data")
test_tokenized = tokenize(dataset_test, tokenizer, path=f'bert/cache/test_tokenized__{TOKENIZER}')
print("predict emo labels for test data")
test_pred = trainer.predict(test_tokenized)
test_pred = np.argmax(test_pred.predictions, axis=1)
print("save emo labels for test data")
test_pred = pd.DataFrame(test_pred)
test_pred.to_csv("test_1_epoch.csv")

# TRAINING DATA
print("load training data")
# NOTE: this rebinds the global `df_train` that select_train() and
# select_train_with_cluster() read from.
df_train = load_train(x_col='text', full=FULL)
dataset_train = Dataset.from_pandas(df_train)
print("tokenize training data")
# The training set gets its own cache directory; it previously reused the
# test cache path and overwrote the tokenized test data.
train_tokenized = tokenize(dataset_train, tokenizer, path=f'bert/cache/train_tokenized__{TOKENIZER}')
print("predict emo labels for training data")
train_pred = trainer.predict(train_tokenized)
train_pred = np.argmax(train_pred.predictions, axis=1)
print("save emo labels for training data")
train_pred = pd.DataFrame(train_pred)
train_pred.to_csv("train_emotions.csv")

'cpu'