In [1]:
from cmath import log
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score
from typing import Callable, Tuple, Dict
import logging


def _log_metrics(metrics):
    logging.info(
    '---\n' +
    '\n'.join([f'* {x}: {y}' for x,y in metrics.items()]) +
    '\n---'
    )


def evaluate_prob(y: np.array, y_pred: np.array, verbose=True) -> Dict[str, float]:
    """
    Score probabilistic predictions `y_pred` against the labels `y`.

    Returns a dict with two entries: `bce` (the value of `log_loss`) and
    `auc` (the value of `roc_auc_score`). When `verbose` is truthy the dict
    is also written to the log before it is returned.
    """

    result = {
        'bce': log_loss(y, y_pred),
        'auc': roc_auc_score(y, y_pred),
    }
    if not verbose:
        return result

    _log_metrics(result)
    return result


def evaluate(y: np.array, y_pred: np.array) -> Dict[str, float]:
    """
    Score hard label predictions and log the result.

    Only accuracy (the proportion of predictions that equal the true label)
    is computed, so the returned dict has the single key `accuracy`.

    Args:
        y: true labels.
        y_pred: predicted labels, aligned element-wise with `y`.

    Returns:
        A dict mapping `'accuracy'` to the accuracy score.
    """

    result = {'accuracy': accuracy_score(y, y_pred)}

    _log_metrics(result)
    return result


def evaluate_model(model: Callable[[pd.DataFrame], np.array], df: pd.DataFrame) -> Dict[str, float]:
    """
    Run `model` on `df` and score its predictions against the `y` column.

    Args:
        model: callable that receives the whole dataframe and returns one
            predicted label per row (presumably derived from the `x` column;
            this function itself only reads `y`).
        df: dataframe with the true labels in column `y`.

    Returns:
        The metrics dict produced by `evaluate` (not a tuple).
    """

    y = df['y'].to_numpy()
    y_pred = model(df)

    return evaluate(y, y_pred)



In [2]:
import pandas as pd
import os
from typing import List, Tuple, Union

# Default directory that `load_train` searches for the train_pos / train_neg text files.
DIR = '../input/twitter-datasets'


def _read_data(path: str) -> List[str]:
    with open(path, 'r') as f:
        return [x for x in f]


def _read_data_with_ids(path: str) -> Tuple[List[str], List[str]]:
    index = []
    rows = []
    with open(path, 'r') as f:
        for line in f:
            id, x = line.split(',', maxsplit=1)
            index.append(id)
            rows.append(x)
    return index, rows


def load_train(full=False, dir=DIR, eval_frac=None, x_col='x', y_col='y', neg_label=-1, pos_label=1, random_state=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Load the positive and negative training tweets into one shuffled dataframe.

    Args:
        full: read `train_pos_full.txt` / `train_neg_full.txt` instead of
            `train_pos.txt` / `train_neg.txt`.
        dir: directory containing those files.
        eval_frac: when given, the fraction of rows sampled into a validation
            frame; the remaining rows form the training frame.
        x_col: name of the text column.
        y_col: name of the label column.
        neg_label: label assigned to every row of the negative file.
        pos_label: label assigned to every row of the positive file.
        random_state: seed forwarded to both `DataFrame.sample` calls. The
            default `None` keeps the previous non-deterministic behaviour;
            pass an int for a reproducible shuffle and split.

    Returns:
        The shuffled dataframe, or `(train, val)` when `eval_frac` is given.
        Besides `x_col` and `y_col` the frame carries an `index` column with
        each row's position before shuffling (positives first, then negatives).
    """
    suffix = '_full' if full else ''

    frames = []
    for stem, label in (('train_pos', pos_label), ('train_neg', neg_label)):
        rows = _read_data(os.path.join(dir, stem + suffix + '.txt'))
        frame = pd.DataFrame({x_col: rows})
        frame[y_col] = label
        frames.append(frame)

    # reset_index() (without drop) deliberately keeps the pre-shuffle position
    # as the `index` column; it is the only way to map rows back to the files.
    df = (
        pd.concat(frames, ignore_index=True)
        .sample(frac=1, random_state=random_state)
        .reset_index()
    )
    if eval_frac is None:
        return df

    val = df.sample(frac=eval_frac, random_state=random_state)
    train = df.drop(val.index)
    return train, val


def load_test(dir=DIR, x_col='x') -> pd.DataFrame:
    """
    Load the unlabelled test tweets into a one-column dataframe.

    The file is always read from
    `../input/cil-text-classification-2022/test_data.txt`; the `dir` argument
    is accepted but not used. The ids parsed from each line become the
    frame's index and the remaining text is stored in `x_col`.
    """
    test_file = os.path.join("../input/cil-text-classification-2022", 'test_data.txt')
    ids, texts = _read_data_with_ids(test_file)
    return pd.DataFrame({x_col: texts}, index=ids)

def load_emotion(path="../input/clean-emotions/text_emotion_cleaned.csv") -> pd.DataFrame:
    """
    Load the cleaned emotion dataset and attach a numeric class id per row.

    Args:
        path: CSV file to read; it must provide the emotion name in column
            `y`. Defaults to the cleaned emotion file in the input directory.

    Returns:
        The dataframe with an added float column `label` (noemo=0, joy=1,
        fear=2, sadness=3, love=4, surprise=5, anger=6; any other emotion
        becomes NaN) and with column `x` renamed to `text`.
    """
    emotion_ids = {
        'noemo': 0.0,
        'joy': 1.0,
        'fear': 2.0,
        'sadness': 3.0,
        'love': 4.0,
        'surprise': 5.0,
        'anger': 6.0,
    }

    df = pd.read_csv(path)
    # Keep the column float64 even when every row is mapped, so unknown
    # emotions can be NaN and the dtype never depends on the data.
    df["label"] = df['y'].map(emotion_ids).astype('float64')

    return df.rename({"x": "text"}, axis="columns")


In [3]:
# Preview the labelled emotion frame (bare last expression, so the notebook renders it).
load_emotion()

Unnamed: 0.1,Unnamed: 0,text,y,label
0,0,<user> i know i was listenin to bad habit earl...,noemo,0.0
1,1,layin n bed with a headache ughhhh...waitin on...,sadness,3.0
2,2,funeral ceremony...gloomy friday...,sadness,3.0
3,3,wants to hang out with friends soon!,joy,1.0
4,4,<user> we want to trade with someone who has h...,noemo,0.0
...,...,...,...,...
39995,39995,<user>,noemo,0.0
39996,39996,happy mothers day all my love,love,4.0
39997,39997,happy mother's day to all the mommies out ther...,love,4.0
39998,39998,<user> wassup beautiful!!! follow me!! peep ou...,joy,1.0


In [4]:
from pathlib import Path
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel, load_metric


def load(full=False, preprocessing=None):
    """
    Build the HuggingFace train / validation datasets from the emotion data.

    Both returned datasets are created from the very same dataframe (a single
    `load_emotion()` call), so the validation set is identical to the training
    set. `full` and `preprocessing` are accepted but currently unused.
    The `label` column of both datasets is cast to a `ClassLabel` with the
    seven names '0.0' ... '6.0'.
    """
    emotion_df = load_emotion()

    dataset_train = Dataset.from_pandas(emotion_df)
    dataset_val = Dataset.from_pandas(emotion_df)

    # Same schema as the raw dataset, except that `label` becomes categorical.
    label_feature = ClassLabel(names=['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0'])
    features = dataset_train.features.copy()
    features['label'] = label_feature

    dataset_train = dataset_train.cast(features)
    dataset_val = dataset_val.cast(features)
    return dataset_train, dataset_val


def tokenize(ds, tokenizer, path, force=True):
    """
    Tokenize the `text` column of `ds`, using `path` as an on-disk cache.

    When `force` is false and `path` already exists, the dataset stored there
    is loaded and returned as is. Otherwise `ds` is tokenized in batches
    (with padding and truncation enabled), written to `path` and returned.
    With the default `force=True` the cache is never read, only rewritten.
    """
    if not force and Path(path).exists():
        return Dataset.load_from_disk(path)

    def encode_batch(batch):
        return tokenizer(batch['text'], padding=True, truncation=True)

    tokenized = ds.map(encode_batch, batched=True)
    tokenized.save_to_disk(path)
    return tokenized


def get_BERT(model_name, device, num_labels=7):
    """
    Load a pretrained sequence-classification model and move it to `device`.

    Args:
        model_name: model identifier or path passed to `from_pretrained`.
        device: target device for the model (e.g. 'cuda' or 'cpu').
        num_labels: size of the classification head; defaults to 7, the
            number of emotion classes used in this notebook.

    Returns:
        The loaded model, already placed on `device`.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
    # NOTE(review): this saves the just-loaded model into a local directory
    # named exactly like `model_name`. Later `from_pretrained(model_name)`
    # calls may then resolve to that directory instead of the hub - confirm
    # this is intended.
    model.save_pretrained(model_name)
    return model


def train(model_name, tokenizer_name, device, full=False, preprocessing=None, batch_size=32, epochs=1, force_tokenize=True):
    """
    Fine-tune a sequence classifier and score it on the validation split.

    Steps: load the datasets via `load`, tokenize both (cached under
    `bert/cache/`), build the model with `get_BERT`, train it with the
    HuggingFace `Trainer`, then predict on the validation set and pass the
    arg-max labels to `evaluate`.

    Args:
        model_name: model identifier handed to `get_BERT`.
        tokenizer_name: tokenizer identifier handed to `AutoTokenizer`.
        device: device the model is moved to.
        full: forwarded to `load`; also selects the `__full` cache suffix.
        preprocessing: forwarded to `load`.
        batch_size: per-device training batch size.
        epochs: number of training epochs.
        force_tokenize: when true, re-tokenize even if a cache exists.

    Returns:
        `(model, metrics)` where `metrics` is the dict returned by `evaluate`.
    """
    dataset_train, dataset_val = load(full=full, preprocessing=preprocessing)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    train_tokenized = tokenize(
        dataset_train,
        tokenizer,
        path=f'bert/cache/train_tokenized__{tokenizer_name}{"__full" if full else ""}',
        force=force_tokenize)
    print(train_tokenized)
    val_tokenized = tokenize(
        dataset_val,
        tokenizer,
        path=f'bert/cache/val_tokenized__{tokenizer_name}{"__full" if full else ""}',
        force=force_tokenize)

    model = get_BERT(model_name, device)

    training_args = TrainingArguments(
        output_dir='bert_data/test_trainer',
        num_train_epochs=epochs,
        save_strategy='epoch',
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        load_best_model_at_end=True)

    metric = load_metric('accuracy')

    def compute_metrics(eval_pred):
        # eval_pred unpacks into (logits, labels); score the arg-max class.
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    # Per-epoch evaluation (and therefore best-checkpoint selection) must use
    # the validation split, not the data the model is being trained on.
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics)

    trainer.train()

    val_pred = trainer.predict(val_tokenized)
    y_pred = np.argmax(val_pred.predictions, axis=1)
    y = val_tokenized.to_pandas()['label']
    metrics = evaluate(y, y_pred)
    return model, metrics


def objective(args, model_name, tokenizer_name, device, full=False):
    """
    Run one training and return the negated validation accuracy.

    `args` is a dict of extra keyword arguments forwarded to `train`; it is
    printed before training starts. The sign flip presumably makes the value
    usable as a loss for a minimising hyper-parameter search.
    """
    print(args)
    run_metrics = train(model_name, tokenizer_name, device, full=full, **args)[1]
    return -run_metrics['accuracy']


In [5]:
import torch
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [6]:
import os

# Switch off the Weights & Biases integration for the Trainer runs below.
os.environ.update(WANDB_DISABLED="true")

In [7]:
# Run configuration.
FULL = False
EPOCHS = 1
BATCH_SIZE = 32

MODEL = 'distilbert-base-uncased'  # alternative: 'cardiffnlp/twitter-roberta-base-sentiment-latest'
TOKENIZER = 'bert-base-uncased'

# `train` returns (model, metrics dict): despite its name, `accuracy` holds
# the whole metrics dict, not a bare number.
model, accuracy = train(
    MODEL,
    TOKENIZER,
    device,
    full=FULL,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)
model.save_pretrained("emo_1_epoch")

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

Dataset({
    features: ['Unnamed: 0', 'text', 'y', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})


  0%|          | 0/40 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, Unnamed: 0, y, text. If token_type_ids, Unnamed: 0, y, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4512,1.321792,0.506675


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, Unnamed: 0, y, text. If token_type_ids, Unnamed: 0, y, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 8
Saving model checkpoint to bert_data/test_trainer/checkpoint-1250
Configuration saved in bert_data/test_trainer/checkpoint-1250/config.json
Model weights saved in bert_data/test_trainer/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in bert_data/test_trainer/checkpoint-1250/tokenizer_config.json
Special tokens file saved in bert_data/test_trainer/checkpoint-1250/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert_data/test_trainer/checkpoint-1250 (score: 1.321791648864746).
The following c

Configuration saved in emo_1_epoch/config.json
Model weights saved in emo_1_epoch/pytorch_model.bin


In [8]:
# Tokenize the unlabelled test tweets and predict one class id per tweet.
df_test = load_test(x_col='text')
dataset_test = Dataset.from_pandas(df_test)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
test_tokenized = tokenize(
    dataset_test,
    tokenizer,
    path=f'bert/cache/test_tokenized__{TOKENIZER}',
)
trainer = Trainer(model, tokenizer=tokenizer)
test_output = trainer.predict(test_tokenized)
# Highest-scoring class per row.
test_pred = np.argmax(test_output.predictions, axis=1)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/ma

  0%|          | 0/10 [00:00<?, ?ba/s]

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, __index_level_0__, text. If token_type_ids, __index_level_0__, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10000
  Batch size = 8


In [9]:
# Wrap the predicted class ids in a one-column frame, persist it and echo it.
# The frame gets a fresh 0..n-1 index, so the ids from the test file are not written out.
test_pred = pd.DataFrame(data=test_pred)
test_pred.to_csv(path_or_buf="test_1_epoch.csv")
print(test_pred)

      0
0     0
1     2
2     2
3     1
4     2
...  ..
9995  1
9996  2
9997  1
9998  1
9999  0

[10000 rows x 1 columns]


In [10]:
# Predict an emotion class for every tweet of the full twitter training set.
df_train = load_train(x_col='text', full=True)
dataset_train = Dataset.from_pandas(df_train)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
# Use a cache path of its own: the previous `test_tokenized__...` path was a
# copy-paste slip that overwrote the tokenized test set with training data.
train_tokenized = tokenize(dataset_train, tokenizer, path=f'bert/cache/train_full_tokenized__{TOKENIZER}')
trainer = Trainer(model, tokenizer=tokenizer)
train_pred = trainer.predict(train_tokenized)
# Highest-scoring class per row, in the (shuffled) row order of `df_train`.
train_pred = np.argmax(train_pred.predictions, axis=1)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/ma

  0%|          | 0/2500 [00:00<?, ?ba/s]

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: y, index, token_type_ids, text. If y, index, token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2500000
  Batch size = 8


In [11]:
# `load_train` shuffles its rows, so the raw predictions are in random order.
# Re-attach each row's pre-shuffle position (the `index` column of `df_train`)
# and sort by it, so that CSV line i belongs to line i of the source files
# (positives first, then negatives).
if not isinstance(train_pred, pd.DataFrame):
    # Guard keeps the cell safe to re-run once `train_pred` is already a frame.
    train_pred = pd.DataFrame(train_pred, index=df_train['index'].to_numpy()).sort_index()
train_pred.to_csv("train_1_epoch_full.csv")