In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

In [2]:
# Send timestamped INFO-level log records through the sentence-transformers
# LoggingHandler so training progress is visible in the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

In [3]:
import pandas as pd
from random import sample
import random

In [4]:
# Assign every name pair to a train/dev/test split (about 68% / 18% / 14%)
# and persist the labelled table for the training cells below.
name_pairs_score_df = pd.read_csv("data/data/names/name_pairs_score.csv")

n_rows = len(name_pairs_score_df)
train_li = ["train"] * round(n_rows * 0.68)
dev_li = ["dev"] * round(n_rows * 0.18)
# The test split takes whatever is left. Rounding the three shares
# independently can miss the row count by one, which would make the column
# assignment below fail with a length mismatch.
test_li = ["test"] * (n_rows - len(train_li) - len(dev_li))
split_li = train_li + dev_li + test_li
# Sanity check: number of labels minus number of rows (always 0).
print(len(split_li) - len(name_pairs_score_df))

# Shuffle the labels with a private, seeded generator so the assignment is
# reproducible and independent of the global `random` state.
seed = 60
n = len(split_li)
split_random_li = random.Random(seed).sample(split_li, n)

name_pairs_score_df["split"] = split_random_li
name_pairs_score_df.to_csv("data/data/names/name_pairs_score_split.csv", index=False)

0


In [5]:
# Path to the names dataset that carries the train/dev/test "split" column
# (the file written by the split cell above).
names_dataset_path = "data/data/names/name_pairs_score_split.csv"

In [9]:
# !ls output/

In [30]:
# Pretrained checkpoint to continue training from, and the basic schedule.
model_name = 'T-Systems-onsite/cross-en-de-roberta-sentence-transformer'
train_batch_size = 8
num_epochs = 2

# Optimizer-style settings: presumably learning rate, epsilon and weight decay
# for the optimizer used by model.fit; TODO confirm how they are wired in.
lr = 1e-05
eps = 1e-06
weight_decay = 0.05
# Fraction of all training steps (batches per epoch * epochs) used as warmup steps.
warmup_steps_proportion = 0.10

In [12]:
# Output directory for this run: fixed prefix, checkpoint name, start timestamp.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/training_namesdataset_continue_training-' + model_name + '-' + run_stamp

In [13]:
# Load the pretrained checkpoint named by `model_name` as a SentenceTransformer.
model = SentenceTransformer(model_name)

2021-11-17 19:48:51 - Load pretrained SentenceTransformer: T-Systems-onsite/cross-en-de-roberta-sentence-transformer
2021-11-17 19:48:56 - No sentence-transformers model found with name /Users/faridamustafazade/.cache/torch/sentence_transformers/T-Systems-onsite_cross-en-de-roberta-sentence-transformer. Creating a new one with MEAN pooling.
2021-11-17 19:49:16 - Use pytorch device: cpu


In [14]:
# Read the labelled name pairs and sort them into train/dev/test sample lists.
train_samples = []
dev_samples = []
test_samples = []

# The file was written by pandas' to_csv, so it is read here as UTF-8 with the
# csv module's default quoting: a name containing a comma is quoted in the
# file and must not be split into extra columns. The context manager makes
# sure the handle is closed once all rows are consumed.
with open(names_dataset_path, "r", encoding="utf8", newline="") as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        # The score is used as-is (no rescaling); assumes it is already in
        # the range the loss expects. TODO confirm.
        score = float(row['score'])
        inp_example = InputExample(texts=[row['name1'], row['name2']], label=score)

        # Any row not marked 'dev' or 'test' is treated as training data.
        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

In [15]:
# Batches of training pairs, shuffled by the DataLoader.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# Loss comparing the cosine similarity of the two name embeddings with the gold score.
train_loss = losses.CosineSimilarityLoss(model=model)

In [16]:
# Development set: measure correlation between cosine score and gold labels.
# The evaluator is built from the dev pairs and later passed to model.fit.
logging.info("Read names dataset dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='names-dev')

2021-11-17 19:50:41 - Read names dataset dev dataset


In [17]:
# Configure the training: the number of warmup steps handed to model.fit is
# `warmup_steps_proportion` of all training steps (batches per epoch * epochs),
# rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warmup_steps_proportion)
logging.info("Warmup-steps: {}".format(warmup_steps))

2021-11-17 19:51:35 - Warmup-steps: 674


In [21]:
# Inspect the loss function wrapped by the CosineSimilarityLoss instance.
train_loss.loss_fct

CosineSimilarityLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (loss_fct): MSELoss()
  (cos_score_transformation): Identity()
)

In [19]:
# Show the class documentation (constructor signature and the loss it minimizes).
help(losses.CosineSimilarityLoss)

Help on class CosineSimilarityLoss in module sentence_transformers.losses.CosineSimilarityLoss:

class CosineSimilarityLoss(torch.nn.modules.module.Module)
 |  CosineSimilarityLoss(model: sentence_transformers.SentenceTransformer.SentenceTransformer, loss_fct=MSELoss(), cos_score_transformation=Identity())
 |  
 |  CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label.
 |  
 |  It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two.
 |  By default, it minimizes the following loss: ||input_label - cos_score_transformation(cosine_sim(u,v))||_2.
 |  
 |  :param model: SentenceTranformer model
 |  :param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE:  ||input_label - cosine_sim(u,v)||_2
 |  :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarit

In [24]:
def fit_classifier_with_hyperparameter_search(
        X, y, basemod, cv, param_grid, scoring='f1_macro', verbose=True):
    """
    Fit a classifier with hyperparameters set via randomized
    cross-validated search.

    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.

    y : list
        The list of labels for rows in `X`.

    basemod : an sklearn model class instance
        This is the basic model-type we'll be optimizing.

    cv : int or an sklearn Splitter
        Number of cross-validation folds, or the object used to define
        the splits. An int is turned into a `StratifiedShuffleSplit`
        with that many splits and a 20% test size. Where there is a
        predefined train/dev split one wants to use, one can feed in a
        `PredefinedSplit` instance to use that split during
        cross-validation.

    param_grid : dict
        A dict whose keys name appropriate parameters for `basemod` and
        whose values are lists of values to sample from.

    scoring : value to optimize for (default: f1_macro)
        Other options include 'accuracy' and 'f1_micro'. See
        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

    verbose : bool
        Whether to print some summary information to standard output.

    Prints
    ------
    To standard output (if `verbose=True`)
        The best parameters found.
        The best cross-validated `scoring` value obtained.

    Returns
    -------
    An instance of the same class as `basemod`.
        A trained model instance, the best model found.

    """
    # Imported here so the function does not depend on a later notebook cell
    # having been run first.
    from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit

    if isinstance(cv, int):
        cv = StratifiedShuffleSplit(n_splits=cv, test_size=0.20)
    # Find the best model among the sampled settings of param_grid:
    crossvalidator = RandomizedSearchCV(basemod, param_grid, cv=cv, scoring=scoring)
    crossvalidator.fit(X, y)
    # Report some information:
    if verbose:
        print("Best params: {}".format(crossvalidator.best_params_))
        print("Best score: {0:0.03f}".format(crossvalidator.best_score_))
    # Return the best model found:
    return crossvalidator.best_estimator_

In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def fit_hf_bert_classifier_with_hyperparameter_search(X, y):
    """
    Tune an `HfBertClassifier` on (X, y) and return the best model found.

    The search covers gradient accumulation steps, learning rate (`eta`) and
    hidden dimension, and is delegated to
    `utils.fit_classifier_with_hyperparameter_search` with `cv=3`.

    NOTE(review): `HfBertClassifier` and `utils` are not imported in the
    visible cells; confirm where they come from before running this.
    """
    search_space = {
        'gradient_accumulation_steps': [1, 4, 8],
        'eta': [0.00005, 0.0001, 0.001],
        'hidden_dim': [100, 200, 300]}

    base_model = HfBertClassifier(
        weights_name='bert-base-cased',
        batch_size=8,         # Small batches to avoid memory overload.
        max_iter=1,           # Search on a single iteration for efficiency.
        n_iter_no_change=5,   # Early-stopping params are for the
        early_stopping=True)  # final evaluation.

    return utils.fit_classifier_with_hyperparameter_search(
        X, y, base_model, cv=3, param_grid=search_space)

In [26]:
# Inspect the module at index 0 of the SentenceTransformer pipeline.
model[0]

Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 

In [27]:
# Inspect the module at index 1 of the SentenceTransformer pipeline.
model[1]

Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})

In [None]:
"""
Optuna example that optimizes multi-layer perceptrons using PyTorch.

In this example, we optimize the validation accuracy of fashion-article image classification
using PyTorch and FashionMNIST. We optimize the neural network architecture as well as the
optimizer configuration. As it is too time consuming to use the whole FashionMNIST dataset,
we here use a small subset of it.

"""

import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms


# Device that the model and every batch are moved to.
DEVICE = torch.device("cpu")
# Batch size of both data loaders; also used to convert batch index to example count.
BATCHSIZE = 128
# Output width of the final linear layer.
CLASSES = 10
# Root directory handed to the FashionMNIST dataset constructor.
DIR = os.getcwd()
# Training epochs per trial; accuracy is reported to the trial after each one.
EPOCHS = 10
# Not referenced by the code visible in this cell.
LOG_INTERVAL = 10
# Caps on how many examples each epoch trains on / validates on.
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10


def define_model(trial):
    """
    Build a multi-layer perceptron whose shape is sampled from `trial`.

    The trial chooses the number of hidden layers (1-3) and, per layer, the
    number of units (4-128) and the dropout probability (0.2-0.5). The input
    is a flattened 28x28 image and the output is a log-softmax over `CLASSES`.
    """
    n_layers = trial.suggest_int("n_layers", 1, 3)

    modules = []
    width_in = 28 * 28
    for layer_idx in range(n_layers):
        width_out = trial.suggest_int("n_units_l{}".format(layer_idx), 4, 128)
        drop_p = trial.suggest_float("dropout_l{}".format(layer_idx), 0.2, 0.5)
        modules += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout(drop_p)]
        # The next layer consumes what this one produced.
        width_in = width_out

    # Classification head.
    modules += [nn.Linear(width_in, CLASSES), nn.LogSoftmax(dim=1)]
    return nn.Sequential(*modules)


def get_mnist():
    """
    Return shuffled FashionMNIST (train_loader, valid_loader).

    Both datasets live under `DIR` and yield tensors; only the training
    dataset is constructed with `download=True`. Both loaders use `BATCHSIZE`.
    """
    train_set = datasets.FashionMNIST(
        DIR, train=True, download=True, transform=transforms.ToTensor())
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=BATCHSIZE, shuffle=True)

    valid_set = datasets.FashionMNIST(
        DIR, train=False, transform=transforms.ToTensor())
    valid_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=BATCHSIZE, shuffle=True)

    return train_loader, valid_loader


def objective(trial):
    """
    Optuna objective for the FashionMNIST example.

    Builds the model and optimizer sampled for `trial`, trains for `EPOCHS`
    epochs on a capped number of examples, and reports validation accuracy to
    the trial after every epoch so the trial can be pruned.

    Returns the validation accuracy of the last epoch.
    Raises optuna.exceptions.TrialPruned when the trial asks to be pruned.
    """
    net = define_model(trial).to(DEVICE)

    # Sample the optimizer class and its learning rate (log scale).
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer_cls = getattr(optim, optimizer_name)
    optimizer = optimizer_cls(net.parameters(), lr=lr)

    train_loader, valid_loader = get_mnist()
    # Denominator for the accuracy: validation examples actually visited.
    n_valid = min(len(valid_loader.dataset), N_VALID_EXAMPLES)

    for epoch in range(EPOCHS):
        # --- training pass ---
        net.train()
        for step, (images, labels) in enumerate(train_loader):
            # Stop once the per-epoch training budget is used up.
            if step * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            images = images.view(images.size(0), -1).to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            loss = F.nll_loss(net(images), labels)
            loss.backward()
            optimizer.step()

        # --- validation pass ---
        net.eval()
        n_correct = 0
        with torch.no_grad():
            for step, (images, labels) in enumerate(valid_loader):
                # Stop once the validation budget is used up.
                if step * BATCHSIZE >= N_VALID_EXAMPLES:
                    break

                images = images.view(images.size(0), -1).to(DEVICE)
                labels = labels.to(DEVICE)
                # Predicted class = index of the largest log-probability.
                predicted = net(images).argmax(dim=1, keepdim=True)
                n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

        accuracy = n_correct / n_valid

        # Let the study decide whether this trial is worth continuing.
        trial.report(accuracy, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy


if __name__ == "__main__":
    # Maximize the value returned by `objective`; stop after 100 trials or
    # 600 seconds, as given by the n_trials and timeout arguments.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    # Split the finished trials by final state for the summary below.
    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    # Report the best trial's objective value and sampled hyperparameters.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [29]:
# Show the output directory computed for this run.
model_save_path

'output/training_namesdataset_continue_training-T-Systems-onsite/cross-en-de-roberta-sentence-transformer-2021-11-17_19-48-40'

In [31]:
# Show the checkpoint name currently configured.
model_name

'T-Systems-onsite/cross-en-de-roberta-sentence-transformer'

In [41]:
def objective(trial):
    """
    Optuna objective: fine-tune the sentence-transformer on the name pairs
    with the hyperparameters sampled for `trial` and score it on the dev set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the batch size ("Batch"), the number of epochs ("Epochs")
        and the warmup proportion ("warm") to try.

    Returns
    -------
    float
        The best dev-set evaluator score reached during this trial's
        training. Each trial returns its own score, which is what the study
        compares.

    Notes
    -----
    Reads the notebook globals `train_samples` and `evaluator`. A fresh model
    is loaded for every trial so that trials do not build on each other.
    """
    # Local imports keep the function usable regardless of which import
    # cells have been run.
    import numpy as np
    import torch

    # Same seed for every trial, so trials differ only in their hyperparameters.
    SEED = 42
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    model_name = 'T-Systems-onsite/cross-en-de-roberta-sentence-transformer'

    train_batch_size = trial.suggest_categorical("Batch", [8, 16, 32, 64, 128])
    num_epochs = trial.suggest_categorical("Epochs", [1, 2, 3])
    warm = trial.suggest_float("warm", 0.0, 0.5)
    # No "margin" parameter is sampled: CosineSimilarityLoss takes none, and
    # an unused parameter only adds a meaningless dimension to the search.

    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warm)

    model = SentenceTransformer(model_name)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # fit() invokes the callback with (score, epoch, steps) after every
    # evaluation; collect the scores so the best one can be returned.
    epoch_scores = []

    def record_score(score, epoch, steps):
        epoch_scores.append(score)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              callback=record_score)

    return max(epoch_scores)

In [37]:
import torch

import optuna

In [None]:
# Maximize the value returned by `objective` over up to 100 trials, with no time limit.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=None)

[32m[I 2021-11-17 21:18:47,561][0m A new study created in memory with name: no-name-561ba4af-27d6-4f6d-87ac-4c01a6d15549[0m


2021-11-17 21:18:47 - Load pretrained SentenceTransformer: T-Systems-onsite/cross-en-de-roberta-sentence-transformer
2021-11-17 21:18:51 - No sentence-transformers model found with name /Users/faridamustafazade/.cache/torch/sentence_transformers/T-Systems-onsite_cross-en-de-roberta-sentence-transformer. Creating a new one with MEAN pooling.
2021-11-17 21:19:03 - Use pytorch device: cpu


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/211 [00:00<?, ?it/s]

In [None]:
# Train the model with the optimizer settings from the configuration cell.
# Without optimizer_params / weight_decay, fit() uses its own defaults and
# lr, eps and weight_decay would have no effect on training.
# The dev evaluator also runs every 1000 training steps; results and the
# model are written under model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          optimizer_params={'lr': lr, 'eps': eps},
          weight_decay=weight_decay,
          output_path=model_save_path)