<a href="https://colab.research.google.com/github/dimitarpg13/transformer_examples/blob/main/notebooks/bert/Response_Prediction_with_SBERT_and_SNLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from datetime import datetime
import random
import math
import numpy as np
import pandas as pd
import pprint
import gzip
import csv
import logging
from IPython.display import display
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive; the SNLI
# archive and every cached file referenced later are addressed through paths
# under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile

# Location of the zipped SNLI 1.0 corpus on the mounted drive, and the
# directory the archive is unpacked into.
SNLI_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/SNLI/snli_1.0.zip'

SNLI_DATASET_EXTRACT_DIR = '/content/drive/MyDrive/prediction-with-assortment/SNLI'

# Directory in which the tab-separated split files are expected after extraction.
SNLI_DATASET_DIR = os.path.join(SNLI_DATASET_EXTRACT_DIR, "snli_1.0")

SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
# The test split has its own file; it must not alias the dev split, otherwise
# the guard below never verifies that the test file was actually extracted.
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# os.path.isfile() is False for anything inside a missing directory, so checking
# the three split files also covers the existence of the directories themselves.
snli_split_files = (SNLI_TRAIN_FILE, SNLI_DEV_FILE, SNLI_TEST_FILE)

if all(os.path.isfile(split_path) for split_path in snli_split_files):
    print("Dataset already extracted")
else:
    # Unpack the whole archive; the context manager closes the zip handle even
    # if extraction fails part-way.
    with zipfile.ZipFile(SNLI_DATASET_PATH, 'r') as zip_ref:
        zip_ref.extractall(SNLI_DATASET_EXTRACT_DIR)

# Show the extracted directory contents as the cell output.
os.listdir(SNLI_DATASET_DIR)

Dataset already extracted


['Icon\r',
 '.DS_Store',
 'README.txt',
 'snli_1.0_dev.txt',
 'snli_1.0_dev.jsonl',
 'snli_1.0_test.jsonl',
 'snli_1.0_test.txt',
 'snli_1.0_train.jsonl',
 'snli_1.0_train.txt',
 'snli_1.0_train.parquet',
 'snli_1.0_dev.parquet',
 'snli_1.0_test.parquet']

In [None]:
# Tab-separated source files for the three SNLI splits.
SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
# The test split must be read from the test file. Reading the dev file here
# would make df_test a copy of df_dev and write that copy into the test cache.
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# Parquet caches: parsing the raw text once and reloading from Parquet keeps
# re-runs of the notebook fast.
PARQUET_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.parquet")
PARQUET_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.parquet")
PARQUET_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.parquet")

# NOTE(review): if snli_1.0_test.parquet was produced while SNLI_TEST_FILE still
# pointed at the dev split, the cache holds dev data. Delete it once so that it
# is rebuilt from the real test file.
parquet_cache_files = (PARQUET_TRAIN_FILE, PARQUET_DEV_FILE, PARQUET_TEST_FILE)

if all(os.path.isfile(cache_path) for cache_path in parquet_cache_files):
    print("Parquet files already created.")
    df_train = pd.read_parquet(PARQUET_TRAIN_FILE)
    df_dev = pd.read_parquet(PARQUET_DEV_FILE)
    df_test = pd.read_parquet(PARQUET_TEST_FILE)
else:
    print("Loading Dataframes from CSV...")
    df_train = pd.read_csv(SNLI_TRAIN_FILE, sep="\t")
    df_dev = pd.read_csv(SNLI_DEV_FILE, sep="\t")
    df_test = pd.read_csv(SNLI_TEST_FILE, sep="\t")
    print("Saving Parquet files...")
    df_train.to_parquet(PARQUET_TRAIN_FILE)
    df_dev.to_parquet(PARQUET_DEV_FILE)
    df_test.to_parquet(PARQUET_TEST_FILE)

# The SNLI dataset contains several columns, but for many tasks only
# gold_label, sentence1 and sentence2 are needed.
df_train[['sentence1', 'sentence2', 'gold_label']].head()

Parquet files already created.


Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,Children smiling and waving at camera,There are children present,entailment


In [None]:
# Report how many records each split contains, one line per split.
split_size_report = [
    f"Number of records in the training dataset df_train: {len(df_train)}",
    f"Number of records in the dev dataset df_dev: {len(df_dev)}",
    f"Number of records in the test dataset df_test: {len(df_test)}",
]
print(*split_size_report, sep="\n")

Number of records in the training dataset df_train: 550152
Number of records in the dev dataset df_dev: 10000
Number of records in the test dataset df_test: 10000


## Randomly sample 200K records from the training dataset and use them as the new training dataset

In [None]:
# Cache files for the sampled training subset and for the remaining records.
PARQUET_TRAIN_NEW_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train_new.parquet")
PARQUET_TRAIN_REST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train_rest.parquet")
TRAIN_SAMPLE_SIZE = 200000
# Fixed seed: without it every run that misses the cache would draw a different
# subset, so results could not be reproduced from a fresh environment.
TRAIN_SAMPLE_SEED = 42

if (os.path.isfile(PARQUET_TRAIN_NEW_FILE) and os.path.isfile(PARQUET_TRAIN_REST_FILE)):
    print("Parquet files already created.")
    df_train_new = pd.read_parquet(PARQUET_TRAIN_NEW_FILE)
    df_rest = pd.read_parquet(PARQUET_TRAIN_REST_FILE)
    print(f"Number of records in the new training dataset df_train_new: {len(df_train_new)}")
    print(f"Number of records in the rest training dataset df_rest: {len(df_rest)}")
    print(f"Unique values of the gold_label column in df_train_new: {df_train_new['gold_label'].unique()}")
    print(f"Unique values of the gold_label column in df_rest: {df_rest['gold_label'].unique()}")
else:
    print("Sampling 200K records from the training dataset...")
    df_train_new = df_train.sample(n=TRAIN_SAMPLE_SIZE, random_state=TRAIN_SAMPLE_SEED)
    # Everything that was not drawn into the sample, selected by index label.
    df_rest = df_train.loc[~df_train.index.isin(df_train_new.index)]
    # The sample and the remainder must partition the original training set.
    assert len(df_train_new) + len(df_rest) == len(df_train), "sample/rest split lost or duplicated rows"
    print(f"Number of records in the new training dataset df_train_new: {len(df_train_new)}")
    print(f"Number of records in the rest training dataset df_rest: {len(df_rest)}")
    print("Saving Parquet files...")
    df_train_new.to_parquet(PARQUET_TRAIN_NEW_FILE)
    df_rest.to_parquet(PARQUET_TRAIN_REST_FILE)

# Preview the first rows of the sampled subset as the cell output.
df_train_new[['sentence1', 'sentence2', 'gold_label']].head()



Sampling 200K records from the training dataset...
Number of records in the new training dataset df_train_new: 200000
Number of records in the rest training dataset df_rest: 350152
Saving Parquet files...


Unnamed: 0,sentence1,sentence2,gold_label
226456,A person sits on the front deck of a ship and ...,A person sits on the front deck of a ship and ...,entailment
508208,A young girl peeking out from behind a tree in...,A child reads a book in her room,contradiction
534199,Three men are sitting on a bench in front of a...,the men are on a boat crashing through the wav...,contradiction
71207,A girl in a green shirt with long dark hair wi...,The girl is wearing a green shirt,entailment
531368,"A newlywed couple, still wearing their wedding...",Nobody near the car.,contradiction


In [None]:
# Inspect which distinct gold_label values occur in the sampled training set.
train_new_label_values = df_train_new['gold_label'].unique()
print(f"Unique values of the gold_label column: {train_new_label_values}")

Unique values of the gold_label column: ['entailment' 'contradiction' 'neutral' '-']


In [None]:
# Integer class ids for the three labels that are kept.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}


def _encode_gold_labels(frame, mapping):
    """Keep rows whose gold_label is a key of `mapping` and add `gold_label_int`.

    Args:
        frame: DataFrame with a 'gold_label' column.
        mapping: dict from label string to integer class id.

    Returns:
        A new DataFrame containing only the rows with a known label, plus a
        'gold_label_int' column holding the mapped id. `frame` is not modified.
    """
    known = frame[frame['gold_label'].isin(list(mapping))]
    # .assign builds a new frame instead of writing into a filtered slice,
    # which is what raised SettingWithCopyWarning with `.loc[:, col] = ...`.
    return known.assign(gold_label_int=known['gold_label'].map(mapping).astype('int64'))


# Rows labelled with anything else (e.g. '-') are dropped from both frames.
df_train_new = _encode_gold_labels(df_train_new, label2int)
df_rest = _encode_gold_labels(df_rest, label2int)

# Narrow view with only the columns needed for training.
df_train_new_label = df_train_new[['sentence1', 'sentence2', 'gold_label_int']]
df_train_new_label.head()

Unnamed: 0,sentence1,sentence2,gold_label_int
418178,A reddish-brown dog is laying at the front of ...,An animal is laying on a blanket.,1
92098,A man wearing a red t-shirt is snowboarding ev...,A man wearing a wetsuit is in the ocean.,0
463990,An asian man walking down the street in the rain.,An asian man walks down the street with an umb...,2
424211,A woman giving out free hugs.,A woman is making people pay for kisses.,0
26724,A young girl sitting in swimming gear,a young girl is riding a horse,0


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer,LoggingHandler, losses, util, datasets, models
#from sentence_transformers.datasets import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
#from transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

class DataFrameDataset(Dataset):
    """Map-style dataset that serves sentence pairs and labels from a DataFrame.

    Each item is a ``(texts, label)`` tuple, where ``texts`` is a list with the
    values of ``text_columns`` for the requested row (in the given order) and
    ``label`` is the value of ``label_column`` for that row.

    Args:
        dataframe: DataFrame holding one example per row.
        text_columns: Names of the columns that provide the texts. Defaults to
            ``('sentence1', 'sentence2')``.
        label_column: Name of the column that provides the label. Defaults to
            ``'gold_label_int'``.

    Raises:
        KeyError: If any of the requested columns is missing from ``dataframe``.
    """

    def __init__(self, dataframe, text_columns=('sentence1', 'sentence2'),
                 label_column='gold_label_int'):
        self.dataframe = dataframe
        self.text_columns = tuple(text_columns)
        self.label_column = label_column

        # Fail at construction time rather than on the first batch.
        missing = [
            column
            for column in (*self.text_columns, self.label_column)
            if column not in dataframe.columns
        ]
        if missing:
            raise KeyError(f"Columns not found in dataframe: {missing}")

    def __len__(self):
        """Return the number of rows (examples) in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(texts, label)`` for the row at positional index ``idx``."""
        # Positional lookup: the DataFrame index may be non-contiguous after
        # sampling/filtering, so label-based access would not work here.
        row = self.dataframe.iloc[idx]
        texts = [row[column] for column in self.text_columns]
        return texts, row[self.label_column]


#in1 = InputExample(texts=['My first sentence', 'My second sentence'], label=0.8)
#in2 = InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)
#train_examples = [in1, in2]

#dataloader = DataLoader(train_examples, batch_size=2, shuffle=True)

# Create Dataset and DataLoader
# --- Hyperparameters -------------------------------------------------------
# Defined before the DataLoader so the batch size below comes from
# train_batch_size instead of a hard-coded literal.
max_seq_length = 128

num_epochs = 1

train_batch_size = 16

model_name = 'roberta-base'

# --- Data ------------------------------------------------------------------
# Wrap the (sentence1, sentence2, gold_label_int) frame and batch it.
dataset = DataFrameDataset(df_train_new_label)
dataloader = DataLoader(dataset, batch_size=train_batch_size, shuffle=True)

# Iterate through the DataLoader
#for batch in dataloader:
#    print(batch)

# --- Model -----------------------------------------------------------------
# Transformer encoder followed by mean pooling, combined into one
# SentenceTransformer model.
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]