In [2]:
import os
from datetime import datetime
import random
import math
import numpy as np
import pandas as pd
import pprint
import gzip
import csv
import logging
from IPython.display import display
from google.colab import drive
# Mount Google Drive at /content/drive; the SNLI paths defined later in this
# notebook are all located under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
import zipfile

# Location of the SNLI archive on the mounted Drive and of its extracted contents.
SNLI_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/SNLI/snli_1.0.zip'

SNLI_DATASET_EXTRACT_DIR = '/content/drive/MyDrive/prediction-with-assortment/SNLI'

SNLI_DATASET_DIR = os.path.join(SNLI_DATASET_EXTRACT_DIR, "snli_1.0")

SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
# Fixed: this previously pointed at "snli_1.0_dev.txt", so the existence check
# below never looked at the test split at all.
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# Skip extraction only when every split file is already present. A file that
# exists implies its parent directories exist, so no separate isdir checks
# are needed.
snli_split_files = (SNLI_TRAIN_FILE, SNLI_DEV_FILE, SNLI_TEST_FILE)
if all(os.path.isfile(split_path) for split_path in snli_split_files):
    print("Dataset already extracted")
else:
    # Unpack the whole archive next to the zip file.
    with zipfile.ZipFile(SNLI_DATASET_PATH, 'r') as zip_ref:
        zip_ref.extractall(SNLI_DATASET_EXTRACT_DIR)

# Cell output: show what is in the extracted dataset directory.
os.listdir(SNLI_DATASET_DIR)

Dataset already extracted


['.DS_Store',
 'Icon\r',
 'README.txt',
 'snli_1.0_dev.jsonl',
 'snli_1.0_dev.txt',
 'snli_1.0_test.jsonl',
 'snli_1.0_test.txt',
 'snli_1.0_train.jsonl',
 'snli_1.0_train.txt']

In [47]:
# Paths to the three tab-separated SNLI split files.
SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
# Fixed: this previously pointed at "snli_1.0_dev.txt", which made df_test a
# second copy of the dev split.
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# Load each split into its own DataFrame.
df_train = pd.read_csv(SNLI_TRAIN_FILE, sep="\t")
df_dev = pd.read_csv(SNLI_DEV_FILE, sep="\t")
df_test = pd.read_csv(SNLI_TEST_FILE, sep="\t")
# The SNLI dataset contains several columns, but for many tasks, only gold_label, sentence1, and sentence2 are needed.

# Cell output: preview the first five rows of the columns used later on.
df_train[['sentence1', 'sentence2', 'gold_label']][:5]

Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,Children smiling and waving at camera,There are children present,entailment


In [48]:
# Count the rows of each split first, then report them one line per split.
n_train_rows, n_dev_rows, n_test_rows = len(df_train), len(df_dev), len(df_test)

print(f"Number of records in the training dataset df_train: {n_train_rows}")
print(f"Number of records in the dev dataset df_dev: {n_dev_rows}")
print(f"Number of records in the test dataset df_test: {n_test_rows}")

Number of records in the training dataset df_train: 550152
Number of records in the dev dataset df_dev: 10000
Number of records in the test dataset df_test: 10000


## Select 200K at random from the training dataset and use those as a new training dataset

In [49]:
TRAIN_SAMPLE_SIZE = 200000
# Fixed seed so the sampled subset (and therefore df_rest) is the same on every
# Restart & Run All; without it each run trains on a different 200K rows.
TRAIN_SAMPLE_SEED = 42

# Draw the new training subset without replacement (pandas default).
df_train_new = df_train.sample(n=TRAIN_SAMPLE_SIZE, random_state=TRAIN_SAMPLE_SEED)
# Everything that was not sampled, selected by index label.
df_rest = df_train.loc[~df_train.index.isin(df_train_new.index)]

# Sanity check: the sample and the remainder partition the original frame.
assert len(df_train_new) + len(df_rest) == len(df_train), "sample/rest do not partition df_train"

# Cell output: preview the first five sampled rows.
df_train_new[['sentence1', 'sentence2', 'gold_label']][:5]

Unnamed: 0,sentence1,sentence2,gold_label
468395,Aaron Rodgers and another Green Bay Packers te...,Aaron Rodgers and a teammate celebrate in the ...,entailment
53439,A man in an orange jacket playfully tosses a s...,A man and a small girl eat pizza together,contradiction
247263,A boy on a chained swing looks to the right an...,A boy smiles at a sunny park while swinging.,neutral
524024,A child in a striped shirt pets a white goat t...,The child is stabbing the goat.,contradiction
198419,A guy catches a wave on his surfboard,watched by a group,neutral


In [50]:
# Collect the distinct gold_label values of the sampled training set, then report them.
distinct_gold_labels = df_train_new['gold_label'].unique()
print(f"Unique values of the gold_label column: {distinct_gold_labels}")

Unique values of the gold_label column: ['entailment' 'contradiction' 'neutral' '-']


In [57]:
# Integer ids for the three NLI classes. Rows whose gold_label is anything else
# (the unique-value check on the sampled set also reports '-') are dropped.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
valid_labels = list(label2int)  # single source of truth for the accepted labels

# .copy() gives each filtered frame its own data, so adding a column below is a
# plain assignment on an independent frame rather than a write into a slice of
# another DataFrame (the pattern that raises SettingWithCopyWarning).
df_train_new = df_train_new[df_train_new['gold_label'].isin(valid_labels)].copy()
df_rest = df_rest[df_rest['gold_label'].isin(valid_labels)].copy()

# Every remaining label is a key of label2int, so map() produces no missing values.
df_train_new['gold_label_int'] = df_train_new['gold_label'].map(label2int)
df_rest['gold_label_int'] = df_rest['gold_label'].map(label2int)

# Keep only the columns needed to build training examples.
df_train_new_label = df_train_new[['sentence1', 'sentence2', 'gold_label_int']]

# Cell output: preview the first five encoded rows.
df_train_new_label[:5]

Unnamed: 0,sentence1,sentence2,gold_label_int
468395,Aaron Rodgers and another Green Bay Packers te...,Aaron Rodgers and a teammate celebrate in the ...,1
53439,A man in an orange jacket playfully tosses a s...,A man and a small girl eat pizza together,0
247263,A boy on a chained swing looks to the right an...,A boy smiles at a sunny park while swinging.,2
524024,A child in a striped shirt pets a white goat t...,The child is stabbing the goat.,0
198419,A guy catches a wave on his surfboard,watched by a group,2


In [67]:
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer,LoggingHandler, losses, util, datasets, models
#from sentence_transformers.datasets import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
#from transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

class DataFrameDataset(Dataset):
    """Dataset that exposes the rows of a DataFrame as ``InputExample`` objects.

    Each item is built from one row: the values of ``text_columns`` become the
    example's ``texts`` list and the value of ``label_column`` becomes its
    ``label``.

    Args:
        dataframe: Source DataFrame; rows are addressed by position.
        text_columns: Names of the columns holding the texts of one example,
            in the order they should appear in ``texts``. Defaults to the
            SNLI sentence pair columns used in this notebook.
        label_column: Name of the column holding the label. Defaults to the
            integer-encoded SNLI label column used in this notebook.
    """

    def __init__(self, dataframe, text_columns=('sentence1', 'sentence2'),
                 label_column='gold_label_int'):
        self.dataframe = dataframe
        # Stored as a tuple so later mutation of the caller's list cannot
        # change which columns are read.
        self.text_columns = tuple(text_columns)
        self.label_column = label_column

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build the ``InputExample`` for the row at position ``idx``.

        The example's ``guid`` is the string form of ``idx``.
        """
        row = self.dataframe.iloc[idx]
        return InputExample(
            guid=str(idx),
            texts=[row[column] for column in self.text_columns],
            label=row[self.label_column],
        )


#in1 = InputExample(texts=['My first sentence', 'My second sentence'], label=0.8)
#in2 = InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)
#train_examples = [in1, in2]

#dataloader = DataLoader(train_examples, batch_size=2, shuffle=True)

# Training hyperparameters, defined before first use so the DataLoader and the
# model below read them instead of hard-coded literals.
max_seq_length = 128

num_epochs = 1

train_batch_size = 16

# Create Dataset and DataLoader
dataset = DataFrameDataset(df_train_new_label)
# Fixed: the batch size was hard-coded to 2 (presumably left over from the
# two-example demo in the commented-out code), which left train_batch_size unused.
dataloader = DataLoader(dataset, batch_size=train_batch_size, shuffle=True)

# Build the sentence-embedding model: a RoBERTa transformer followed by mean
# pooling over its word embeddings.
model_name = 'roberta-base'
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]