In [2]:
import os
from datetime import datetime
import random
import math
import numpy as np
import pandas as pd
import pprint
import gzip
import csv
import logging
from IPython.display import display
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import zipfile

# Location of the SNLI 1.0 archive on the mounted Drive and the directory it is extracted into.
SNLI_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/SNLI/snli_1.0.zip'
SNLI_DATASET_EXTRACT_DIR = '/content/drive/MyDrive/prediction-with-assortment/SNLI'

# The archive unpacks into a "snli_1.0" sub-directory of the extract dir.
SNLI_DATASET_DIR = os.path.join(SNLI_DATASET_EXTRACT_DIR, "snli_1.0")

# Tab-separated split files. The test path must point at the *test* split;
# it previously pointed at the dev file, so the test split was never checked.
SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# Extract only when one of the split files is missing. isfile() on the three
# files already implies that both directories exist, so no isdir() checks.
_required_split_files = (SNLI_TRAIN_FILE, SNLI_DEV_FILE, SNLI_TEST_FILE)
if all(os.path.isfile(path) for path in _required_split_files):
    print("Dataset already extracted")
else:
    # Open the zip file in read mode and extract everything into the extract dir.
    with zipfile.ZipFile(SNLI_DATASET_PATH, 'r') as zip_ref:
        zip_ref.extractall(SNLI_DATASET_EXTRACT_DIR)

# Show what ended up in the dataset directory.
os.listdir(SNLI_DATASET_DIR)

Dataset already extracted


['.DS_Store',
 'Icon\r',
 'README.txt',
 'snli_1.0_dev.jsonl',
 'snli_1.0_dev.txt',
 'snli_1.0_test.jsonl',
 'snli_1.0_test.txt',
 'snli_1.0_train.jsonl',
 'snli_1.0_train.txt']

In [6]:
# Paths of the tab-separated SNLI split files inside the extracted dataset directory.
SNLI_TRAIN_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_train.txt")
SNLI_DEV_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_dev.txt")
# Must be the test split: loading the dev file here would make df_test a
# duplicate of df_dev and any "test" evaluation would really run on dev data.
SNLI_TEST_FILE = os.path.join(SNLI_DATASET_DIR, "snli_1.0_test.txt")

# Load each split into its own DataFrame.
df_train = pd.read_csv(SNLI_TRAIN_FILE, sep="\t")
df_dev = pd.read_csv(SNLI_DEV_FILE, sep="\t")
df_test = pd.read_csv(SNLI_TEST_FILE, sep="\t")

# The SNLI dataset contains several columns, but for many tasks, only gold_label, sentence1, and sentence2 are needed.
df_train[['sentence1', 'sentence2', 'gold_label']].head()

Unnamed: 0,sentence1,sentence2,gold_label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment
3,Children smiling and waving at camera,They are smiling at their parents,neutral
4,Children smiling and waving at camera,There are children present,entailment


In [7]:
# Report how many rows each split holds, one line per DataFrame.
split_size_lines = [
    f"Number of records in the training dataset df_train: {len(df_train)}",
    f"Number of records in the dev dataset df_dev: {len(df_dev)}",
    f"Number of records in the test dataset df_test: {len(df_test)}",
]
print("\n".join(split_size_lines))

Number of records in the training dataset: 550152
Number of records in the dev dataset: 10000
Number of records in the test dataset: 10000


## Select 200K at random from the training dataset and use those as a new training dataset

In [13]:
# Draw a fixed-size random subsample of the training data. The seed is pinned
# so that df_train_new (and therefore df_rest) is identical on every
# Restart & Run All; without it each run selects different rows.
TRAIN_SAMPLE_SIZE = 200000
TRAIN_SAMPLE_SEED = 42
df_train_new = df_train.sample(n=TRAIN_SAMPLE_SIZE, random_state=TRAIN_SAMPLE_SEED)

# Everything that was not sampled: rows whose index label is absent from the sample.
df_rest = df_train.loc[~df_train.index.isin(df_train_new.index)]

df_train_new[['sentence1', 'sentence2', 'gold_label']].head()

Unnamed: 0,sentence1,sentence2,gold_label
207991,A man walking down the street.,The man is walking next to a main road.,neutral
6340,A man in a blue jacket has fallen asleep while...,The person is wide awake.,contradiction
83338,A subway station where numerous people are sta...,A woman is listening to music at the station.,neutral
480827,A priest performs religious ceremonies against...,The priest is surrounded by religious objects.,entailment
316264,A man in a blue shirt is bowling.,The shirt is purple.,contradiction


In [19]:
# List the distinct values present in the sampled gold_label column.
print(f"Unique values of the gold_label column: {df_train_new['gold_label'].unique()}")

Unique values of the gold_label column: ['neutral' 'contradiction' 'entailment' '-']


In [32]:
# Mapping from the three usable gold labels to integer class ids. Any other
# gold_label value is treated as unusable and the row is dropped below.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
_kept_columns = ['sentence1', 'sentence2', 'gold_label']

# Filter with a boolean row mask. DataFrame.where() does NOT drop rows: it keeps
# every row and replaces the non-matching ones with NaN, which then makes the
# label lookup fail on NaN. .loc[mask, cols] removes those rows instead.
# .copy() gives an independent frame so the column assignment below is safe.
df_train_new = df_train_new.loc[
    df_train_new['gold_label'].isin(list(label2int)), _kept_columns
].copy()
df_rest = df_rest.loc[
    df_rest['gold_label'].isin(list(label2int)), _kept_columns
].copy()

# Every remaining label is a key of label2int, so map() yields no missing values.
df_train_new['gold_label_int'] = df_train_new['gold_label'].map(label2int)
df_rest['gold_label_int'] = df_rest['gold_label'].map(label2int)

df_train_new.head()

Unnamed: 0,sentence1,sentence2,gold_label,gold_label_int
207991,A man walking down the street.,The man is walking next to a main road.,neutral,2
6340,A man in a blue jacket has fallen asleep while...,The person is wide awake.,contradiction,0
83338,A subway station where numerous people are sta...,A woman is listening to music at the station.,neutral,2
480827,A priest performs religious ceremonies against...,The priest is surrounded by religious objects.,entailment,1
316264,A man in a blue shirt is bowling.,The shirt is purple.,contradiction,0
