In [1]:
#check GPU

import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("There are %d GPU(s) available." % torch.cuda.device_count())
    print("We will use the GPU: ", torch.cuda.get_device_name(0))
    
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU:  NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
!pip install wget



In [3]:
#downloading dataset
import wget
import os

print('Downloading dataset...')

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

#download the file (if we haven't already)
if not os.path.exists('../../../datasets/cola_public_1.1.zip'):
    wget.download(url, '../../../datasets/cola_public_1.1.zip')
print("Dataset sucsessfully downloaded")

Downloading dataset...
Dataset sucsessfully downloaded


In [4]:
#unzip the dataset
'''if not os.path.exists('../../../datasets/cola_public'):
    !unzip cola_public_1.1.zip''' #gonna unzip manually

"if not os.path.exists('../../../datasets/cola_public'):\n    !unzip cola_public_1.1.zip"

In [5]:
#loading all into the pandas data frame aka. Parsing

import pandas as pd

df = pd.read_csv('../../../datasets/cola_public/raw/in_domain_train.tsv', delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes','sentence'])

print('Number of training sentences: {:,}\n'.format(df.shape[0]))
#display random 10 sample
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
4172,ks08,1,,One of the people was dying of thirst.
452,bc01,1,,Who is reading a book that criticizes who?
7212,sks13,1,,What John became was deadly afraid of flying.
4674,ks08,1,,They talked about the scandal for days.
7637,sks13,1,,Susan hopes that she will sleep.
3959,ks08,1,,Tom placed it under the table.
3636,ks08,1,,John asked me to put the clothes in the cupboa...
4460,ks08,1,,Did John find the solution?
3050,l-93,1,,Susan whispered.
6629,m_02,1,,Flora cooks.


In [6]:
df.loc[df.label == 0].sample(5)[['sentence','label']]

Unnamed: 0,sentence,label
7983,What Medea wondered if was the potion was ready,0
4147,He are the only person that I can rely on.,0
925,Truman visited yesterday you.,0
7510,John hurt John with John's umbrella when John ...,0
2623,The scratches removed from the tabletop.,0


In [7]:
sentences = df.sentence.values
lables = df.label.values

In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
print(' Original: ', sentences[0])

print('Tokenized:', tokenizer.tokenize(sentences[0]))

print('Token IDs:', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))


#we can step by step do it by ourselves however we can combine all these steps into 1 action by using tokenize.encode

 Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized: ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
Token IDs: [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


required formatting
1) special tokens at the beginning and end
2) fill and cut the sentence up to the same length each
3) differentiate real tokens from padding-tokens using attention mask

In [12]:
#determine the max length
max_len=0

for sent in sentences:
    input_ids = tokenizer.encode(sent, add_special_tokens=True) #+ '[SEP]' and '[CLS]'
    
    max_len = max(max_len, len(input_ids))

print ('Max seq length: {}'.format(max_len))

Max seq length: 47
