## 1. Import Prerequisites

Run the following cell to confirm that the GPU is detected.

In [16]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Only the canonical name of the first GPU counts as success; any other
# value stops the notebook here.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))


Found GPU at: /device:GPU:0


In order for torch to use the GPU, we need to identify and specify the GPU as the device.   
Later, in our training loop, we will load data onto the device. 


In [17]:
import torch

# Choose the device every tensor and the model will later be moved to:
# CUDA when PyTorch reports a usable GPU, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [18]:
!pip install transformers



## 2. Parse


In [19]:
# Locate the dataset export(s) in the working directory and pick one to load.
import glob
import os

version_files = glob.glob('./versions_*.csv')

# Stop early when no export is present; nothing downstream can run without it.
if not version_files:
  raise SystemError("Cant find any versions!!")


# glob returns matches in arbitrary order, so taking index 0 could select a
# different file from run to run. Pick the most recently modified file
# instead, using modification time as the proxy for "latest uploaded".
version_file = max(version_files, key=os.path.getmtime)
print(f"Using - {version_file}")


Using - ./versions_2.csv


In [52]:
import pandas as pd

# Read the selected CSV export into a DataFrame.
df = pd.read_csv(version_file)

# One row per sentence, so the row count is the sentence count.
print(f'Number of sentences: {len(df)}')

# Show a random sample of 10 rows as the cell output.
df.sample(n=10)

Number of sentences: 187531


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,article_id,article_source,url,title,abstract,hash,date_time,version,amount_of_words,label
50269,50269,50269,50270,1.9812418,Haaretz,https://www.haaretz.co.il/blogs/shukifriedman/...,"התורה כבסיס לגשר, לא כמצע לעימות בערינו","התורה, שאת קבלתה בהר סיני חוגגים הערב, מעבירה ...",06d21584b54fa2bb509d95059d8aa7cc1dff57f0bc6bef...,2021-05-18 07:30:01.872449,971,7,1
86430,86430,86430,86431,1.9812409,Haaretz,https://www.haaretz.co.il/wellbeing/health-blo...,את מי זה מעניין אם הרופא יהודי או ערבי?,"יהודים, מוסלמים ונוצרים ממשיכים להציל חיים בבי...",aeb2e49afca6e026a2a601f1806d55a3ac35bde7597250...,2021-05-21 03:57:02.565842,4889,9,1
33111,33111,33111,33112,1.8706066,Haaretz,https://www.haaretz.co.il/sport/opinions/.prem...,דחיית האולימפיאדה היא הוכחה למי שטרם הבין: אנח...,רק שני מאורעות גרמו לשיבוש המשחקים האולימפיים ...,3c3aac99635d51fc2a41b0c8656fd80bf6bf260a4e09b5...,2021-04-21 20:06:03.386298,1,11,1
95173,95173,95173,95174,1.9793247,Haaretz,https://www.haaretz.co.il/blogs/yasminelevi/BL...,יש מי שעושה במכנסיים ויש מי שמוכן לתפקיד,עבור ישראלים החרדים ממנהיג רעיל ומשולח כל רסן ...,2b9372c6e7910d29998d43807dcebc5cabf3cab0054ddf...,2021-05-21 20:58:02.136266,5865,8,1
51475,51475,51475,51476,1.9812233,Haaretz,https://www.haaretz.co.il/blogs/anumuseum/BLOG...,מחקלאי-לאומי לדתי-רוחני: הטרנספורמציה של שבועות,"בימים כתיקונם, הערב היתה לובשת ישראל חג ומגוון...",3892c946128aeaffb6c0e648d2f1a21b449ec72ed99bce...,2021-05-18 09:40:02.527090,1095,5,1
156091,156091,156091,156092,1.9881510,Haaretz,https://www.haaretz.co.il/wellbeing/health-blo...,כדי להתגבר על הפחדים עליכם להשלים עמם,"למרות המסרים שאנחנו מקבלים כל הזמן מהסביבה, הנ...",06e08185ef67d37b2e2dc2dfa63c0a2f8514d308ddccea...,2021-06-15 09:42:03.161347,4041,7,1
9018,9018,9018,9019,https://news.walla.co.il/item/3418448,Walla,https://news.walla.co.il/item/3418448,"השלג בדרך, וברשויות נערכים לסערה: ""מענה לכל תר...","לקראת המערכת החורפית הצפויה להתחיל היום, במשטר...",5edfaae81a41c477cb48f39ba054bc95faab301d39d6eb...,2021-02-16 15:15:52.699042,2,8,1
151464,151464,151464,151465,1.9881331,Haaretz,https://www.haaretz.co.il/blogs/barrydanino/BL...,"יומני היקר, חוזר מרעננה. ממתין עוד להבהרת הרב","שלוש רשימות דמיוניות מיומנו של תושב השומרון, ב...",fbfa19604d9c4173f9cdbed3078017d9ddb383ea719346...,2021-06-14 22:02:02.026999,3371,8,1
55428,55428,55428,55429,1.9812418,Haaretz,https://www.haaretz.co.il/blogs/shukifriedman/...,"התורה כבסיס לגשר, לא כמצע לעימות בערינו","התורה, שאת קבלתה בהר סיני חוגגים הערב, מעבירה ...",06d21584b54fa2bb509d95059d8aa7cc1dff57f0bc6bef...,2021-05-18 16:46:02.601055,1527,7,1
132917,132917,132917,132918,1.9881463,Haaretz,https://www.haaretz.co.il/family/einattalmon/B...,נושא הלינה המשותפת מעורר שאלות רבות. הנה התשוב...,"כיצד ליישם שינה בטוחה יחד עם התינוק, האם אימונ...",819c6aa49261ed78140dd6322bb5141da46d7ec101117c...,2021-06-13 01:00:01.864472,832,9,1


In [54]:
# Largest value found in the `amount_of_words` column.
MAX_WORDS = df["amount_of_words"].max()
print(f"Max words in sentence is - {MAX_WORDS}")


Max words in sentence is - 24


Let's extract the sentences and labels of our training set as numpy ndarrays.

In [55]:
# Pull the two columns used from here on out of the DataFrame as arrays:
# `title` supplies the sentences and `label` supplies their labels.
labels = df["label"].values
sentences = df["title"].values

# 3. Tokenization & Input Formatting


## 3.1. BERT Tokenizer

In [56]:
from transformers import AutoTokenizer, AutoModelForMaskedLM


# Announce the step, then fetch the tokenizer published with the
# "avichr/heBERT" checkpoint.
print("Loading BERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('avichr/heBERT')


Loading BERT tokenizer...


Let's apply the tokenizer to one sentence just to see the output.

In [58]:
import random

# Pick one sentence at a random position to illustrate the tokenizer output.
example_sentence = sentences[random.randrange(len(sentences))]

# Tokenize once and reuse the tokens for both the token view and the ID view.
example_tokens = tokenizer.tokenize(example_sentence)

# Raw text, its tokens, and the vocabulary ID of each token.
print(' Original: ', example_sentence)
print('Tokenized: ', example_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(example_tokens))


 Original:  גדעון סער: "לא נצטרף לנתניהו ולא נתמוך בו"
Tokenized:  ['גדעון', 'סער', ':', '"', 'לא', 'נצ', '##טרף', 'לנתניהו', 'ולא', 'נת', '##מוך', 'בו', '"']
Token IDs:  [10811, 16183, 30, 6, 1527, 2804, 4279, 26322, 1801, 1873, 2603, 1846, 6]


In [59]:
# Encode every sentence into a list of vocabulary IDs.
#
# For each sentence, `tokenizer.encode(..., add_special_tokens=True)`:
#   1. splits the text into tokens,
#   2. puts `[CLS]` in front and `[SEP]` at the end,
#   3. maps every token to its ID.
#
# The truncation (`max_length`) and tensor-conversion (`return_tensors`)
# options are deliberately not used here; padding is done as a separate step.
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in sentences
]

# Show the first sentence next to its encoded form.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  קורונה? MeToo? כדורים? העיקר שהתחת שלי השתחרר!
Token IDs: [2, 14508, 35, 11113, 18548, 35, 14726, 35, 5390, 8866, 1019, 2088, 2458, 5277, 5, 3]


In [60]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Every sequence is forced to exactly this many token IDs.
MAX_LEN = 32

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Plain "post" truncation would cut the trailing `[SEP]` off any sequence
# longer than MAX_LEN. Shorten over-long sequences by hand and put `[SEP]`
# back as their final token; sequences that already fit pass through untouched.
input_ids = [
    ids if len(ids) <= MAX_LEN
    else list(ids[:MAX_LEN - 1]) + [tokenizer.sep_token_id]
    for ids in input_ids
]

# Pad at the end ("post") with the tokenizer's own padding ID instead of a
# hard-coded 0, so the fill value always matches the `[PAD]` token printed above.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=tokenizer.pad_token_id, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 32 values...

Padding token: "[PAD]", ID: 0

Done.


In [61]:
# Build one attention mask per padded sequence.
# A position holds 1 when its token ID is greater than 0 (a real token) and
# 0 when the token ID is 0 (padding).
attention_masks = [
    [int(token_id > 0) for token_id in padded_ids]
    for padded_ids in input_ids
]

In [62]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Hold out 10% of the data for validation and train on the remaining 90%.
# Inputs, masks and labels go through a single call so that all three are
# split with the same shuffled indices and stay aligned row for row, instead
# of relying on two separate calls that happen to share a seed.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1)

In [63]:
# The model works on torch tensors, so convert the IDs, masks and labels of
# each split, first for training and then for validation.
train_inputs, train_masks, train_labels = (
    torch.tensor(part)
    for part in (train_inputs, train_masks, train_labels)
)

validation_inputs, validation_masks, validation_labels = (
    torch.tensor(part)
    for part in (validation_inputs, validation_masks, validation_labels)
)

In [64]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Examples per batch, shared by both loaders. For fine-tuning BERT on a
# specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training set: bundle IDs, masks and labels, and draw examples in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: same bundling, but examples are read in their stored order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    dataset=validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


## 4. Sanity Check: Masked-Word Prediction with heBERT

In [48]:
# Load the masked-language-model weights of the heBERT checkpoint once.
model = AutoModelForMaskedLM.from_pretrained("avichr/heBERT")

from transformers import pipeline

# Hand the already-loaded model to the pipeline. Passing the checkpoint name
# again would make the pipeline load a second copy of the same weights.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer="avichr/heBERT"
)

# Ask the model for candidate words for the [MASK] position.
options = fill_mask("הקורונה לקחה את [MASK] ולנו לא נשאר דבר.")

# Display the candidates as the cell output.
options

[{'score': 0.14047907292842865,
  'sequence': 'הקורונה לקחה את הילדים ולנו לא נשאר דבר.',
  'token': 3096,
  'token_str': 'הילדים'},
 {'score': 0.04530879110097885,
  'sequence': 'הקורונה לקחה את הכסף ולנו לא נשאר דבר.',
  'token': 5289,
  'token_str': 'הכסף'},
 {'score': 0.0362359881401062,
  'sequence': 'הקורונה לקחה את הכלב ולנו לא נשאר דבר.',
  'token': 12737,
  'token_str': 'הכלב'},
 {'score': 0.035021472722291946,
  'sequence': 'הקורונה לקחה את הילדה ולנו לא נשאר דבר.',
  'token': 12178,
  'token_str': 'הילדה'},
 {'score': 0.02997061051428318,
  'sequence': 'הקורונה לקחה את הרכב ולנו לא נשאר דבר.',
  'token': 3806,
  'token_str': 'הרכב'}]

In [49]:
# Show the candidate whose "score" entry is the largest.
max(options, key=lambda candidate: candidate["score"])

{'score': 0.14047907292842865,
 'sequence': 'הקורונה לקחה את הילדים ולנו לא נשאר דבר.',
 'token': 3096,
 'token_str': 'הילדים'}