In [26]:
# https://github.com/dadejecova/Natural-Language-Python/blob/main/dataset/trainset.csv
# Install these libraries first (uncomment the lines below if they are missing)
#!pip install transformers
#!pip install sentencepiece
#!pip install rouge-score

# A T4 GPU runtime is the environment that we'll use

In [27]:
import transformers

# Display the installed transformers version.
transformers.__version__

'4.52.3'

In [28]:
import torch

import pandas as pd

from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers.optimization import Adafactor

from rouge_score import rouge_scorer

In [29]:
import io
import requests
import pandas as pd

In [30]:

# Raw CSV of the restaurant training set (columns "mr" and "ref").
url = 'https://raw.githubusercontent.com/dadejecova/Natural-Language-Python/main/dataset/trainset.csv'
# requests applies no timeout by default, so a stalled connection would block
# the kernel forever; bound the wait instead.
response = requests.get(url, timeout=30)

In [31]:
# Stop here when the download failed: every later cell needs `restaurant_df`,
# so continuing would only surface as an unrelated NameError further down.
if response.status_code != 200:
  raise RuntimeError(
      f"Failed to fetch Dataset (HTTP status {response.status_code})"
  )

# Parse the CSV text straight from memory with pandas.
restaurant_df = pd.read_csv(io.StringIO(response.text))
# Preview ten randomly chosen rows.
print(restaurant_df.sample(10))

                                                      mr  \
35861  name[The Rice Boat], food[Chinese], priceRange...   
7255   name[The Rice Boat], food[Chinese], priceRange...   
14773  name[The Waterman], food[Japanese], priceRange...   
17733  name[Blue Spice], eatType[coffee shop], priceR...   
21654  name[The Wrestlers], eatType[coffee shop], foo...   
24814  name[The Punter], priceRange[moderate], area[c...   
11561  name[The Eagle], eatType[coffee shop], food[Fa...   
12943  name[Travellers Rest Beefeater], eatType[resta...   
26202  name[Loch Fyne], eatType[restaurant], food[Ita...   
14892  name[Aromi], eatType[pub], customer rating[ave...   

                                                     ref  
35861  The Rice Boat is a highly rated place selling ...  
7255   The Rice Boat provides Chinese food in the £20...  
14773  A moderately-priced, family-friendly restauran...  
17733  Blue Spice is a coffee shop near Avalon in the...  
21654  The Wrestlers is an adult Japanese co

In [32]:
# Dimensions of the full dataset as a (rows, columns) tuple.
restaurant_df.shape

(42061, 2)

In [33]:
# Meaning representation (model input) of the row labelled 3.
restaurant_df.loc[3, 'mr']

'name[The Mill], eatType[coffee shop], food[French], priceRange[£20-25], area[riverside], near[The Sorrento]'

In [34]:
# Reference sentence (target text) of the row labelled 3.
restaurant_df.loc[3, 'ref']

'Located near The Sorrento is a French Theme eatery and coffee shop called The Mill, with a price range at £20-£25 it is in the riverside area.'

In [35]:
# Work with a 30% random subset of the data; random_state pins the draw so
# the same rows are selected on every Restart & Run All.
restaurant_df_sampled = restaurant_df.sample(frac=0.3, random_state=42)

restaurant_df_sampled.shape

(12618, 2)

In [36]:
# Choose the compute device: CUDA when available, otherwise MPS when
# available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cuda device


In [37]:
# Load the tokenizer that belongs to the pretrained "t5-small" checkpoint.
# NOTE(review): legacy=False opts out of the tokenizer's legacy mode — confirm
# the exact behavioural difference against the transformers documentation.
tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)

# Show the tokenizer configuration.
tokenizer

T5Tokenizer(name_or_path='t5-small', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_4

In [38]:
# Split a sample phrase into the token pieces the tokenizer produces.
tokens = tokenizer.tokenize("very budget-friendly restaurant")
tokens

['▁very', '▁budget', '-', 'friendly', '▁restaurant']

In [39]:
# Map each token string from the previous cell to its integer vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[182, 1487, 18, 4905, 2062]

In [40]:
# Load the pretrained "t5-small" weights.
model = T5ForConditionalGeneration.from_pretrained(
    "t5-small",
    return_dict=True,
)

# Print the module tree to inspect the architecture.
print(model)

# Move the model to the device selected earlier.
model = model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [41]:
# Meaning representation of the row labelled 10, used as the demo prompt below.
restaurant_df.loc[10, 'mr']

'name[Browns Cambridge], food[Fast food], area[riverside], familyFriendly[no], near[The Sorrento]'

In [42]:
# Put the model in evaluation mode before running inference.
model.eval()

# Encode one meaning representation; return_tensors="pt" yields a PyTorch
# tensor holding a batch of a single sequence.
inputs_ids = tokenizer.encode(
    "name[Browns Cambridge], food[Fast food], area[riverside], familyFriendly[no], near[The Sorrento]",
    return_tensors="pt",
)

inputs_ids


tensor([[  564,  6306,   279,  3623,    29,     7, 10096, 13679,   542,  6306,
           371,     9,     7,    17,   542, 13679,   616,  6306,  5927,    49,
          1583, 13679,   384, 17701,   120,  6306,    29,    32, 13679,  1084,
          6306,   634,   264,    52,  5320,    32,   908,     1]])

In [43]:
# Move the encoded prompt to the device the model was moved to.
inputs_ids = inputs_ids.to(device)

# Generate a continuation, capped at 50 newly generated tokens.
outputs = model.generate(inputs_ids, max_new_tokens=50)

# Show the generated token ids.
outputs

tensor([[    0,   465,    51,  6306,   279,  3623,    29,     7, 10096, 13679,
           542,  6306,   371,     9,     7,    17,   542, 13679,   616,  6306,
          5927,    49,  1583, 13679,   384, 17701,   120,  6306,    29,    32,
         13679,  1084,  6306,   634,   264,    52,  5320,    32,   908,     1]],
       device='cuda:0')

In [44]:
# Convert the first generated sequence back to text. Special tokens such as
# <pad> and </s> stay in the string because they are not filtered out here.
tokenizer.decode(outputs[0])

'<pad> Nom[Browns Cambridge], food[Fast food], area[riverside], familyFriendly[no], near[The Sorrento]</s>'

In [45]:
# Sanity check: the prediction is identical to the reference text.
reference_text = "Bibimbap house is a moderately priced restaurant who's main cuisine is English Food."
predicted_text = "Bibimbap house is a moderately priced restaurant who's main cuisine is English Food."

# Build a scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Score the pair: the reference is passed first, the prediction second.
rouge_scores = scorer.score(reference_text, predicted_text)

# Report precision, recall and F1 for every ROUGE variant.
for metric in rouge_scores:
  scores = rouge_scores[metric]
  print(f'{metric}:')
  print(f'  Precision: {scores.precision}')
  print(f'  Recall: {scores.recall}')
  print(f'  F1 Score: {scores.fmeasure}')

rouge1:
  Precision: 1.0
  Recall: 1.0
  F1 Score: 1.0
rouge2:
  Precision: 1.0
  Recall: 1.0
  F1 Score: 1.0
rougeL:
  Precision: 1.0
  Recall: 1.0
  F1 Score: 1.0


In [46]:
# Second example: a shorter, reworded prediction of the same reference.
reference_text = "Bibimbap house is a moderately priced restaurant who's main cuisine is English Food."
predicted_text = "Bibimbap house is a fairly priced restaurant seving English Food"

# Build a scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Score the pair: the reference is passed first, the prediction second.
rouge_scores = scorer.score(reference_text, predicted_text)

# Report precision, recall and F1 for every ROUGE variant.
for metric in rouge_scores:
  scores = rouge_scores[metric]
  print(f'{metric}:')
  print(f'  Precision: {scores.precision}')
  print(f'  Recall: {scores.recall}')
  print(f'  F1 Score: {scores.fmeasure}')

rouge1:
  Precision: 0.8
  Recall: 0.5714285714285714
  F1 Score: 0.6666666666666666
rouge2:
  Precision: 0.5555555555555556
  Recall: 0.38461538461538464
  F1 Score: 0.4545454545454546
rougeL:
  Precision: 0.8
  Recall: 0.5714285714285714
  F1 Score: 0.6666666666666666


In [47]:
# Third example: a much shorter prediction that names a different cuisine
# (French instead of English).
reference_text = "Bibimbap house is a moderately priced restaurant who's main cuisine is English Food."
predicted_text = "House is a restaurant serving French food"

# Build a scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Score the pair: the reference is passed first, the prediction second.
rouge_scores = scorer.score(reference_text, predicted_text)

# Report precision, recall and F1 for every ROUGE variant.
for metric in rouge_scores:
  scores = rouge_scores[metric]
  print(f'{metric}:')
  print(f'  Precision: {scores.precision}')
  print(f'  Recall: {scores.recall}')
  print(f'  F1 Score: {scores.fmeasure}')

rouge1:
  Precision: 0.7142857142857143
  Recall: 0.35714285714285715
  F1 Score: 0.4761904761904762
rouge2:
  Precision: 0.3333333333333333
  Recall: 0.15384615384615385
  F1 Score: 0.21052631578947367
rougeL:
  Precision: 0.7142857142857143
  Recall: 0.35714285714285715
  F1 Score: 0.4761904761904762
