In [None]:
# Mount Google Drive (Colab-only) so the /content/drive/... paths used by the
# data-loading and checkpoint cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Training DistilGPT2 on Twitter Emoji Dataset

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install pytorch-lightning
!pip install wandb 

In [None]:
import wandb

# Start a Weights & Biases run under the given project/entity; the training
# cell's log output reports that automatic W&B logging is enabled.
wandb.init(project="CS505-DistilGPT2", entity="sks99")

[34m[1mwandb[0m: Currently logged in as: [33msks99[0m (use `wandb login --relogin` to force relogin)


In [None]:
from transformers import pipeline

In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Earlier mT5 experiment, kept for reference only (TFMT5Model is not imported
# in the visible cells, so these lines would not run as-is):
# tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
# model = TFMT5Model.from_pretrained('google/mt5-small').cuda()
# Load the pretrained 'distilgpt2' tokenizer and causal-LM model, and move the
# model to the device selected above.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2').to(device)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [None]:
# #### Step 1: Tokenize the input into integer token IDs
# inputs = tokenizer.encode("Hello, how are you?", return_tensors='pt').to(model.device)
# print("Input Token IDs: " + str(inputs))

In [None]:
# context = "Hello, how are you?"
# encoded_input = tokenizer(
#     context,
#     padding='max_length',
#     max_length=16,
#     truncation=True,
#     return_tensors="pt",
# ).to(model.device)
# decoder_input = tokenizer(
#     context,
#     padding='max_length',
#     max_length=16,
#     truncation=True,
#     return_tensors="pt",
# ).to(model.device)

In [None]:
# #### Step 2 and 3: Feed in the integer token IDs and get out a sequence of token IDs as output
# outputs = model.generate(input_ids=encoded_input["input_ids"], decoder_input_ids=decoder_input["input_ids"])
# print("Output Token IDs: " + str(outputs))

In [None]:
# encoded_input

In [None]:
# #### Step 4: Feed in the integer token IDs and get out a sequence of token IDs as output
# output_text = [tokenizer.decode(x) for x in outputs]
# print("Output Text: " + str(output_text))

In [None]:
def read_from_file(filename):
    """Read a text file and return its lines without line terminators.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings, one per line, as produced by ``str.splitlines``.
        An empty file yields an empty list.
    """
    # The context manager closes the handle even if reading raises; the
    # previous version left the file open. UTF-8 is requested explicitly so
    # non-ASCII tweet text does not depend on the platform default encoding.
    with open(filename, "r", encoding="utf-8") as file:
        return file.read().splitlines()

In [None]:
# Spanish training split: tweet texts and their label ids, read from two files
# that share the same name stem (.text / .labels).
# NOTE(review): the two files are presumably line-aligned (one label per tweet) — confirm.
es_train_data_path = "/content/drive/MyDrive/505/Project Data/train/spanish/tweet_by_ID_30_4_2022__02_47_48.txt.text"
es_train_labels_path = "/content/drive/MyDrive/505/Project Data/train/spanish/tweet_by_ID_30_4_2022__02_47_48.txt.labels"
es_train_data = read_from_file(es_train_data_path)
es_train_labels = read_from_file(es_train_labels_path)

In [None]:
# English training split: tweet texts and their label ids, read from two files
# that share the same name stem (.text / .labels).
# NOTE(review): the two files are presumably line-aligned (one label per tweet) — confirm.
en_train_data_path = "/content/drive/MyDrive/505/Project Data/train/english/tweet_by_ID_30_4_2022__04_17_45.txt.text"
en_train_labels_path = "/content/drive/MyDrive/505/Project Data/train/english/tweet_by_ID_30_4_2022__04_17_45.txt.labels"
en_train_data = read_from_file(en_train_data_path)
en_train_labels = read_from_file(en_train_labels_path)

In [None]:
def combine_data_labels(data, labels, labels_map):
  """Append each tweet's upper-cased emoji tag to the end of the tweet text.

  Args:
    data: Sequence of tweet strings.
    labels: Sequence of label ids (anything ``int()`` accepts), indexed in
      step with ``data``.
    labels_map: Mapping from integer label id to that label's tag string.

  Returns:
    A list with one string per tweet: the tweet's space-separated pieces
    followed by the upper-cased tag, joined with single spaces.
  """
  def tag_for(position):
    # Look up the tag for the label at this position and upper-case it.
    return labels_map[int(labels[position])].upper()

  return [
      ' '.join(text.split(' ') + [tag_for(position)])
      for position, text in enumerate(data)
  ]

In [None]:
import pandas as pd
def get_emoji_mappings(mapping_path='/content/drive/MyDrive/505/Project Data/mapping/consolidated_mapping.txt'):
  """Build the label-id -> emoji-tag map from the tab-separated mapping file.

  The file is read without a header row; its columns are named ``emoji``,
  ``emoji_code`` and ``smt``. Each row's position in the file (0-based)
  becomes the label id, and its ``emoji_code`` wrapped in ``<``/``>`` becomes
  the tag.

  Args:
    mapping_path: Path of the mapping file. Defaults to the project's
      consolidated mapping on Drive, so existing zero-argument calls behave
      as before.

  Returns:
    A dict mapping ``int`` row position to ``"<" + emoji_code + ">"``.
  """
  emoji_df = pd.read_csv(mapping_path, sep='\t', names=['emoji', 'emoji_code', 'smt'])

  # Iterate positionally rather than via label-based `series[idx]` lookups, so
  # the ids do not depend on what index pandas assigned to the frame.
  return {
      emj_idx: "<" + emoji_code + ">"
      for emj_idx, emoji_code in enumerate(emoji_df['emoji_code'])
  }

In [None]:
# Label id -> "<emoji_code>" tag map, shared by the training and evaluation cells.
emoji_label_map = get_emoji_mappings()

In [None]:
# Build the language-model training strings: each tweet followed by its
# upper-cased emoji tag (see combine_data_labels).
es_train_data_combined = combine_data_labels(es_train_data, es_train_labels, emoji_label_map)
en_train_data_combined = combine_data_labels(en_train_data, en_train_labels, emoji_label_map)

In [None]:
es_train_data_combined[1]

'Disfrutando de buena comida con buena compañía (@ Cátame Despacito in Murcia)  <_FIRE_>'

In [None]:
import pandas as pd

# Wrap the combined strings in single-column ('Tweet') DataFrames so they can be saved as CSV.
es_train_data_combined_df = pd.DataFrame(es_train_data_combined, columns=['Tweet'])
en_train_data_combined_df = pd.DataFrame(en_train_data_combined, columns=['Tweet'])

In [None]:
# Persist the combined training text to Drive. index=False keeps the DataFrame
# index out of the file; previously it was written as an unnamed first column,
# which resurfaced as the unused 'Unnamed: 0' column when the CSV was reloaded.
es_train_data_combined_df.to_csv("/content/drive/MyDrive/505/project/es_train_data_combined.csv", index=False)
en_train_data_combined_df.to_csv("/content/drive/MyDrive/505/project/en_train_data_combined.csv", index=False)

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import datasets
from datasets import load_dataset, list_datasets

In [None]:
# Reuse the end-of-sequence token as the padding token; `encode` below
# tokenizes with padding=True.
# NOTE(review): presumably needed because this tokenizer has no pad token by default — confirm.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate batches for non-masked (causal) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
def encode(batch):
  """Tokenize the ``Tweet`` entries of a batch.

  Newline and carriage-return characters are stripped from both ends of each
  tweet before the batch is handed to the module-level ``tokenizer`` with
  truncation and padding enabled.

  Args:
    batch: Mapping whose ``'Tweet'`` entry is an iterable of strings.

  Returns:
    The object returned by ``tokenizer(...)`` for the cleaned tweets.
  """
  cleaned_tweets = [tweet.strip('\n\r') for tweet in batch['Tweet']]
  return tokenizer(cleaned_tweets, truncation=True, padding=True)

# Load the Spanish training CSV. The absolute path is the same location the
# CSV was written to, so loading no longer depends on the working directory
# happening to be /content.
es_train = load_dataset("csv", data_files='/content/drive/MyDrive/505/project/es_train_data_combined.csv', split='train')
# Tokenize in batches of one fifth of the dataset; max(1, ...) keeps the batch
# size valid when the dataset has fewer than five rows.
processed = es_train.map(encode, batched=True, batch_size=max(1, len(es_train)//5))
# Expose only the model inputs, as torch tensors.
processed.set_format('torch', columns=['input_ids', 'attention_mask'])

Using custom data configuration default-0047c9ec3e00185b
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-0047c9ec3e00185b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
len(processed)

81165

In [None]:
# model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Training configuration for the Spanish fine-tuning run. Checkpoints and logs
# are written under the Drive project folder; overwrite_output_dir allows
# reusing an existing output directory.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish',
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    # No eval_dataset is passed to the Trainer below, so this value is not exercised in this notebook.
    per_device_eval_batch_size=64,
    # Report the training loss every 100 optimization steps.
    logging_steps=100,
    weight_decay=0.01,
    # save_total_limit=1,
    logging_dir='/content/drive/MyDrive/505/project/DistilGPT2/Logs',
)

# Trainer over the tokenized Spanish tweets, using the collator configured
# above to build each batch.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=processed
)

In [None]:
# Run the fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Tweet, Unnamed: 0. If Tweet, Unnamed: 0 are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 81165
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20292
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
100,3.8791
200,3.3656
300,3.2649
400,3.1619
500,3.0983
600,3.0666
700,3.0169
800,2.98
900,2.9166
1000,2.9155


Saving model checkpoint to /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-500
Configuration saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-1000
Configuration saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/505/project/Dis

TrainOutput(global_step=20292, training_loss=2.4485340731256278, metrics={'train_runtime': 3631.9849, 'train_samples_per_second': 89.389, 'train_steps_per_second': 5.587, 'total_flos': 8947188602634240.0, 'train_loss': 2.4485340731256278, 'epoch': 4.0})

In [None]:
# Text-generation pipeline built from the checkpoint-20000 directory saved by
# the training run above.
# NOTE(review): device=-1 presumably selects the CPU (transformers pipeline convention) —
# confirm this is intended given that a GPU is available.
distilgpt2 = pipeline('text-generation', model='/content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-20000', device=-1)
# gpt2 = pipeline('text-generation', model='gpt2', device=0)

In [None]:
# Sample Spanish prompt for a quick qualitative check of the fine-tuned pipeline.
x = "@user dedeu como está hoy de limpia la plaza?"
# Run the generation so that `test` exists for the next cell. This line used to
# be commented out, so on a fresh kernel the next cell raised NameError.
test = distilgpt2(x)

In [None]:
test[0]['generated_text']

'Welcome to New York @ Times Square New York City  <_TWO_HEARTS_> - #tourismandtheworld #traveller #travel…  <_FACE_WITH_TEARS_OF_JOY'

In [None]:
# Reload the fine-tuned weights from the saved checkpoint for direct
# `generate` calls, using the tokenizer's EOS id as the pad token id, and move
# the model to `device`.
model_path = '/content/drive/MyDrive/505/project/DistilGPT2/Outputs/Spanish/checkpoint-20000'
model_test = AutoModelForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id).to(device)

In [None]:
x = "Buenos días desde Valencia en Comunidad Valenciana, Spain"
input_ids = tokenizer.encode(x, return_tensors='pt').to(device)

# `max_len` used to be assigned only in a later cell, so this cell failed on a
# fresh kernel. Compute it here from the emoji map: the character length of the
# longest tag, used as the number of extra tokens allowed after the prompt.
max_len = max((len(tag) for tag in emoji_label_map.values()), default=0)

# Generate until the total length (prompt included) reaches len(prompt) + max_len tokens.
greedy_output = model_test.generate(input_ids, max_length=len(input_ids[0])+max_len).to(device)

decoded_op = tokenizer.decode(greedy_output[0], skip_special_tokens=True)

print("Output:\n" + 100 * '-')
print(decoded_op)

Output:
----------------------------------------------------------------------------------------------------
Buenos días desde Valencia en Comunidad Valenciana, Spain  <_SMILING_FACE_WITH_HEARTEYES_> @user  <_SMILING_EYES_>_>  <_SM


In [None]:
len(input_ids[0])

10

In [None]:
# Rebuild the first emoji tag from the generated tokens, decoding one token at a time.
# NOTE(review): the hard-coded leading '<' and the `+2` offset assume that the two
# tokens right after the prompt are a separator and the opening '<' — confirm
# against the tokenizer's output.
s = '<'
i = 0
t = ''
# Keep appending decoded tokens until the most recent one contains '>'.
while('>' not in t):
  t = tokenizer.decode(greedy_output[0][len(input_ids[0])+2+i], skip_special_tokens=True)
  s += t
  i += 1
print(s)

<_RED_HEART_>


In [None]:
def find_max_len_emoji(mapping):
  """Return the length of the longest value in ``mapping``.

  Args:
    mapping: Mapping whose values support ``len()`` (here: emoji tag strings).

  Returns:
    The largest ``len(value)`` over all values, or 0 when the mapping is empty.
  """
  return max((len(tag) for tag in mapping.values()), default=0)

In [None]:
# Character length of the longest emoji tag; the generation cells add it to the
# prompt length to form `max_length`.
max_len = find_max_len_emoji(mapping=emoji_label_map)

In [None]:
max_len

34

In [None]:
# Spanish test split: tweet texts and their label ids, read from two files.
# NOTE(review): the two files are presumably line-aligned (one label per tweet) — confirm.
es_test_data_path = "/content/drive/MyDrive/505/Project Data/test/es_test.text"
es_test_labels_path = "/content/drive/MyDrive/505/Project Data/test/es_test.labels"
es_test_data = read_from_file(es_test_data_path)
es_test_labels = read_from_file(es_test_labels_path)

In [None]:
def get_emoji_names(labels, mapping):
  """Convert label ids into bare, upper-cased emoji names.

  Each label is cast with ``int()`` and looked up in ``mapping``; the resulting
  tag is upper-cased and its first two and last two characters are dropped
  (for tags shaped like ``<_name_>`` this removes the ``<_`` / ``_>`` wrapper).

  Args:
    labels: Iterable of label ids accepted by ``int()``.
    mapping: Mapping from integer label id to tag string.

  Returns:
    A list of name strings, one per label, in input order.
  """
  return [mapping[int(raw_label)].upper()[2:-2] for raw_label in labels]

In [None]:
# Replace the raw label ids with bare emoji names so they can be compared with
# the names extracted from generated text.
# NOTE: this rebinds `es_test_labels` in place, so the cell is not re-runnable:
# a second run would pass names (not ids) to int() inside get_emoji_names.
# Re-run the loading cell first if this cell must be executed again.
es_test_labels = get_emoji_names(es_test_labels, emoji_label_map)

In [None]:
import re
from tqdm import tqdm
def test_data(data, labels, max_new_tokens=160):
  """Score how often the first emoji tag in the model's output matches the label.

  For every tweet the module-level ``tokenizer`` / ``model_test`` / ``device``
  are used to generate a continuation. The first ``<_NAME_>`` pattern found in
  the decoded text (which includes the prompt) is taken as the prediction and
  compared with ``labels[i]``.

  Args:
    data: Sequence of tweet strings to complete.
    labels: Sequence of expected emoji names, indexed in step with ``data``.
    max_new_tokens: Number of tokens allowed beyond the prompt; it is added to
      the prompt length to form ``max_length``. Defaults to 160, the value that
      was previously hard-coded.

  Returns:
    The percentage (0-100) of labels matched, computed over ``len(labels)``;
    ``0.0`` when ``labels`` is empty.
  """
  # Avoid dividing by zero when there is nothing to score.
  if len(labels) == 0:
    return 0.0

  test_acc = 0
  for i in tqdm(range(len(data))):
    input_ids = tokenizer.encode(data[i], return_tensors='pt').to(device)

    # Generate until the total length (prompt included) reaches
    # len(prompt) + max_new_tokens tokens.
    greedy_output = model_test.generate(input_ids, max_length=len(input_ids[0])+max_new_tokens).to(device)

    decoded_op = tokenizer.decode(greedy_output[0], skip_special_tokens=True)

    regex = re.search('<_(.+?)_>', decoded_op)
    # Reset the prediction on every iteration. Previously a tweet with no tag
    # in its output either raised NameError (first iteration) or silently
    # reused the previous tweet's prediction, which could inflate the accuracy.
    gen_emoji = regex.group(1) if regex else None
    if gen_emoji == labels[i]:
      test_acc += 1
  return (test_acc * 100 / len(labels))

In [None]:
# Score the fine-tuned model on the Spanish test set.
test_acc = test_data(es_test_data, es_test_labels)
# The leading newline puts a blank line between the progress bar and the report.
print("\nTesting accuracy:", test_acc)

100%|██████████| 10000/10000 [2:08:18<00:00,  1.30it/s]


Testing accuracy: 33.96





In [None]:
# Qualitative check: print the model's continuation for the first five Spanish test tweets.
for tweet in es_test_data[:5]:
    input_ids = tokenizer.encode(tweet, return_tensors='pt').to(device)

    # Generate until the total length (prompt included) reaches len(prompt) + max_len tokens.
    greedy_output = model_test.generate(input_ids, max_length=len(input_ids[0])+max_len).to(device)

    decoded_op = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
    print(decoded_op)

Buenos días desde Valencia en Comunidad Valenciana, Spain  <_SMILING_FACE_WITH_HEARTEYES_> @user  <_SMILING_EYES_>_>  <_SM
Anoche en la #prefería con @user,mi prima evazappoz y #Juan.Bien empezamos,bien #sur…  <_SMILING_FACE_WITH_HEARTEYES_>  <_SMILING_EYES_>_>_>  <_SMIL
Porfavor llevarlas a reciclar,necesitamos más papel para imprimir más propaganda. @user  <_SMILING_FACE_WITH_HEARTEYES_>  <_SMILING_EYES_>_>  <_
El vecino roquero que todos queremos tener en PasswordLearn #PuertaDelSol #PuertaDelSol…  <_SMILING_FACE_WITH_HEARTEYES_>  <
Es un placer contar con profesionales del sector de vuestra talla... gracias!!! ️@NukleeBCN @user  <_RED_HEART_>  <_RED_HEART_>_>  <_RED_HEART_>_>_>  <_RED_HEART


In [None]:
input_ids = tokenizer.encode(es_test_data[0], return_tensors='pt').to(device)

# Generate until the total length (prompt included) reaches len(prompt) + max_len tokens.
greedy_output = model_test.generate(input_ids, max_length=len(input_ids[0])+max_len).to(device)

decoded_op = tokenizer.decode(greedy_output[0], skip_special_tokens=True)

# Extract the name inside the first <_NAME_> tag of the decoded text.
m = re.search('<_(.+?)_>', decoded_op)
# Fall back to None when no tag was generated, so the print below can never hit
# an unassigned (or stale) `found`.
found = m.group(1) if m else None
print(found)

SMILING_FACE_WITH_HEARTEYES


In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    Args:
        a: First sequence (e.g. a string).
        b: Second sequence.

    Returns:
        A float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()