## Imports

In [10]:
# Mount Google Drive inside the Colab VM; every data and model path used
# further down lives under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!pip install emoji
!pip install transformers
!pip install tensorflow
!pip install keras
!pip install gpt_2_simple

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import codecs
import collections
import os
import re
import warnings
from datetime import datetime

import emoji
import gpt_2_simple as gpt2
import transformers
warnings.filterwarnings('ignore')

## Getting Data

In [13]:
# Load the raw chat transcript (plain UTF-8 text, one utterance per line)
# and keep it as a list of lines in `data`.
chat_transcript_path = '/content/drive/MyDrive/Colab Notebooks/CSCI 544: ANLP/NLP_Group_Project/GPT_3_Model/human_chat.txt'
with codecs.open(chat_transcript_path, encoding='utf8') as transcript_file:
    data = transcript_file.read().split('\n')

In [14]:
# Preview the first 15 raw transcript lines (fewer if the file is shorter).
for idx in range(min(15, len(data))):
    print(data[idx])

Human 1: Hi!
Human 2: What is your favorite holiday?
Human 1: one where I get to meet lots of different people.
Human 2: What was the most number of people you have ever met during a holiday?
Human 1: Hard to keep a count. Maybe 25.
Human 2: Which holiday was that?
Human 1: I think it was Australia
Human 2: Do you still talk to the people you met?
Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them
Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?
Human 1: what do you mean?
Human 2: I think it's like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I asked the stranger if he had a pain. It turns o

## Pre-processing data

In [15]:
# Clean every transcript line in `data` (in place) and write the result to a
# new file that is later used as the fine-tuning dataset.
#
# Per line:
#   1. emoji characters are replaced by their textual name (emoji.demojize)
#   2. tokens starting with '<REDACTED' are dropped
#   3. whitespace runs are collapsed to single spaces
#   4. a leading "Human <n>:" speaker label is rewritten to "[Human <n>]"
#
# Lines without a speaker label (for example blank lines) are left unlabelled
# instead of being sliced blindly, and already-labelled lines do not match the
# pattern, so re-running this cell does not corrupt `data`.

# Not used in this cell; kept because other cells may reference it.
utterances = collections.defaultdict(list)

# "Human 1: text" -> captures "Human 1" so the trailing colon can be dropped.
speaker_label = re.compile(r'^(Human \d+):')

for i, sentence in enumerate(data):
    # replace emoji with text
    cleaned = emoji.demojize(sentence)

    # drop redacted words; split()/join also collapses extra whitespace
    cleaned = ' '.join(
        word for word in cleaned.split() if not word.startswith('<REDACTED')
    )

    # report every line that the cleaning steps actually changed
    if sentence != cleaned:
        print('Original line: ', sentence)
        print('Transformed line: ', cleaned)
        print()

    data[i] = speaker_label.sub(r'[\1]', cleaned, count=1)


data_to_write = '\n'.join(data)

# `with` guarantees the file is closed even if the write fails.
with open("/content/drive/MyDrive/Colab Notebooks/CSCI 544: ANLP/NLP_Group_Project/GPT_3_Model/human_chat_new_processed_gpt2.txt", "wt", encoding='utf8') as outputfile:
    n = outputfile.write(data_to_write)

Original line:  Human 2: Cool! very depressing plans ... stay home and work 😞 I have a project deadline very close.
Transformed line:  Human 2: Cool! very depressing plans ... stay home and work :disappointed_face: I have a project deadline very close.

Original line:  Human 1: 😐 hope you get your work done very soon! a bug free weekend!
Transformed line:  Human 1: :neutral_face: hope you get your work done very soon! a bug free weekend!

Original line:  Human 1:  Actually no idea, but it will take the entire day for that.
Transformed line:  Human 1: Actually no idea, but it will take the entire day for that.

Original line:  Human 2: yeah but I think it's good to have some rainy days in bay area, it's pretty dry here 😛
Transformed line:  Human 2: yeah but I think it's good to have some rainy days in bay area, it's pretty dry here :face_with_tongue:

Original line:  Human 2: I enjoy baking cookies. I am on a quest to bake the best chocolate chip cookie 🙂 What about you?
Transformed lin

In [16]:
# Preview the first 15 lines again, now that the speaker labels are bracketed.
first_lines = data[:15]
for line in first_lines:
    print(line)

[Human 1] Hi!
[Human 2] What is your favorite holiday?
[Human 1] one where I get to meet lots of different people.
[Human 2] What was the most number of people you have ever met during a holiday?
[Human 1] Hard to keep a count. Maybe 25.
[Human 2] Which holiday was that?
[Human 1] I think it was Australia
[Human 2] Do you still talk to the people you met?
[Human 1] Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them
[Human 2] Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?
[Human 1] what do you mean?
[Human 2] I think it's like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I asked the stranger if he had a pain

## Training the Transformer

In [17]:
# Locations on the mounted Drive: the processed transcript used as the
# training dataset, and the directory holding base and fine-tuned models.
project_dir = "/content/drive/MyDrive/Colab Notebooks/CSCI 544: ANLP/NLP_Group_Project/GPT_3_Model"
file_path = project_dir + "/human_chat_new_processed_gpt2.txt"
save_path = project_dir + "/data/models_trained"

# Model choice and optimisation settings for gpt_2_simple
model_size = "124M"
restore_from = "fresh"
learning_rate = 1e-4
steps = 600

# Reporting / checkpoint cadence, in training steps
print_every = 1
sample_every = 300
save_every = 300

In [18]:
# Build a run name that is unique to the second (UTC) and make sure the base
# GPT-2 checkpoint is available under `model_dir`.
# NOTE(review): datetime.utcnow() is deprecated from Python 3.12 onward; switch
# to datetime.now(timezone.utc) when the runtime is upgraded.
timestamp = datetime.utcnow().strftime('%Y%m%d%H%M%S')
model_dir = save_path
model_name = model_size
run_name = f"gp2simple_new_{model_size}_{timestamp}"

# Files fetched by gpt2.download_gpt2 (see the download log of this cell).
base_model_files = [
    'checkpoint',
    'encoder.json',
    'hparams.json',
    'model.ckpt.data-00000-of-00001',
    'model.ckpt.index',
    'model.ckpt.meta',
    'vocab.bpe',
]
base_model_dir = os.path.join(model_dir, model_name)

# The checkpoint is roughly 500 MB; only download it when something is missing
# so that re-running the notebook does not fetch it again.
if all(os.path.isfile(os.path.join(base_model_dir, name)) for name in base_model_files):
    print(f"Base model {model_name} already present in {base_model_dir}; skipping download.")
else:
    gpt2.download_gpt2(model_name=model_name, model_dir=model_dir)

Fetching checkpoint: 1.05Mit [00:00, 94.3Mit/s]                                                     
Fetching encoder.json: 1.05Mit [00:00, 5.81Mit/s]
Fetching hparams.json: 1.05Mit [00:00, 489Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:10, 48.8Mit/s]
Fetching model.ckpt.index: 1.05Mit [00:00, 542Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:00, 9.57Mit/s]
Fetching vocab.bpe: 1.05Mit [00:00, 9.65Mit/s]


In [None]:
# Fine-Tuning the model
#
# Training is intentionally disabled: the gpt2.finetune call below is wrapped
# in a string literal, so running this cell trains nothing. The model was
# already fine-tuned and its weights saved; they are restored further down
# with gpt2.load_gpt2. To retrain, remove the surrounding triple quotes.
# NOTE(review): the training log shown under this cell appears to come from an
# earlier run, not from this (unexecuted) cell - confirm before relying on it.
"""
sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              model_dir=model_dir,
              model_name=model_name,
              checkpoint_dir=model_dir,
              run_name=run_name,
              dataset=file_path,
              steps=steps,
              learning_rate=learning_rate,
              restore_from=restore_from,
              print_every=print_every,
              sample_every=sample_every,
              save_every=save_every)
"""

Loading checkpoint ./data/models_trained/124M\model.ckpt
INFO:tensorflow:Restoring parameters from ./data/models_trained/124M\model.ckpt
Loading dataset...


100%|██████████| 1/1 [00:00<00:00,  5.29it/s]


dataset has 32560 tokens
Training...
[1 | 46.45] loss=2.73 avg=2.73
[2 | 84.30] loss=2.79 avg=2.76
[3 | 122.23] loss=2.61 avg=2.71
[4 | 158.99] loss=2.47 avg=2.65
[5 | 198.33] loss=2.59 avg=2.64
[6 | 237.67] loss=2.45 avg=2.60
[7 | 273.43] loss=2.50 avg=2.59
[8 | 309.22] loss=2.37 avg=2.56
[9 | 345.35] loss=2.50 avg=2.55
[10 | 382.43] loss=2.22 avg=2.52
[11 | 417.51] loss=2.45 avg=2.51
[12 | 453.66] loss=2.27 avg=2.49
[13 | 487.90] loss=2.26 avg=2.47
[14 | 521.64] loss=2.25 avg=2.45
[15 | 559.90] loss=2.27 avg=2.44
[16 | 599.65] loss=2.20 avg=2.42
[17 | 640.53] loss=2.22 avg=2.41
[18 | 678.83] loss=2.21 avg=2.40
[19 | 722.59] loss=2.13 avg=2.38
[20 | 762.31] loss=2.08 avg=2.37
[21 | 801.19] loss=2.20 avg=2.36
[22 | 840.38] loss=1.99 avg=2.34
[23 | 876.70] loss=1.96 avg=2.32
[24 | 911.85] loss=2.21 avg=2.32
[25 | 946.89] loss=1.80 avg=2.29
[26 | 990.79] loss=1.91 avg=2.28
[27 | 1038.34] loss=1.93 avg=2.26
[28 | 1081.87] loss=1.88 avg=2.25
[29 | 1122.62] loss=1.80 avg=2.23
[30 | 1161.75]

In [19]:
# Restore the previously fine-tuned model from Drive into a fresh TF session.
# No run_name is passed, so gpt_2_simple uses its default one (the load log
# shows it resolving to <checkpoint_dir>/run1/model-600).
checkpoint_dir = (
    "/content/drive/MyDrive/Colab Notebooks/CSCI 544: ANLP/NLP_Group_Project/GPT_3_Model"
    "/data/models_trained/02_gp2simple_new124_124M_20221124005804"
)
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess=sess, checkpoint_dir=checkpoint_dir)

Loading checkpoint /content/drive/MyDrive/Colab Notebooks/CSCI 544: ANLP/NLP_Group_Project/GPT_3_Model/data/models_trained/02_gp2simple_new124_124M_20221124005804/run1/model-600


In [20]:
# Initialize Sentiment Analyzer (NLTK's VADER implementation)
import nltk

# Labels read from the analyzer's score dict, in the order they are reported.
possibleSentiments = ['pos', 'neu', 'neg']

# NLTK resources fetched before the analyzer is constructed.
nltk_resources = ["stopwords", "averaged_perceptron_tagger", "vader_lexicon", "punkt"]
nltk.download(nltk_resources)

from nltk.sentiment import SentimentIntensityAnalyzer
sentimentAnalyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [21]:
# Initialize Generic / Discrete Model: a pretrained Blenderbot checkpoint used
# as the fallback responder when the fine-tuned GPT-2 model is not chosen.
from transformers import BlenderbotForConditionalGeneration, BlenderbotTokenizer

mname = "facebook/blenderbot-400M-distill"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = BlenderbotTokenizer.from_pretrained(mname)
discreteModel = BlenderbotForConditionalGeneration.from_pretrained(mname)

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/730M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# # Generate response
# question = input()
# # print()

# prefix = '[Human 2] '+ question + '\n[Human 1]'
# temperature = 0.90

# # check the sentiment of input prompt
# sentimentScores = sentimentAnalyzer.polarity_scores(question)

# # Printing sentiment scores
# print('\nSentiment Scores -')
# for sentiment in possibleSentiments:
#   print('{}: {}'.format(sentiment, sentimentScores[sentiment]))


# # Get the highest score/probability sentiment
# quesSentiment = None
# maxVal = -1
# for sentiment in possibleSentiments:
#   if sentimentScores[sentiment] > maxVal:
#     maxVal = sentimentScores[sentiment]
#     quesSentiment = sentiment


# # check if we call fine-tuned transformer for response, or the discrete/generic model
# if quesSentiment in ['pos', 'neu']:
#   # Use the fine-tuned model if sentiment is positive or neutral
#   print('\nCalling Fine-Tuned Transformer for response -')
#   text_generated = gpt2.generate(
#       sess,
#       checkpoint_dir=checkpoint_dir,
#       length=80,
#       temperature=temperature,
#       prefix=prefix,
#       return_as_list=True
#   )
#   # print(text_generated[0])

#   response = text_generated[0].split(sep='\n')
#   response = response[:2]

#   for i, row in enumerate(response):
#       # print(row)
#       person, sentence = row.split(sep=']', maxsplit=1)
#       person = person[1:]
#       sentence = sentence.strip()
#       if person == 'Human 2':
#           person = '\nQuestion: '
#       else:
#           person = 'Response: '
#       response[i] = person + sentence

#   for row in response:
#       print(row)

# # Use Generic/Discrete Model otherwise in case of hateful prompt
# else:
#   print('\n\nCalling Generic/Discrete Model for response -')

#   inputs = tokenizer([question], return_tensors="pt")
#   reply_ids = discreteModel.generate(**inputs)
#   response = tokenizer.batch_decode(reply_ids)
#   response = response[0]
#   response = response.split('>')[1].split('<')[0]
#   print('\nQuestion: ', question)
#   print('Response: ', response)


Do you have a cat?

Sentiment Scores -
pos: 0.0
neu: 1.0
neg: 0.0

Calling Fine-Tuned Transformer for response -

Question: Do you have a cat?
Response: He's so cute I am imagining him playing soccer on his phone right now. I want to play it with him for a while, and then I'll get to play it with him actually. Are you planning anything for the holidays?


In [50]:
# Generate response
#
# Reads one prompt from the user, scores its sentiment with VADER and routes it:
#   * dominant sentiment 'pos' or 'neu' -> fine-tuned GPT-2 model
#   * dominant sentiment 'neg'          -> generic Blenderbot model
# The full report (scores, chosen model, question, response) is accumulated in
# `res`; nothing is printed here.
question = input()

# The fine-tuned model saw "[Human N] text" lines, so the prompt is framed as
# Human 2 speaking with Human 1 about to answer.
prefix = '[Human 2] ' + question + '\n[Human 1]'
temperature = 0.90

# check the sentiment of input prompt
sentimentScores = sentimentAnalyzer.polarity_scores(question)

# Sentiment scores section of the report
res = 'Sentiment Scores -'
for sentiment in possibleSentiments:
    res += '\n{}: {}'.format(sentiment, sentimentScores[sentiment])

# Get the highest score/probability sentiment; on ties max() keeps the label
# that comes first in possibleSentiments.
# NOTE(review): in the sample outputs 'neu' dominates, so 'neg' may rarely
# win - consider routing on VADER's 'compound' score instead; confirm.
quesSentiment = max(possibleSentiments, key=lambda label: sentimentScores[label])
maxVal = sentimentScores[quesSentiment]

# check if we call fine-tuned transformer for response, or the discrete/generic model
if quesSentiment in ['pos', 'neu']:
    # Use the fine-tuned model if sentiment is positive or neutral
    res += '\n\nCalling Fine-Tuned Transformer for response -'
    text_generated = gpt2.generate(
        sess,
        checkpoint_dir=checkpoint_dir,
        length=80,
        temperature=temperature,
        prefix=prefix,
        return_as_list=True
    )

    # Keep only the first two lines of the generated text: the echoed
    # question and the first reply.
    response = text_generated[0].split(sep='\n')[:2]

    for i, row in enumerate(response):
        # partition() never raises; a line without a "[speaker]" tag is
        # treated as plain reply text instead of aborting the cell.
        person, closing_bracket, sentence = row.partition(']')
        if not closing_bracket:
            person, sentence = '', row
        person = person[1:]  # drop the leading '['
        sentence = sentence.strip()
        if person == 'Human 2':
            person = '\n\nQuestion: '
        else:
            person = '\nResponse: '
        response[i] = person + sentence

    res += ''.join(response)

# Use Generic/Discrete Model otherwise, i.e. for a predominantly negative prompt
else:
    res += '\n\nCalling Generic/Discrete Model for response -'

    inputs = tokenizer([question], return_tensors="pt")
    reply_ids = discreteModel.generate(**inputs)
    # Let the tokenizer strip its own special tokens instead of splitting on
    # '>' and '<', which truncated any reply containing those characters.
    response = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0].strip()
    res += '\n\nQuestion: ' + question
    res += '\nResponse: ' + response


Want to travel to Europe?


In [51]:
# Show the report assembled in `res` by the response-generation cell
# (sentiment scores, which model was called, question and response).
print(res)

Sentiment Scores -
pos: 0.245
neu: 0.755
neg: 0.0

Calling Fine-Tuned Transformer for response -

Question: Want to travel to Europe?
Response: Yes! I want to one day go into space!
