<a href="https://colab.research.google.com/github/ericburdett/cs673-personal-tutor/blob/master/Personal_Tutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personal Tutor

This notebook contains code for the Personal Tutor System built for CS673: Computational Creativity.


## Imports and Setup

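After running the first code cell below (which installs the dependencies), restart the runtime so the newly installed packages are picked up, then run the remaining cells.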

In [0]:
!pip install transformers
!python -m spacy download en_core_web_md

In [0]:
import pdb
import string

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
# import logging
# logging.basicConfig(level=logging.INFO)

## GPT2 - Transformers Example


In [0]:
class LanguageModel():
  """Wrapper around the pretrained 'gpt2' checkpoint that continues a prompt
  using top-k sampling.

  Args:
    mask: Stored on the instance (see set_mask); no method of this class
      reads it yet.
    k: Number of highest-scoring tokens kept when sampling. 0 disables the
      top-k filter.
  """

  def __init__(self, mask=None, k=50):
    # The model is moved to the GPU, so a CUDA device is required.
    self.model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.k = k
    self.mask = mask

  def top_k_logits(self, logits):
    """Return `logits` with everything outside the top-k set to -1e10.

    Works on a 1-D vector of logits as well as on batched input whose last
    dimension is the vocabulary. With k == 0 the input is returned untouched.
    """
    if self.k == 0:
      return logits

    # topk raises if asked for more entries than exist, so clamp k.
    k = min(self.k, logits.size(-1))
    values, _ = torch.topk(logits, k)
    # Smallest kept logit per row; the trailing None keeps a broadcastable dim.
    min_values = values[..., -1, None]
    return torch.where(logits < min_values, torch.full_like(logits, -1e10), logits)

  def set_mask(self, mask):
    """Replace the stored mask."""
    self.mask = mask

  def get_sentence(self, prompt, length):
    """Sample `length` tokens after `prompt` and return the leading sentences.

    The decoded text is cut on '.' so that the result contains as many
    '.'-separated pieces as `prompt.split('.')` yields (for a prompt ending
    in '.', that is the prompt plus the first generated sentence), and a
    closing '.' is appended.
    """
    generated = self.tokenizer.encode(prompt)
    context = torch.tensor([generated]).cuda()

    past = None

    # Inference only: without no_grad every step keeps autograd state alive
    # through `past`, which grows GPU memory for no benefit.
    with torch.no_grad():
      for _ in range(length):
        # NOTE(review): newer transformers releases appear to rename `past`
        # to `past_key_values` - confirm against the installed version.
        output, past = self.model(context, past=past)

        logits = output[..., -1, :].squeeze()

        probs = F.softmax(self.top_k_logits(logits), dim=-1)
        token = torch.multinomial(probs, num_samples=1)

        generated.append(token.item())
        # Only the new token is fed back in; `past` carries the earlier context.
        context = token.unsqueeze(0)

    sequence = self.tokenizer.decode(generated)

    end_index = len(prompt.split('.'))

    return ".".join(sequence.split('.')[0:end_index]) + '.'


In [0]:
class Evaluator():
  """Scores how related the nouns in a parsed sentence are to each other."""

  def __init__(self):
    pass

  def related_score(self, sentence_doc):
    """Mean similarity over 5 randomly sampled pairs of (proper) nouns.

    Args:
      sentence_doc: Iterable of tokens exposing `pos_` and `similarity`
        (a spaCy Doc in this notebook).

    Returns:
      The mean of the pairwise similarities, or 0 when get_random_pairs
      cannot produce pairs (it returns None).
    """
    # Iterate the argument itself; the earlier version read the global `doc`
    # and therefore ignored whatever sentence was passed in.
    # NOUN and PROPN are the tags behind the former magic ids 92 and 96.
    content_tokens = [token for token in sentence_doc if token.pos_ in ('NOUN', 'PROPN')]

    # Sample Random Pairs
    pairs = get_random_pairs(content_tokens, 5)
    if pairs is None:
      return 0

    # Check Similarity
    similarities = []
    for pair in pairs:
      similarity = pair[0].similarity(pair[1])
      similarities.append(similarity)
      print('Comparing {} with {}, score: {:.4f}'.format(pair[0], pair[1], similarity))

    print(similarities)

    return np.mean(similarities)

In [28]:
LM = LanguageModel(k=50)

HBox(children=(IntProgress(value=0, description='Downloading', max=224, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=548118077, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




In [0]:
def get_random_pairs(arr, size):
  """Draw `size` random pairs of distinct elements from `arr`.

  Each pair is sampled independently with np.random.choice(..., size=2,
  replace=False), so the same pair may appear more than once.

  Returns:
    A list of `size` length-2 arrays, or None when numpy cannot draw a pair
    (it raises ValueError, e.g. when `arr` has fewer than two elements).
  """
  try:
    return [np.random.choice(arr, size=2, replace=False) for _ in range(size)]
  except ValueError:
    # Only the "cannot sample" case is treated as "no pairs"; anything else
    # (including KeyboardInterrupt) is a real error and must propagate.
    return None

In [0]:
LM.get_sentence('Basketball is my favorite sport.', 20)

In [0]:
nlp = spacy.load('en_core_web_md')

In [0]:
doc = nlp("I wish I could play basketball for the Knicks")
# print('{}, {} - Score: {:.4f}'.format(doc[1], doc[3], doc[1].similarity(doc[3])))
# print('{}, {} - Score: {:.4f}'.format(doc[1], doc[11], doc[1].similarity(doc[11])))
# print('{}, {} - Score: {:.4f}'.format(doc[9], doc[11], doc[9].similarity(doc[11])))
# print('{}, {} - Score: {:.4f}'.format(doc[0], doc[6], doc[0].similarity(doc[6])))
# print('{}, {} - Score: {:.4f}'.format(doc[0], doc[9], doc[0].similarity(doc[9])))

In [92]:
evaluator = Evaluator()
evaluator.related_score(doc)

Comparing basketball with Knicks, score: 0.3703
Comparing Knicks with basketball, score: 0.3703
Comparing Knicks with basketball, score: 0.3703
Comparing basketball with Knicks, score: 0.3703
Comparing Knicks with basketball, score: 0.3703
[0.37029457, 0.37029457, 0.37029457, 0.37029457, 0.37029457]


0.37029457

In [29]:
doc[3].pos

84

## Word Distribution

In [0]:
# Download the full word-frequency distribution (word_dist_full.csv) from GitHub
!wget -O word_dist_full.csv https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv

--2020-02-20 18:26:39--  https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163042 (159K) [text/plain]
Saving to: ‘word_dist_full.csv’


2020-02-20 18:26:39 (5.17 MB/s) - ‘word_dist_full.csv’ saved [163042/163042]



In [0]:
class WordDist(Dataset):
  """Word-frequency table loaded from a header-less two-column CSV.

  Args:
    path: CSV file to read; its columns are named 'word' and 'freq'.
      Defaults to the file downloaded by the cell above.
  """

  def __init__(self, path='word_dist_full.csv'):
    self.df = pd.read_csv(path, header=None, names=['word', 'freq'])

  def getdf(self):
    """Return the underlying DataFrame (not a copy)."""
    return self.df

  def dict_normalized(self):
    """Return {word: freq / max(freq)} without modifying the stored frame."""
    normalized = self.df.copy()
    normalized['freq'] = normalized['freq'] / normalized['freq'].max()

    return normalized.set_index('word').to_dict()['freq']

  def __getitem__(self, index):
    """Return the (word, freq) pair stored at row position `index`."""
    return self.df['word'].iloc[index], self.df['freq'].iloc[index]

  def __len__(self):
    """Number of rows in the table."""
    return len(self.df)

# Old Code

## Imports

In [0]:
!pip install gpt-2-simple
!pip install gtts

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
import os
import string
import gzip
import tarfile
from PIL import Image, ImageOps
import gc
import pdb
import pandas as pd
import gpt_2_simple as gpt2
import requests
import tensorflow as tf
import os
from gtts import gTTS 
from IPython.core.ultratb import AutoFormattedTB
from IPython.display import Audio, HTML
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

assert torch.cuda.is_available(), "Request a GPU from Runtime > Change Runtime"

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
# Download the wiki_simple.txt training corpus from GitHub
!wget -O wiki_simple.txt https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/wiki_simple.txt

--2020-02-20 18:26:35--  https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/wiki_simple.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47667422 (45M) [text/plain]
Saving to: ‘wiki_simple.txt’


2020-02-20 18:26:36 (199 MB/s) - ‘wiki_simple.txt’ saved [47667422/47667422]



## Classes

In [0]:
# Class that deals with training and generating text from GPT2
class LanguageModel():
  """Fine-tunes a GPT-2 checkpoint with gpt-2-simple and samples text from it.

  NOTE: the notebook defines another class named LanguageModel earlier (the
  transformers-based one); running this cell rebinds the name to this class.

  Args:
    model: gpt-2-simple model name; downloaded if missing under ./models.
    genre: Training corpus selector. Only 'children' is supported, which
      fine-tunes on 'wiki_simple.txt'.
    train_steps: Number of steps passed to gpt2.finetune.
    max_length: `length` argument passed to gpt2.generate.

  Raises:
    ValueError: if `genre` is not supported.
  """

  def __init__(self, model='124M', genre='children', train_steps=200, max_length=150):
    # Validate first so a bad genre fails before the download and session
    # setup. (Raising a plain string, as before, is itself a TypeError.)
    if genre != 'children':
      raise ValueError('The specified genre does not exist')

    self.download_model(model)
    self.genre = genre
    self.max_length = max_length

    tf.reset_default_graph()
    self.sess = gpt2.start_tf_sess()

    gpt2.finetune(self.sess, 'wiki_simple.txt', model_name=model, steps=train_steps)

  # Returns a list of sample texts with a given prefix and suffix
  def generate_text(self, prefix='<|startoftext|>', suffix='.', include_prefix=False, nsamples=5):
    """Generate `nsamples` texts in one batch and return them as a list.

    Args:
      prefix: Text each sample starts from.
      suffix: Passed as `truncate`, the marker generation is cut at.
      include_prefix: Whether the prefix is kept in the returned text.
      nsamples: Number of samples; must satisfy 1 <= nsamples <= 20.

    Raises:
      ValueError: if `nsamples` is out of range.
    """
    if nsamples < 1 or nsamples > 20:
      raise ValueError('Error: nsamples must be within the range 1 <= x <= 20')

    return gpt2.generate(self.sess, prefix=prefix, truncate=suffix, include_prefix=include_prefix, batch_size=nsamples, nsamples=nsamples, return_as_list=True, length=self.max_length)

  def download_model(self, model_name):
    """Download `model_name` unless models/<model_name> already exists."""
    if not os.path.isdir(os.path.join("models", model_name)):
      print(f"Downloading {model_name} model...")
      gpt2.download_gpt2(model_name=model_name)
    else:
      print(f"{model_name} model is already downloaded")

In [0]:
# A class that contains some knowledge that the user has acquired over time.
# For example, it may hold the words that the user knows (and how well the user knows them)
class UserKnowledge():
  """Placeholder for what the system has learned about the user, e.g. which
  words the user knows and how well. No state is tracked yet."""

  def __init__(self):
    """Create an empty knowledge store (nothing is initialised yet)."""

  def save_knowledge(self, path):
    """Intended to persist the knowledge to `path` so it survives from
    session to session. Not implemented yet: does nothing, returns None."""
    return None

In [0]:
# Class that evaluates sentences based on what the system knows about the user
class SentenceEvaluator():
  """Ranks candidate sentences for a learner by length and word difficulty.

  Args:
    level: Learner level. Only 'beginner' is scored; any other value makes
      length_score raise NotImplementedError.
    user_knowledge: Optional UserKnowledge; a fresh one is created if omitted.
  """

  def __init__(self, level='beginner', user_knowledge=None):
    # Honour the argument (it used to be overwritten with 'beginner').
    self.level = level
    self.word_dist = WordDist().dict_normalized()
    # A word whose normalized frequency reaches the threshold gets the top
    # score; each `step` below it costs two points.
    self.word_dist_threshold = 0.033
    self.word_dist_threshold_step = 0.005
    # Median per-word score needed for the top sentence-level score.
    self.word_dist_difficulty_threshold = 7

    if user_knowledge is None:
      self.user_knowledge = UserKnowledge()
    else:
      self.user_knowledge = user_knowledge

  # We will likely want some method to update the user_knowledge in the evaluator
  # Maybe, we will only pass user_knowledge into the evaluate function?...
  def update_user_knowledge(self, user_knowledge):
    """Placeholder; currently does nothing.

    `self` was missing from the signature, so calling this on an instance
    raised TypeError.
    """
    pass

  # Score the sentences and return the sentence with the highest score
  def evaluate(self, sentences):
    """Return the highest-scoring sentence (the first one on ties).

    Raises:
      ValueError: from np.argmax when `sentences` is empty.
    """
    scores = self.score(sentences)
    high_score_index = np.argmax(scores)

    return sentences[high_score_index]

  # Score each sentence based on some criteria
  def score(self, sentences):
    """Return one score per sentence: length_score + word_difficulty."""
    # add other criteria for scoring here
    return [self.length_score(sentence) + self.word_difficulty(sentence)
            for sentence in sentences]

  # For beginners, we want to favor shorter sentences
  # This method should change as we increase difficulty level
  def length_score(self, sentence):
    """Score a sentence by its length in characters (len(sentence)).

    Raises:
      NotImplementedError: if self.level is not 'beginner'.
    """
    if self.level != 'beginner':
      raise NotImplementedError('support for non-beginners is not supported')

    length = len(sentence)

    # (inclusive upper bound, score); the first bucket that fits wins.
    buckets = ((15, 6), (25, 10), (35, 7), (45, 3), (55, 1))
    if length > 0:
      for upper, points in buckets:
        if length <= upper:
          return points
    return 0

  @staticmethod
  def _tier_score(value, top, step):
    """Map `value` to 10, 8, 6, 4 or 2 for the first of `top`, `top - step`,
    ..., `top - 4 * step` that it reaches, or 0 if it reaches none."""
    for tier in range(5):
      if value >= top - (tier * step):
        return 10 - 2 * tier
    return 0

  # For beginners, easier the better!
  # This method should change as we increase difficulty level
  def word_difficulty(self, sentence):
    """Score a sentence by the median frequency tier of its words.

    Words are lower-cased and looked up in self.word_dist; unknown words
    count as frequency 0. A sentence with no words scores 0.
    """
    # split() without an argument drops the empty tokens that doubled or
    # trailing spaces produce; they used to be scored as unknown words and
    # dragged the median down.
    words = sentence.lower().split()
    if not words:
      return 0

    word_scores = [
      self._tier_score(self.word_dist.get(word, 0),
                       self.word_dist_threshold,
                       self.word_dist_threshold_step)
      for word in words
    ]

    score_med = np.median(word_scores)

    return self._tier_score(score_med, self.word_dist_difficulty_threshold, 1)

In [0]:
class SentenceGenerator():
  """Generates candidate sentences with a language model and returns the one
  the evaluator ranks highest.

  Args:
    language_model: Object exposing generate_text(prefix=, suffix=,
      include_prefix=, nsamples=). A default LanguageModel() is built if omitted.
    evaluator: Object exposing evaluate(sentences). A default
      SentenceEvaluator() is built if omitted.
  """

  def __init__(self, language_model=None, evaluator=None):
    self.language_model = LanguageModel() if language_model is None else language_model
    self.evaluator = SentenceEvaluator() if evaluator is None else evaluator

  # Generate a sentence, pick the best one based on evaluation, return the sentence
  def generate(self, print_all_sentences=False, nsamples=15):
    """Generate `nsamples` candidates and return the best one.

    Args:
      print_all_sentences: If True, print every filtered candidate.
      nsamples: Number of candidates requested from the language model.
    """
    # Determine the prefix/suffix based on some kind of criteria that is learned over time
    prefix = self.determine_prefix()
    # The start-of-text marker is not part of the sentence, so drop it.
    include_prefix = prefix != '<|startoftext|>'
    suffix = self.determine_suffix()

    sentences = self.language_model.generate_text(prefix=prefix, suffix=suffix, include_prefix=include_prefix, nsamples=nsamples)
    sentences = self.filter_punctuation(sentences)
    best_sentence = self.evaluator.evaluate(sentences)

    if print_all_sentences:
      for sentence in sentences:
        print(sentence)

    return best_sentence

  # Used to filter unwanted punctuation GPT2 might produce, like newlines
  def filter_punctuation(self, sentences):
    """Strip punctuation and newlines from each sentence and collapse the
    doubled, leading and trailing whitespace that stripping leaves behind."""
    # Build the translation table once instead of once per sentence.
    strip_table = str.maketrans('', '', string.punctuation)

    # split() with no argument also consumes newlines.
    return [' '.join(sentence.translate(strip_table).split())
            for sentence in sentences]

  def determine_prefix(self):
    """Pick a random sentence starter ('<|startoftext|>' means no prefix)."""
    # Good simple sentence starters...
    starters = ['I', 'You', 'The', 'They', 'It', '<|startoftext|>', 'He', 'She', 'My']
    random_index = np.random.randint(0, len(starters))

    return starters[random_index]

  def determine_suffix(self):
    """Return the marker at which generated text is truncated."""
    return '.'

## Tutoring System

In [0]:
# NOTE: LanguageModel.__init__ calls gpt2.finetune, so every run of this cell
# fine-tunes again for `train_steps` steps.
# TODO: reuse a saved fine-tuned checkpoint instead of re-training.
model = LanguageModel('117M', train_steps=200) # Will fine-tune model everytime this is called! -- Will need to be fixed at some point

In [0]:
generator = SentenceGenerator(language_model=model)
best_sentence = generator.generate(print_all_sentences=True)
print("Best Sentence: ", best_sentence)

My own experience with it was that it was a very funny and funny movie 
My game is an open world game 
My second favorite place to eat is at a nearby lake 
My name is Washington SmootHart  and I am an agent of the United Nations 
Myrious s talk with the King of France was not well received and the book was banned 
My wife and her brother were at home when a sudden  thunderbolt  struck the house 
My money was full of things that are not in the movie and were not intended for audience or to be seen by children 
My favorite game is High Roller Ballet 
My own life was spent in the city and in the West Bank  and belonged to the family of the people who lived there 
My hair is round and it looks like the shaft of a gun 
My way was to go to a village called Namur in Afghanistan in 1959 
My students were supposed to be students at the University of East Anglia  but they were supposed to be studying in the department of English 
My statutes were changed to go with the Dukes of France  and the D

In [0]:
def print_options():
  """Print the numbered menu of learner responses (codes 0-5), one per line."""
  menu = (
    "0: I don't know what this means.",
    "1: Choose words I don't know.",
    "2: Generate a better sentence.",
    "3: I need definitions.",
    "4: I understand! Give me another!",
    "5: Exit: I've learned enough for today.",
  )
  for entry in menu:
    print(entry)

## Text-to-Speech

In [0]:
speech = gTTS(text = best_sentence, lang = 'en', slow = False)
speech.save('speech.mp3')
Audio(filename='speech.mp3', autoplay=True)

In [0]:
Audio('Hedidnotlikethesoundofit.mp3', autoplay=True)

something


## Learn-A-Language Loop
* Terrible Name...
* We need to come up with something!

In [0]:
# Interactive tutoring loop: generate a sentence, speak it, ask the learner
# for a response code, and repeat until the learner chooses 5 (exit).
from IPython.display import Audio, display

print("Learn-A-Language - English")

while True:
  print('\nGenerating personalized sentence... Please Wait.')
  sentence = generator.generate()
  print("\nTry this sentence:")
  print(sentence, '\n')
  speech = gTTS(text=sentence, lang='en', slow=False)
  filename = sentence.replace(' ', '') + '.mp3'
  speech.save(filename)
  # A bare Audio(...) expression inside a loop is never rendered; only the
  # last expression of a cell is. display() shows the player explicitly.
  display(Audio(filename=filename, autoplay=False))

  # Keep asking until one of the listed codes is entered.
  while True:
    print_options()
    code = input('Enter a code from above:')
    if code in ['0','1','2','3','4','5']:
      code = int(code)
      break
    print('')

  if code == 0:
    print('\nI\'m Sorry! This is as good as it gets...')
  elif code == 1:
    print('\nI\'m Sorry! This functionality isn\'t currently available.')
  elif code == 2:
    print('\nNew sentence coming right up!')
  elif code == 3:
    print('\nI\'m Sorry! This functionality isn\'t currently available.')
  elif code == 4:
    print('\nGreat Job! Here\'s another.')
  else:
    print('\nThanks for using Learn-A-Language! Play again soon!')
    break

Learn-A-Language - English

Generating personalized sentence... Please Wait.

Try this sentence:
He did not like the sound of it  

0: I don't know what this means.
1: Choose words I don't know.
2: Generate a better sentence.
3: I need definitions.
4: I understand! Give me another!
5: Exit: I've learned enough for today.


KeyboardInterrupt: ignored

In [0]:
# Interactive tutoring loop: generate a sentence, speak it, ask the learner
# for a response code, and repeat until the learner chooses 5 (exit).
from IPython.display import Audio, display

print("Learn-A-Language - English")

while True:
  print('\nGenerating personalized sentence... Please Wait.')
  sentence = generator.generate()
  print("\nTry this sentence:")
  print(sentence, '\n')
  speech = gTTS(text=sentence, lang='en', slow=False)
  filename = sentence.replace(' ', '') + '.mp3'
  speech.save(filename)
  # A bare Audio(...) expression inside a loop is never rendered; only the
  # last expression of a cell is. display() shows the player explicitly, once
  # per sentence (the second bare Audio(...) in the prompt loop was a no-op).
  display(Audio(filename=filename, autoplay=False))

  # Keep asking until one of the listed codes is entered.
  while True:
    print_options()
    code = input('Enter a code from above:')
    if code in ['0','1','2','3','4','5']:
      code = int(code)
      break
    print('')

  if code == 0:
    print('\nI\'m Sorry! This is as good as it gets...')
  elif code == 1:
    print('\nI\'m Sorry! This functionality isn\'t currently available.')
  elif code == 2:
    print('\nNew sentence coming right up!')
  elif code == 3:
    print('\nI\'m Sorry! This functionality isn\'t currently available.')
  elif code == 4:
    print('\nGreat Job! Here\'s another.')
  else:
    print('\nThanks for using Learn-A-Language! Play again soon!')
    break

Learn-A-Language - English

Generating personalized sentence... Please Wait.

Try this sentence:
He did not like the sound of it  

0: I don't know what this means.
1: Choose words I don't know.
2: Generate a better sentence.
3: I need definitions.
4: I understand! Give me another!
5: Exit: I've learned enough for today.


KeyboardInterrupt: ignored