<a href="https://colab.research.google.com/github/ericburdett/cs673-personal-tutor/blob/master/Personal_Tutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personal Tutor

This notebook contains code for the Personal Tutor System built for CS673: Computational Creativity.


## Imports

In [0]:
!pip install gpt-2-simple

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
import os
import gzip
import tarfile
from PIL import Image, ImageOps
import gc
import pdb
import pandas as pd
import gpt_2_simple as gpt2
import requests
import tensorflow as tf
import os
from IPython.core.ultratb import AutoFormattedTB
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

assert torch.cuda.is_available(), "Request a GPU from Runtime > Change Runtime"

In [0]:
# Download a few different corpuses to work with GPT2
! wget -O ./text_files.tar.gz 'https://piazza.com/redirect/s3?bucket=uploads&prefix=attach%2Fjlifkda6h0x5bk%2Fhzosotq4zil49m%2Fjn13x09arfeb%2Ftext_files.tar.gz'
!tar -xvf text_files.tar.gz
!rm text_files.tar.gz

In [0]:
# Download the children's book corpus from GitHub
!wget -O cbt.txt https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/cbt_train.txt

## Word Distribution

In [3]:
# Download the simple word distribution from GitHub
!wget -O word_dist_full.csv https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv

--2020-02-04 21:26:15--  https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163042 (159K) [text/plain]
Saving to: ‘word_dist_full.csv’


2020-02-04 21:26:16 (5.21 MB/s) - ‘word_dist_full.csv’ saved [163042/163042]



In [0]:
class WordDist(Dataset):
  def __init__(self):
    self.df = pd.read_csv('word_dist_full.csv', header=None, names=['word', 'freq'])
  
  def getdf(self):
    return self.df

  def __getitem__(self, index):
    return self.df['word'][index], self.df['freq'][index]

  def __len__(self):
    return len(self.df)

In [5]:
words = WordDist()
print('Num Words: ', words)
words[0:20]

Num Words:  <__main__.WordDist object at 0x7fb77d77a710>


(0      the
 1       of
 2      and
 3       to
 4        a
 5       in
 6      for
 7       is
 8       on
 9     that
 10      by
 11    this
 12    with
 13       i
 14     you
 15      it
 16     not
 17      or
 18      be
 19     are
 Name: word, dtype: object, 0     23135851162
 1     13151942776
 2     12997637966
 3     12136980858
 4      9081174698
 5      8469404971
 6      5933321709
 7      4705743816
 8      3750423199
 9      3400031103
 10     3350048871
 11     3228469771
 12     3183110675
 13     3086225277
 14     2996181025
 15     2813163874
 16     2633487141
 17     2590739907
 18     2398724162
 19     2393614870
 Name: freq, dtype: int64)

## Classes

In [0]:
# Class that deals with training and generating text from GPT2
class LanguageModel():
  def __init__(self, model='124M', genre='children', train_steps=200):
    self.download_model(model)
    self.genre = genre

    tf.reset_default_graph()
    self.sess = gpt2.start_tf_sess()

    if genre == 'children':
      gpt2.finetune(self.sess, 'cbt.txt', model_name=model, steps=train_steps)
    else:
      raise('The specified genre does not exist')

  # Returns a list of sample texts with a given prefix and suffix
  def generate_text(self, prefix='<|startoftext|>', suffix='.', include_prefix=False, nsamples=5):
    if nsamples < 1 or nsamples > 20:
      raise('Error: nsamples must be within the range 1 <= x <= 20')

    return gpt2.generate(self.sess, prefix=prefix, truncate=suffix, include_prefix=include_prefix, batch_size=nsamples, nsamples=nsamples, return_as_list=True)
  
  def download_model(self, model_name):
    if not os.path.isdir(os.path.join("models", model_name)):
      print("Downloading {model_name} model...")
      gpt2.download_gpt2(model_name=model_name)
    else:
      print(model_name, " model is already downloaded")

In [0]:
# A class that contains some knowledge that the user has acquired over time.
# For example, it may hold the words that the user knows (and how well the user knows them)
class UserKnowledge():
  def __init__(self):
    pass

  # We will likely need some place to store the knowledge we acquire about the user
  # so that we can access it from session to session
  def save_knowledge(self, path):
    pass

In [0]:
# Class that evaluates sentences based on what the system knows about the user
class SentenceEvaluator():
  def __init__(self, level='beginner', user_knowledge=None):
    self.level = 'beginner'
    if user_knowledge == None:
      self.user_knowledge = UserKnowledge()
    else:
      self.user_knowledge = user_knowledge

  # We will likely want some method to update the user_knowledge in the evaluator
  # Maybe, we will only pass user_knowledge into the evaluate function?...
  def update_user_knowledge(user_knowledge):
    pass

  # Score the sentences and return the sentence with the highest score
  def evaluate(self, sentences):
    scores = self.score(sentences)
    high_score_index = np.argmax(scores)

    return sentences[high_score_index]

  # Score each sentence based on some criteria
  def score(self, sentences):
    scores = []
    for sentence in sentences:
      score = 0
      score += self.length_score(sentence)
      # add other criteria for scoring
      # ...
      # ...
      scores.append(score)
    
    return scores

  # For beginners, we want to favor shorter sentences
  # This method should change as we increase difficulty level
  def length_score(self, sentence):
    length = len(sentence)

    if self.level == 'beginner':
      if length > 0 and length <= 15:
        return 6
      elif length > 15 and length <= 25:
        return 10
      elif length > 25 and length <= 35:
        return 7
      elif length > 35 and length <= 45:
        return 3
      elif length > 45 and length <= 55:
        return 1
      else:
        return 0
    else:
      raise('support for non-beginners is not supported')

  # For beginners, easier the better!
  # This method should change as we increase difficulty level
  def word_difficulty(self, sentence):
    pass

In [0]:
class SentenceGenerator():
  def __init__(self, language_model=None, evaluator=None):
    if language_model == None:
      self.language_model = LanguageModel()
    else:
      self.language_model = language_model
    if evaluator == None:
      self.evaluator = SentenceEvaluator()
    else:
      self.evaluator = evaluator

  # Generate a sentence, pick the best one based on evaluation, return the sentence
  def generate(self, print_all_sentences=False):
    # Determine the prefix/suffix based on some kind of criteria that is learned over time
    prefix = self.determine_prefix()
    if prefix == '<|startoftext|>':
      include_prefix = False
    else:
      include_prefix = True
    suffix = self.determine_suffix()

    sentences = self.language_model.generate_text(prefix=prefix, suffix=suffix, include_prefix=include_prefix, nsamples=10)
    ### TODO: May be good to remove newlines from sentences here ###
    best_sentence = self.evaluator.evaluate(sentences)

    if print_all_sentences:
      for sentence in sentences:
        print(sentence)

    return best_sentence

  def determine_prefix(self):
    # Good simple sentence starters...
    starters = ['I', 'You', 'The', 'It', '<|startoftext|>', 'This', 'My', 'What', 'When', 'Then', 'Why', 'Who', 'Where']
    random_index = np.random.randint(0, len(starters)) 

    return starters[random_index]

  def determine_suffix(self):
    return '.'

## Tutoring System

In [0]:
model = LanguageModel('124M', train_steps=100) # Will fine-tune model everytime this is called! -- Will need to be fixed at some point
generator = SentenceGenerator(language_model=model)

In [130]:
best_sentence = generator.generate(print_all_sentences=True)
print("Best Sentence: ", best_sentence)

You had my cat-sized tummy , ' and the only thing that it was really worth it was the cat-sized tummy , and I told her to put it back 
You is not going to run any further until I am satisfied with your home 
You get in a cow carriage and get a cow out of the barn and drive on into the country
You have made an acquaintance with the man you are visiting , and he has a son 
You have given it me only fifteen minutes to go and we shall be all right , but the sun is just coming over the hill and there is , I should say , no sign of it yet 
You have a new car and a new wife and a new baby<|endoftext|>Here is my father 's letter from England , which I will soon give to your mother 's , and give you some encouragement , and you will have a good time , and I shall be able to see you again , and I 'll do no wrong , and do not feel any fear lest you do wrong , and I will not be ashamed of you , and you shall always have my best wishes , which are always the best 
You : ''
`` I can neither speak no