<a href="https://colab.research.google.com/github/ericburdett/cs673-personal-tutor/blob/master/Personal_Tutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personal Tutor

This notebook contains code for the Personal Tutor System built for CS673: Computational Creativity.


## Imports

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
import os
import gzip
import tarfile
from PIL import Image, ImageOps
import gc
import pdb
import pandas as pd
from IPython.core.ultratb import AutoFormattedTB
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

assert torch.cuda.is_available(), "Request a GPU from Runtime > Change Runtime"

## Word Distribution

In [2]:
# Download the simple word distribution from GitHub
!wget -O word_dist_full.csv https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv

--2020-02-04 16:28:36--  https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/word_dist_full.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163042 (159K) [text/plain]
Saving to: ‘word_dist_full.csv’


2020-02-04 16:28:37 (4.69 MB/s) - ‘word_dist_full.csv’ saved [163042/163042]



In [0]:
class WordDist(Dataset):
  def __init__(self):
    self.df = pd.read_csv('word_dist_full.csv', header=None, names=['word', 'freq'])
  
  def getdf(self):
    return self.df

  def __getitem__(self, index):
    return self.df['word'][index], self.df['freq'][index]

  def __len__(self):
    return len(self.df)

In [4]:
words = WordDist()
print('Num Words: ', words)
words[0:20]

Num Words:  <__main__.WordDist object at 0x7f04d3c68908>


(0      the
 1       of
 2      and
 3       to
 4        a
 5       in
 6      for
 7       is
 8       on
 9     that
 10      by
 11    this
 12    with
 13       i
 14     you
 15      it
 16     not
 17      or
 18      be
 19     are
 Name: word, dtype: object, 0     23135851162
 1     13151942776
 2     12997637966
 3     12136980858
 4      9081174698
 5      8469404971
 6      5933321709
 7      4705743816
 8      3750423199
 9      3400031103
 10     3350048871
 11     3228469771
 12     3183110675
 13     3086225277
 14     2996181025
 15     2813163874
 16     2633487141
 17     2590739907
 18     2398724162
 19     2393614870
 Name: freq, dtype: int64)

## Training

In [0]:
# Download a few different corpuses to work with GPT2
! wget -O ./text_files.tar.gz 'https://piazza.com/redirect/s3?bucket=uploads&prefix=attach%2Fjlifkda6h0x5bk%2Fhzosotq4zil49m%2Fjn13x09arfeb%2Ftext_files.tar.gz'
!tar -xvf text_files.tar.gz
!rm text_files.tar.gz

In [28]:
# Download the children's book corpus from GitHub
!wget -O cbt.txt https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/cbt_train.txt

--2020-02-04 18:41:03--  https://raw.githubusercontent.com/ericburdett/cs673-personal-tutor/master/data/cbt_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25742364 (25M) [text/plain]
Saving to: ‘cbt.txt’


2020-02-04 18:41:04 (41.1 MB/s) - ‘cbt.txt’ saved [25742364/25742364]



In [29]:
!pip install gpt-2-simple



In [0]:
import gpt_2_simple as gpt2
import requests
import tensorflow as tf
import os

In [44]:
!ls

cbt.txt


In [0]:
!rm -r sample_data

In [45]:
model_name = "124M"
if not os.path.isdir(os.path.join("models", model_name)):
  print("Downloading {model_name} model...")
  gpt2.download_gpt2(model_name=model_name)


tf.reset_default_graph()
sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              'cbt.txt',
              model_name=model_name,
              steps=200)   # steps is max number of training steps


Fetching checkpoint:   0%|                                              | 0.00/77.0 [00:00<?, ?it/s][A
Fetching checkpoint: 1.05Mit [00:00, 84.8Mit/s]                                                     [A

Downloading {model_name} model...



Fetching encoder.json:   0%|                                           | 0.00/1.04M [00:00<?, ?it/s][A
Fetching encoder.json: 1.05Mit [00:00, 70.3Mit/s]                                                   [A
Fetching hparams.json:   0%|                                            | 0.00/90.0 [00:00<?, ?it/s][A
Fetching hparams.json: 1.05Mit [00:00, 250Mit/s]                                                    [A
Fetching model.ckpt.data-00000-of-00001:   0%|                          | 0.00/498M [00:00<?, ?it/s][A
Fetching model.ckpt.data-00000-of-00001:   1%|▎                | 7.34M/498M [00:00<00:06, 70.7Mit/s][A
Fetching model.ckpt.data-00000-of-00001:   2%|▎                | 10.5M/498M [00:00<00:09, 49.9Mit/s][A
Fetching model.ckpt.data-00000-of-00001:   3%|▌                | 14.7M/498M [00:00<00:13, 36.7Mit/s][A
Fetching model.ckpt.data-00000-of-00001:   6%|▉                | 28.3M/498M [00:00<00:10, 45.7Mit/s][A
Fetching model.ckpt.data-00000-of-00001:   7%|█▏               

Loading checkpoint models/124M/model.ckpt
INFO:tensorflow:Restoring parameters from models/124M/model.ckpt



  0%|          | 0/1 [00:00<?, ?it/s][A

Loading dataset...



100%|██████████| 1/1 [00:31<00:00, 31.72s/it][A
[A

dataset has 6312984 tokens
Training...
[1 | 7.30] loss=3.77 avg=3.77
[2 | 8.59] loss=3.21 avg=3.49
[3 | 9.89] loss=3.55 avg=3.51
[4 | 11.16] loss=3.23 avg=3.44
[5 | 12.44] loss=3.27 avg=3.40
[6 | 13.75] loss=3.60 avg=3.44
[7 | 15.03] loss=3.38 avg=3.43
[8 | 16.36] loss=3.57 avg=3.45
[9 | 17.66] loss=3.73 avg=3.48
[10 | 18.93] loss=3.31 avg=3.46
[11 | 20.21] loss=3.33 avg=3.45
[12 | 21.49] loss=3.49 avg=3.45
[13 | 22.77] loss=3.61 avg=3.47
[14 | 24.04] loss=3.37 avg=3.46
[15 | 25.33] loss=3.37 avg=3.45
[16 | 26.60] loss=3.39 avg=3.45
[17 | 27.88] loss=3.43 avg=3.45
[18 | 29.18] loss=3.33 avg=3.44
[19 | 30.45] loss=3.39 avg=3.44
[20 | 31.73] loss=3.32 avg=3.43
[21 | 33.01] loss=3.15 avg=3.42
[22 | 34.28] loss=3.51 avg=3.42
[23 | 35.57] loss=3.34 avg=3.42
[24 | 36.84] loss=3.46 avg=3.42
[25 | 38.13] loss=3.15 avg=3.41
[26 | 39.40] loss=3.40 avg=3.41
[27 | 40.69] loss=3.36 avg=3.40
[28 | 41.97] loss=3.50 avg=3.41
[29 | 43.25] loss=3.38 avg=3.41
[30 | 44.54] loss=3.29 avg=3.40
[31 | 45.83] 

## Tutoring System

In [0]:
def determine_prefix():
  return '<|startoftext|>'

def determine_suffix():
  return '.'

# What impacts the quality of a good sentence?
def determine_best_sentence(sentences):
  # Shorter sentences
  min_index=0
  for i in range(1, len(sentences)):
    if len(sentences[i]) < len(sentences[min_index]):
      min_index = i

  # Contains lower word difficulty
  # -- Utilizes word difficulty

  return sentences[i]

def generate_sentence():
  prefix = determine_prefix()
  suffix = determine_suffix()
  sentences = gpt2.generate(sess, prefix=prefix, truncate=suffix, return_as_list=True, batch_size=5, nsamples=5, temperature=.9, include_prefix=False)
  
  return determine_best_sentence(sentences)


In [56]:
generate_sentence()

"Dead*tantahawk Nautilus Aardvarkly Thru Gromm --- ''\nSo chirped And he felt himself to be lost in thought "