<a href="https://colab.research.google.com/github/daveshap/QuestionDetector/blob/main/QuestionDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compile Training Data
Note: Generate the raw data with [this notebook](https://github.com/daveshap/QuestionDetector/blob/main/DownloadGutenbergTop100.ipynb)

In [None]:
import re
import random

# Input: raw sentences separated by blank lines.
datafile = '/content/drive/My Drive/Gutenberg/sentence_data.txt'
# Outputs: the fine-tuning corpus and a small evaluation set in the same format.
corpusfile = '/content/drive/My Drive/Gutenberg/corpus_data.txt'
testfile = '/content/drive/My Drive/Gutenberg/test_data.txt'
sample_cnt = 3000  # sentences sampled per pool for the training corpus
test_cnt = 30  # sentences sampled per pool for the test set

# Sentence pools, bucketed by the punctuation they contain.
questions = []
exclamations = []
other = []

with open(datafile, 'r', encoding='utf-8') as infile:
  body = infile.read()
sentences = body.split('\n\n')

for i in sentences:
  # Splitting on blank lines leaves empty fragments (e.g. after a trailing
  # separator); without this guard they would be filed under `other` and could
  # be sampled as blank training examples.
  if not i.strip():
    continue
  # Drop sentences containing these accented characters
  # (presumably a cheap filter for non-English text -- TODO confirm).
  if 'í' in i or 'á' in i:
    continue
  # A question mark wins over an exclamation mark when both are present.
  if '?' in i:
    questions.append(i)
  elif '!' in i:
    exclamations.append(i)
  else:
    other.append(i)

def flatten_sentence(text):
  """Return *text* lower-cased, stripped of punctuation, on a single line.

  Only word characters and whitespace are kept, so the punctuation that
  gives away the label ('?', '!') is removed. Every whitespace character
  (newline, tab, ...) is then turned into a plain space: corpus records are
  written one per line and read back line by line, so an embedded newline
  would split a record in two.
  """
  # Raw strings: '\w' and '\s' are not valid escapes in a normal literal.
  kept = re.sub(r'[^\w\s]', '', text.lower())
  return re.sub(r'\s', ' ', kept)


def compose_corpus(data, count, label):
  """Build labelled corpus text from a random sample of *data*.

  Args:
    data: sequence of raw sentences to sample from.
    count: number of sentences to sample (without replacement).
    label: label text written after the <|LABEL|> marker of every record.

  Returns:
    One string holding *count* records, each formatted as
    '<|SENTENCE|> ... <|LABEL|> ... <|END|>' and followed by a blank line.

  Raises:
    ValueError: from random.sample when *count* exceeds len(data).
  """
  # The global RNG is deliberately not reseeded here, so a caller that sets
  # random.seed(...) beforehand gets a reproducible sample.
  subset = random.sample(data, count)
  return ''.join(
      '<|SENTENCE|> %s <|LABEL|> %s <|END|>\n\n' % (flatten_sentence(s), label)
      for s in subset)

# Hold out the test sentences *before* sampling the training data, so the same
# pool entry can never end up in both files. Sampling the two sets
# independently from the full pools lets test sentences leak into training and
# inflates the measured accuracy.
# Exclamations are labelled 'other': the detector has two classes only.
train_parts = []
test_parts = []
for pool, label in ((questions, 'question'),
                    (exclamations, 'other'),
                    (other, 'other')):
  shuffled = random.sample(pool, len(pool))  # shuffled copy; pool is untouched
  held_out = shuffled[:test_cnt]
  remaining = shuffled[test_cnt:]
  # compose_corpus raises ValueError if a pool is too small for the request.
  train_parts.append(compose_corpus(remaining, sample_cnt, label))
  test_parts.append(compose_corpus(held_out, test_cnt, label))

corpus = ''.join(train_parts)

with open(corpusfile, 'w', encoding='utf-8') as outfile:
  outfile.write(corpus)
print('Done!', corpusfile)

corpus = ''.join(test_parts)

with open(testfile, 'w', encoding='utf-8') as outfile:
  outfile.write(corpus)
print('Done!', testfile)

# Finetune Model
Finetune GPT-2

In [None]:
# Notebook shell commands: install the pinned tensorflow-gpu 1.15.0 and the
# gpt-2-simple wrapper, with pip output suppressed by --quiet.
!pip install tensorflow-gpu==1.15.0 --quiet
!pip install gpt-2-simple --quiet

import gpt_2_simple as gpt2

# note: manually mount your google drive in the file explorer to the left

# Google Drive folders passed to gpt-2-simple as model_dir / checkpoint_dir.
model_dir = '/content/drive/My Drive/GPT2/models'
checkpoint_dir = '/content/drive/My Drive/GPT2/checkpoint'
# Base model to fine-tune; leave exactly one of these lines uncommented.
#model_name = '124M'
model_name = '355M'
#model_name = '774M'


# Download the selected base model into model_dir.
gpt2.download_gpt2(model_name=model_name, model_dir=model_dir)
print('\n\nModel is ready!')

run_name = 'QuestionDetector'
step_cnt = 4000  # passed to finetune as steps; also printed by the evaluation cell

sess = gpt2.start_tf_sess()

# Fine-tune on the corpus file; corpusfile is defined in the data-compilation
# cell, so that cell must have been run first.
gpt2.finetune(sess,
              dataset=corpusfile,
              model_name=model_name,
              model_dir=model_dir,
              checkpoint_dir=checkpoint_dir,
              steps=step_cnt,
              restore_from='fresh',  # start from scratch
              #restore_from='latest',  # continue from last work
              run_name=run_name,
              print_every=50,
              sample_every=1000,
              save_every=1000
              )

# Test Results

| Run | Model | Steps | Samples | Last Loss | Avg Loss | Accuracy |
|---|---|---|---|---|---|---|
| 01 | 124M | 2000 | 9000 | 0.07 | 0.69 | 71.4% |
| 02 | 355M | 2000 | 9000 | 0.24 | 1.63 | 66% |
| 03 | 355M | 4000 | 9000 | 0.06 | 0.83 | 58% |
| 04 | 355M | 4000 | 9000 | 0.11 | 0.68 | 74.4% |

Larger models seem to need more steps and/or more data. The model appears to perform very well on questions and less well on the other classes. Run 04 was reduced to 2 classes.



In [None]:
# Evaluate the fine-tuned model: feed it each test record up to and including
# the <|LABEL|> marker and check that the expected label appears in what it
# generates. Relies on gpt2, sess, testfile, model_name, model_dir,
# checkpoint_dir, sample_cnt and step_cnt from the earlier cells.
right = 0
wrong = 0

print('Loading test set...')
with open(testfile, 'r', encoding='utf-8') as file:
  test_set = file.readlines()

for t in test_set:
  t = t.strip()
  # Skip the blank separator lines, and any line that is not a complete
  # record (no label marker) instead of crashing on it with an IndexError.
  if t == '' or '<|LABEL|>' not in t:
    continue
  sentence_part, _, label_part = t.partition('<|LABEL|>')
  prompt = sentence_part + '<|LABEL|>'
  expect = label_part.replace('<|END|>', '').strip()
  #print('\nPROMPT:', prompt)
  response = gpt2.generate(sess, 
                           return_as_list=True,
                           length=30,  # prevent it from going too crazy
                           prefix=prompt,
                           model_name=model_name,
                           model_dir=model_dir,
                           truncate='\n',  # stop inferring here
                           include_prefix=False,
                           checkpoint_dir=checkpoint_dir,)[0]
  response = response.strip()
  if expect in response:
    right += 1
  else:
    wrong += 1
  # right + wrong is at least 1 here, so the division is safe.
  print('right:', right, '\twrong:', wrong, '\taccuracy:', right / (right+wrong))
  #print('RESPONSE:', response)

print('\n\nModel:', model_name)
# The training corpus holds sample_cnt sentences from each of the three pools
# (questions, exclamations, other); this is the "Samples" column of the
# results table. The previously printed name max_samples was never defined.
print('Samples:', sample_cnt * 3)
print('Steps:', step_cnt)