In [146]:
%load_ext dotenv
%dotenv
import os
import requests
import random
import csv
from transformers import GPTNeoXTokenizerFast
from typing import List, Union, Any
import time
import numpy as np
import requests
from requests.adapters import HTTPAdapter, Retry
from tqdm.auto import tqdm
import json
import getpass

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [147]:
GPT_API_URL = os.getenv("GPT_API_URL")
AB_TESTER_URL = os.getenv("AB_TESTER_URL")

In [142]:
tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")

In [143]:
def token_length(prompt: str) -> int:
	return tokenizer(prompt, return_tensors="np").input_ids.shape[1]


def gpt_api(prompt: str, filters: List[Any], maximum_tokens: int, n_samples: int, eos_token: Union[int, None] = None, n_gens_per_context: int = 128) -> List[str]:
	r = requests.post(GPT_API_URL, json={
		'prompt': prompt,
		'filters': filters,
		'maximum_tokens': maximum_tokens,
		'n_samples': n_samples,
		'eos_token_id': eos_token,
		'n_gens_per_context': n_gens_per_context,
	})
	r.raise_for_status()
	return r.json()['tokens']

In [25]:
# Read metadata
with open('MS_digitised_books_2021-01-09.csv', 'r') as csvfile:
	reader = csv.reader(csvfile)
	rows = list(reader)
	headers = rows[0]
	print(headers)
	rows = rows[1:]
	titles = [row[headers.index('Title')] for row in rows]
	authors = [row[headers.index('Name')] for row in rows]
	variant_titles = [row[headers.index('Variant titles')] for row in rows]

# Filter out empty strings
titles = [title for title in titles if title != '']
authors = [author for author in authors if author != '']
variant_titles = [variant_title for variant_title in variant_titles if variant_title != '']

# Print some random titles
print("Titles:")
print("\n".join(random.sample(titles, 10)))

print("\nAuthors:")
print("\n".join(random.sample(authors, 10)))

#print("\nVariant titles:")
#print("\n".join(random.sample(variant_titles, 10)))

# Average title length:
print("\nAverage title length:", sum([tokenizer(title, return_tensors="np").input_ids.shape[1] for title in titles]) / len(titles))

# Median title length
print("\nMedian title length:", sorted([tokenizer(title, return_tensors="np").input_ids.shape[1] for title in titles])[int(len(titles) / 2)])

# Max title length
print("\nMax title length:", max([tokenizer(title, return_tensors="np").input_ids.shape[1] for title in titles]))

['BL record ID', 'Type of resource', 'Name', 'Dates associated with name', 'Type of name', 'Role', 'All names', 'Title', 'Variant titles', 'Series title', 'Number within series', 'Country of publication', 'Place of publication', 'Publisher', 'Date of publication', 'Edition', 'Physical description', 'Dewey classification', 'BL shelfmark', 'Topics', 'Genre', 'Languages', 'Notes', 'BL record ID for physical resource']
Titles:
The History and Antiquities of Eyam ... Third edition, with illustrations from photographs by J. A. Warwick
Mount Auburn: its scenes, its beauties, and its lessons
Rob Roy Macgregor, or Auld Lang Syne! A musical drama, in three acts. Founded on the popular novel of 'Rob Roy.'
Jerusalem the Holy. A brief history of ancient Jerusalem; with an account of the modern city ... With fifteen illustrations from photographs and four maps
Un' Ultima Confessione ... Poemetto inglese volgarizzato da E. Teza
Chronik von Wiener-Neustadt [With plates.]
Aspects of Paris
The Black Hig

In [27]:
# Generate book title
def generate_book_title(filters):
	# Generate a prompt using a list of real titles
	titles_prompt = "[Book Titles]\n"
	n_tokens = token_length(titles_prompt)

	while True:
		title = random.choice(titles) + "\n"
		title_tokens = token_length(title)

		# Exclude long titles
		if title_tokens > 32:
			continue
		
		if title_tokens + n_tokens > (2048 - 128):
			break

		titles_prompt += title
		n_tokens += title_tokens
	
	# Generate
	title = gpt_api(titles_prompt, filters=filters, maximum_tokens=64, n_samples=1, eos_token=tokenizer.encode("\n")[0])[0]
	return title.strip()

In [None]:
# Testing:

# Generate a prompt using a list of real titles
#titles_prompt = "[Book Titles]\n"
#n_tokens = token_length(titles_prompt)

# while True:
# 	title = random.choice(titles) + "\n"
# 	title_tokens = token_length(title)

# 	# Exclude long titles
# 	if title_tokens > 32:
# 		continue
	
# 	if title_tokens + n_tokens > (2048 - 128):
# 		break

# 	titles_prompt += title
# 	n_tokens += title_tokens

#print(f"Context length: {n_tokens}")
#print("Context:")
#print(titles_prompt)

# print("\n\nNone:")
# for _ in range(8):
# 	title = gpt_api(titles_prompt, temperature=0.7, top_k=0, top_p=0.5, maximum_tokens=64, n_samples=1, eos_token=None)[0]
# 	# Truncate at first newline
# 	title = title[:title.index("\n")]
# 	print(title)

# print("\n\nEOS:")
# for _ in range(8):
# 	title = gpt_api(titles_prompt, temperature=0.7, top_k=0, top_p=0.5, maximum_tokens=64, n_samples=1, eos_token=tokenizer.encode("\n")[0])[0]
# 	print(title)

# print("\n\nRAW:")
# filters = [
# 	{'repetition': {'param': 1.05, 'range': 560}},
# 	{'tailfree': 0.937},
# 	{'top_a': 0.085},
# 	{'typical': 0.965},
# 	{'top_p': 0.88},
# 	{'temp': 1.33},
# ]
# print(gpt_api(titles_prompt, filters=filters, maximum_tokens=128, n_samples=1, eos_token=None)[0])

# 6.1s, so .05s per token
# About 1 second per token...
# 2045 => 1.3s
# 2043 => 1.3s
# 1024 => 6.9s

In [15]:
for _ in range(8):
	title = generate_book_title()
	print(title)

The Whim; a comedy, [in three acts.] By Mr. W. H. Colles
The Wooing O't. A comedy
Tales of the Wonder Club
The Lore of Shakespeare, and other poems. By J. O. H [by John O'Brien Haggard.]
The Claverings. A novel
The History of St. Catharine's, Leicestershire, from its foundation to the present time
The Poetical Works of William Wordsworth. With a memoir, by his sister, and a preface by T. Hutchinson
The History of the Life and Adventures of Mrs. Mary Anne Talbot. By herself


In [128]:
def random_filter():
	#Repetition Penality => [1, 1.075]
	#Repetition Range => [496, 2048]
	#Tail-Free => [0.879, 0.997]
	#Top A => [0.06, 0.15]
	#Typical => [0.85, 0.996]
	#Nucleus => [0.88, 1]
	#Top-K => [10, 85]
	#Randomness => [0.34, 2.5]

	repetition_param = random.choice(np.linspace(1, 1.075, num=10))
	repetition_range = random.randrange(496, 2048)
	temp = random.choice(np.linspace(0.34, 2.5, num=10))

	n_filters = random.randrange(1, 5)
	filters = [
		{'tailfree': random.choice(np.linspace(0.879, 0.997, num=10))},
		{'top_a': random.choice(np.linspace(0.06, 0.15, num=10))},
		{'typical': random.choice(np.linspace(0.85, 0.996, num=10))},
		{'top_p': random.choice(np.linspace(0.88, 1, num=10))},
		{'top_k': int(random.choice(np.logspace(3.4, 10, base=2, num=10)))},
	]
	random.shuffle(filters)
	filters = filters[:n_filters] + [{'temp': temp}]
	random.shuffle(filters)
	filters = [{'repetition': {'param': repetition_param, 'range': repetition_range}}] + filters
	return filters

random_filter()

[{'repetition': {'param': 1.0166666666666666, 'range': 1801}},
 {'top_p': 0.92},
 {'temp': 1.06},
 {'top_k': 10}]

In [134]:
# Generate a bunch of titles
title_filter_combinations = {
	# Big Dumb
	'temp=0.7,top_k=40': [
		{'temp': 0.7},
		{'top_k': 40},
	],
	# Blue Lighter (similar to NovelAI's)
	'rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33': [
		{'repetition': {'param': 1.05, 'range': 560}},
		{'tailfree': 0.937},
		{'top_a': 0.085},
		{'typical': 0.965},
		{'top_p': 0.88},
		{'temp': 1.33},
	],
	# Reverie (similar to NovelAI's)
	'rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85': [
		{'repetition': {'param': 1.0225, 'range': 2048}},
		{'tailfree': 0.925},
		{'typical': 0.85},
		{'top_a': 0.12},
		{'top_p': 0.985},
		{'temp': 0.925},
		{'top_k': 85},
	],
	# Random-1
	'rep=1.0583333333333333,rep_range=1131,top_k=80,temp=1.54,top_p=0.92,typical=0.996,tailfree=0.879':[
		{'repetition': {'param': 1.0583333333333333, 'range': 1131}},
		{'top_k': 80},
		{'temp': 1.5400000000000003},
		{'top_p': 0.92},
		{'typical': 0.996},
		{'tailfree': 0.879}
	],
	# Less Dumb
	#'rep=1.05,rep_range=560,temp=0.7,top_k=40': [
	#	{'repetition': {'param': 1.05, 'range': 560}},
	#	{'temp': 0.7},
	#	{'top_k': 40},
	#],
}

for _ in tqdm(range(1024)):
	time.sleep(3)
	filter = random.choice(list(title_filter_combinations.keys()))
	#print(f"Filter: {filter}")

	title = generate_book_title(title_filter_combinations[filter])
	if token_length(title) > 32:
		continue

	with open("generated_titles.txt", "a") as f:
		f.write(f"{filter},title={title}\n")

100%|██████████| 1024/1024 [1:22:01<00:00,  4.81s/it]


In [145]:
# Read Project ID and Project Admin Token
project_id = getpass.getpass("Project ID")
project_admin_token = getpass.getpass("Project Admin Token")

In [135]:
# Generate a bunch of samples for A-B testing
real_titles = [title for title in titles if token_length(title) <= 32]
real_titles_lower = set(title.lower() for title in real_titles)
generated_titles = []

with open("generated_titles.txt", "r") as f:
	for line in f:
		x = line.index(",title=")
		filter = line[:x]
		title = line[x+7:].strip()
		generated_titles.append((filter, title))

n_filters = len(set(filter for filter,_ in generated_titles))
avg = len(generated_titles) // n_filters
print(f"Average titles per filter: {avg}")

possible_titles = [("real",title) for title in random.sample(real_titles, avg)] + generated_titles

s = requests.Session()
retries = Retry(total=10, backoff_factor=0.1, status_forcelist=[ 500, 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))

for filter,title in tqdm(possible_titles):
	# Filter out plagiarism
	if filter != 'real' and title.lower() in real_titles_lower:
		print(f"Plagiarism: {filter}")
		continue

	other_title = None
	while other_title is None:
		other_filter,other_title = random.choice(possible_titles)
		if other_title.lower() == title.lower():
			other_title = None
	
	resp = s.post(f"{AB_TESTER_URL}project/new_sample", headers={ "Authorization": f"Bearer {project_admin_token}" }, json={
		"project": project_id,
		"text1": title,
		"text2": other_title,
		"source1": filter,
		"source2": other_filter,
	})
	resp.raise_for_status()


Average titles per filter: 474


 24%|██▍       | 566/2372 [00:34<01:39, 18.15it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 24%|██▍       | 573/2372 [00:34<01:35, 18.90it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 25%|██▍       | 590/2372 [00:35<01:33, 19.08it/s]

Plagiarism: temp=0.7,top_k=40


 26%|██▋       | 625/2372 [00:37<01:36, 18.16it/s]

Plagiarism: temp=0.7,top_k=40


 34%|███▎      | 796/2372 [00:47<01:22, 19.05it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 35%|███▌      | 841/2372 [00:50<01:20, 19.04it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 36%|███▌      | 852/2372 [00:51<01:18, 19.39it/s]

Plagiarism: temp=0.7,top_k=40


 44%|████▍     | 1049/2372 [01:03<01:12, 18.28it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 44%|████▍     | 1054/2372 [01:03<01:05, 20.02it/s]

Plagiarism: temp=0.7,top_k=40


 45%|████▌     | 1068/2372 [01:04<01:10, 18.62it/s]

Plagiarism: temp=0.7,top_k=40


 45%|████▌     | 1073/2372 [01:04<01:04, 20.21it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 47%|████▋     | 1105/2372 [01:06<01:09, 18.14it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 53%|█████▎    | 1252/2372 [01:15<01:01, 18.26it/s]

Plagiarism: temp=0.7,top_k=40


 55%|█████▍    | 1293/2372 [01:17<00:58, 18.33it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 55%|█████▌    | 1305/2372 [01:18<00:50, 21.31it/s]

Plagiarism: temp=0.7,top_k=40
Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 56%|█████▌    | 1325/2372 [01:19<00:57, 18.36it/s]

Plagiarism: temp=0.7,top_k=40


 56%|█████▋    | 1338/2372 [01:19<00:53, 19.32it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 57%|█████▋    | 1351/2372 [01:20<00:52, 19.39it/s]

Plagiarism: temp=0.7,top_k=40


 61%|██████    | 1438/2372 [01:25<00:49, 19.05it/s]

Plagiarism: temp=0.7,top_k=40
Plagiarism: temp=0.7,top_k=40


 61%|██████▏   | 1457/2372 [01:26<00:49, 18.42it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 62%|██████▏   | 1474/2372 [01:27<00:48, 18.38it/s]

Plagiarism: temp=0.7,top_k=40


 68%|██████▊   | 1609/2372 [01:35<00:41, 18.45it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 71%|███████   | 1684/2372 [01:40<00:37, 18.33it/s]

Plagiarism: temp=0.7,top_k=40


 74%|███████▍  | 1751/2372 [01:44<00:32, 19.05it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 78%|███████▊  | 1844/2372 [01:49<00:28, 18.37it/s]

Plagiarism: rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85


 83%|████████▎ | 1957/2372 [01:56<00:22, 18.35it/s]

Plagiarism: temp=0.7,top_k=40


 88%|████████▊ | 2086/2372 [02:04<00:15, 18.69it/s]

Plagiarism: temp=0.7,top_k=40


 89%|████████▊ | 2101/2372 [02:05<00:14, 18.46it/s]

Plagiarism: temp=0.7,top_k=40


 89%|████████▉ | 2110/2372 [02:05<00:13, 18.81it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 89%|████████▉ | 2121/2372 [02:06<00:13, 18.63it/s]

Plagiarism: temp=0.7,top_k=40


 90%|█████████ | 2146/2372 [02:07<00:12, 18.36it/s]

Plagiarism: temp=0.7,top_k=40


 93%|█████████▎| 2203/2372 [02:11<00:08, 18.96it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 94%|█████████▍| 2229/2372 [02:12<00:06, 21.57it/s]

Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33
Plagiarism: rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33


 95%|█████████▍| 2250/2372 [02:13<00:06, 18.63it/s]

Plagiarism: temp=0.7,top_k=40


100%|██████████| 2372/2372 [02:21<00:00, 16.80it/s]


In [149]:
resp = requests.get(f"{AB_TESTER_URL}project/get_ratings", headers={ "Authorization": f"Bearer {project_id}" })
resp.raise_for_status()
ratings = resp.json()
print(len(ratings))

resp = requests.get(f"{AB_TESTER_URL}project/get_samples", headers={ "Authorization": f"Bearer {project_id}" })
resp.raise_for_status()
samples = resp.json()
print(len(samples))
samples = { sample['id']: sample for sample in samples}


1915
2335


In [93]:
pairs = {}

for rating in ratings:
	sample = samples[rating['sample_id']]
	pair = (sample['source1'], sample['source2'])
	value = [1, 0] if rating['rating'] == 0 else [0, 1]

	if pair[1] < pair[0]:
		pair = (pair[1], pair[0])
		value = [value[1], value[0]]
	
	if pair not in pairs:
		pairs[pair] = value
	else:
		pairs[pair][0] += value[0]
		pairs[pair][1] += value[1]


In [101]:
filter_names = list(title_filter_combinations.keys()) + ['real']

for filter in filter_names:
	print(f"{filter}:")

	for other_filter in filter_names:
		pair = (filter, other_filter)
		flipped = False
		if pair[1] < pair[0]:
			pair = (pair[1], pair[0])
			flipped = True
		
		if pair in pairs:
			value = pairs[pair]
			if flipped:
				value = [value[1], value[0]]
			print(f"\t{other_filter}: {value} ({100 * value[0] / (value[0] + value[1]):.2f}%)")
		else:
			print(f"\t{other_filter}: NO DATA")
	
	print()

temp=0.7,top_k=40:
	temp=0.7,top_k=40: [10, 10] (50.00%)
	rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33: [32, 14] (69.57%)
	rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85: [13, 39] (25.00%)
	rep=1.05,rep_range=560,temp=0.7,top_k=40: [25, 11] (69.44%)
	real: [30, 13] (69.77%)

rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33:
	temp=0.7,top_k=40: [14, 32] (30.43%)
	rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33: [9, 4] (69.23%)
	rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85: [12, 33] (26.67%)
	rep=1.05,rep_range=560,temp=0.7,top_k=40: [11, 17] (39.29%)
	real: [22, 18] (55.00%)

rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85:
	temp=0.7,top_k=40: [39, 13] (75.00%)
	rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,to

Reverie BETTER THAN Big dumb
Reverie BETTER THAN Blue Lighter
Reverie BETTER THAN Big dumb
Reverie BETTER THAN Blue Lighter
Reverie BETTER THAN Less dumb
Reverie BETTER THAN Less dumb




In [110]:
for rating in ratings:
	sample = samples[rating['sample_id']]
	if 'rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85' == sample['source1']:
		print(f"Reverie: {sample['text1']}")
	elif 'rep=1.0225,rep_range=2048,tailfree=0.925,typical=0.85,top_a=0.12,top_p=0.985,temp=0.925,top_k=85' == sample['source2']:
		print(f"Reverie: {sample['text2']}")
	
	if 'rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33' == sample['source1']:
		print(f"Blue Lighter: {sample['text1']}")
	elif 'rep=1.05,rep_range=560,tailfree=0.937,top_a=0.085,typical=0.965,top_p=0.88,temp=1.33' == sample['source2']:
		print(f"Blue Lighter: {sample['text2']}")

Reverie: The Chronicles of the Schönberg Family, etc
Reverie: The Origin of Species by means of Natural Selection, or the Preservation of favoured races in the struggle for life
Reverie: The Deacon's Masterpiece. A novel
Reverie: The Early Italian Poets
Reverie: Leaves from the Journal of a Residence in Spain. In the years 1828-1829
Reverie: A Child of the Sea. A story
Blue Lighter: A Midsummer Ramble among the Hills about Richmond and Charlottesville, Va., during August, 1860
Reverie: The American Woman's Home; or, Principles of Domestic Science applied to the various duties of the household
Reverie: The Oaks of Mamre
Blue Lighter: Blaenau Festiniog: its history, antiquities, topography, inhabitants, trade, manufactures, &c., illustrated by numerous engravings
Reverie: The Heroines of Shakespeare's Plays
Blue Lighter: The Narrative of Arthur Gordon Pym .... New edition
Blue Lighter: Pictures of Travel, for Young People's Reading
Reverie: The Little Book of Italy
Blue Lighter: Folk Son

In [106]:
# Find the sample with "ropes of sand" in the title
for sample in samples.values():
	if "consummate" in sample['text1'].lower() or "consummate" in sample['text2'].lower():
		print(sample)

{'id': 1502, 'text1': 'The Early Days of the Abbey of St. Peter, Gloucester ... Reprinted from the Transactions of the Bristol & Gloucestershire Archæological Society', 'text2': 'The History of a Consummate Scoundrel', 'source1': 'real', 'source2': 'temp=0.7,top_k=40'}
{'id': 2097, 'text1': 'The History of a Consummate Scoundrel', 'text2': 'Ropes of Sand. A novel', 'source1': 'temp=0.7,top_k=40', 'source2': 'real'}
{'id': 2571, 'text1': 'The Life of the Right Hon. Sir Robert Peel ... By his Son', 'text2': 'The History of a Consummate Scoundrel', 'source1': 'temp=0.7,top_k=40', 'source2': 'temp=0.7,top_k=40'}


In [112]:
# Backup A-B Study
def backup_ab_study(project_id):
	resp = requests.get(f"{AB_TESTER_URL}project/get_samples", headers={ "Authorization": f"Bearer {project_id}" })
	resp.raise_for_status()
	samples = resp.json()

	resp = requests.get(f"{AB_TESTER_URL}project/get_ratings", headers={ "Authorization": f"Bearer {project_id}" })
	resp.raise_for_status()
	ratings = resp.json()

	with open(f"ab-testing-data/{project_id}.json", "w") as f:
		json.dump({
			"samples": samples,
			"ratings": ratings,
		}, f, indent=4)

project_id = getpass.getpass("Project ID")
backup_ab_study(project_id)

In [109]:
import csv

with open("MS_digitised_books_2021-01-09.csv", "r") as f:
	reader = csv.reader(f)
	for row in reader:
		# Check all columns for "ropes of sand"
		if any("ropes of sand" in column.lower() for column in row):
			print(row)

['014811142', 'Monograph', 'Francillon, Robert Edward', '', 'person', '', 'Francillon, Robert Edward [person]', 'Ropes of Sand. A novel', '', '', '', 'England', 'London', 'Chatto & Windus', '1893', '', '3 volumes (8°)', '', 'Digital Store 012637.g.9', '', '', 'English', '', '001305099']
['014815451', 'Monograph', 'Warren, John Byrne Leicester, Baron de Tabley', '1835-1895', 'person', '', 'Warren, John Byrne Leicester, Baron de Tabley, 1835-1895 [person]', 'Ropes of Sand. A novel. By W. P. Lancaster', '', '', '', 'England', 'Bungay ; London', '', '1869', '', '3 volumes (8°)', '', 'Digital Store 12626.a.7', '', '', 'English', '', '002064276']


In [2]:
# TODO: Generate book title
""

num_chapters = random.randint(5, 40)

for n_chapter in range(num_chapters):



['\nThe "Rush" was on. The "Rush," always', "\nThe next morning, when I awoke, the sun's rays were", '\nIt was a hot morning in June, and a little boy was seated', '\n"The very first time I saw her was when I was a very']


In [150]:
tokenizer.encode("\n")[0]

187

In [398]:
for _ in range(16):
	print(gpt_api("foobar", temperature=0.7, top_k=0, top_p=0.7, maximum_tokens=128, samples=1, eos_token=None)[0])

######
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [450]:
# Benchmark
prompt = ""
while token_length(prompt) < 1024:
	prompt += "abc"

print(f"Context length: {token_length(prompt)}")

for _ in range(4):
	start_time = time.time()
	gpt_api(prompt, temperature=0.7, top_k=0, top_p=0.5, maximum_tokens=64, n_samples=1)
	print(f"{time.time() - start_time}s")

Context length: 1024
3.4793848991394043s
3.350806951522827s
3.4069037437438965s
3.522256851196289s


# Without NVLink:

Context length: 1024
3.7007622718811035s
3.5063722133636475s
3.663666248321533s
3.5835208892822266s

# With NVLink

Context length: 1024
3.4793848991394043s
3.350806951522827s
3.4069037437438965s
3.522256851196289s

In [11]:
x = np.random.rand(1, 1020, 50432)
x[0,-1,:].shape

(50432,)

array([[[0.20583061, 0.25655888, 0.57376255, ..., 0.63722464,
         0.18469649, 0.72157616],
        [0.77890523, 0.67140767, 0.31467863, ..., 0.80892395,
         0.36885446, 0.32894237],
        [0.83927681, 0.78117119, 0.31400823, ..., 0.37632704,
         0.13194221, 0.79551261],
        ...,
        [0.82604518, 0.53916013, 0.89331545, ..., 0.63911151,
         0.7811522 , 0.98363328],
        [0.90330042, 0.03070979, 0.10536325, ..., 0.13524872,
         0.9917675 , 0.05303584],
        [0.77258278, 0.41883475, 0.58816712, ..., 0.86285744,
         0.1125331 , 0.52431572]]])