In [None]:
# Query the NVIDIA driver for the GPUs visible to this runtime (shell command).
!nvidia-smi

In [None]:
!pip install torch torchtext -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
# Pythia model size to load. Sizes that have been used with this notebook:
#   '14m', '70m', '160m', '410m', '1b', '1.4b', '2.8b'
model_param_string = '2.8b'

In [None]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
print(model_param_string)

# The model and the tokenizer are both fetched from the same hub repository,
# at the same revision, into the same local cache directory.
pythia_repo = f"EleutherAI/pythia-{model_param_string}-deduped"
pythia_revision = "step3000"
pythia_cache_dir = f"./pythia-{model_param_string}-deduped/step3000"

model = GPTNeoXForCausalLM.from_pretrained(
  pythia_repo,
  revision=pythia_revision,
  cache_dir=pythia_cache_dir,
)

tokenizer = AutoTokenizer.from_pretrained(
  pythia_repo,
  revision=pythia_revision,
  cache_dir=pythia_cache_dir,
)

# Smoke test: continue a short prompt with the default generation settings
# and show the decoded text as the cell output.
inputs = tokenizer("Hello, I am", return_tensors="pt")
tokens = model.generate(**inputs)
tokenizer.decode(tokens[0])

# REMOVE FOR CPU
# model = model.to('cuda')

In [None]:
# Show the decoded text of the first sequence currently held in `tokens`.
tokenizer.decode(tokens[0])

# URL EXPERIMENT

In [4]:
import requests
def verify_url(url):
  """Report whether a GET request to `url` answers with HTTP status 200.

  The request uses a 10-second timeout. On a 200 response 'Success' is
  printed and True is returned. Any exception raised by the request is
  printed and treated as a failure; every other status code also yields False.
  """
  try:
    response = requests.get(url=url, timeout=10)
    if response.status_code == 200:
      print('Success')
      return True
    return False
  except Exception as ex:
    # Best-effort check: report the problem and count the URL as unreachable.
    print(ex)
    return False


In [24]:
def generate_urls(model, tokenizer, prefix='', num_sequences=1, max_length=20, min_length=0, k=50, num_gens=1):
    """Sample `num_gens` continuations of `prefix` with top-k sampling.

    Args:
        model: object exposing `.device` and `.generate(**kwargs)`.
        tokenizer: callable returning a mapping of tensors for a prompt, with
            a `.decode(ids)` method.
        prefix: prompt text every generation starts from.
        num_sequences: accepted for signature compatibility; not used here.
        max_length: forwarded to `model.generate` as `max_length`.
        min_length: forwarded to `model.generate` as `min_length`.
        k: forwarded to `model.generate` as `top_k`.
        num_gens: forwarded to `model.generate` as `num_return_sequences`.

    Returns:
        A list with one `[prefix, decoded_text, False]` entry per generated
        sequence. The trailing False is a placeholder that callers overwrite
        with the result of checking the URL.
    """
    inputs = tokenizer(prefix, return_tensors='pt')
    # Put the prompt tensors on the same device as the model so generation
    # works whether the model lives on CPU or GPU.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    tokens = model.generate(
        **inputs,
        max_length=max_length,
        min_length=min_length,
        do_sample=True,
        temperature=1,
        top_k=k,
        num_return_sequences=num_gens,
    )
    return [[prefix, tokenizer.decode(row), False] for row in tokens]


In [None]:
# URL EXPERIMENT (DEPRECATED)
seqs = generate_urls(model, tokenizer, 'https://', num_sequences=1, max_length=30, min_length=0, num_gens=200)
for s in seqs:
  # Keep only the text before the first space, then before the first newline.
  url = s[1].partition(' ')[0].partition('\n')[0]
  print(url)
  # Store the trimmed URL and whether it could be fetched back into the row.
  s[1], s[2] = url, verify_url(url)


In [51]:
import pandas as pd
# Suffix that distinguishes successive result files.
count_url = 10
df = pd.DataFrame(data=seqs, columns=['prefix', 'output', 'success'])
# NOTE(review): the file name hard-codes "1_4b" regardless of
# model_param_string — confirm the label matches the model that produced `seqs`.
df.to_csv(path_or_buf=f'url_data_1_4b_{count_url}.csv')

# TOP-N Experiment

In [None]:
import random

def generate_top_k(model, tokenizer, prefix='', num_sequences=1, max_length=20, min_length=0, k=50, num_gens=1):
  """Sample continuations with top-k sampling for one or more prompts.

  When `prefix` is the empty string, each of the `num_sequences` rounds draws
  a random token id and uses its decoded text as the prompt; otherwise the
  given `prefix` is reused for every round.

  Args:
    model: object exposing `.device`, `.config.vocab_size` and
      `.generate(**kwargs)`.
    tokenizer: callable returning a mapping of tensors for a prompt, with a
      `.decode(ids)` method.
    prefix: prompt text, or '' to pick a random single-token prompt per round.
    num_sequences: number of rounds (prompts) to run.
    max_length: forwarded to `model.generate` as `max_length`.
    min_length: forwarded to `model.generate` as `min_length`.
    k: forwarded to `model.generate` as `top_k`.
    num_gens: forwarded to `model.generate` as `num_return_sequences`.

  Returns:
    A list of `[prefix, decoded_text]` pairs, one per generated sequence.
  """
  seqs = []
  use_random_prefix = prefix == ''
  for _ in range(num_sequences):
    if use_random_prefix:
      # randrange excludes its upper bound, so the drawn id is always a valid
      # index below vocab_size (randint would occasionally return vocab_size).
      # Ids below 3 are skipped — presumably reserved tokens; TODO confirm.
      token_id = random.randrange(3, model.config.vocab_size)
      prefix = tokenizer.decode([token_id])
    inputs = tokenizer(prefix, return_tensors='pt')
    # Put the prompt tensors on the same device as the model so generation
    # works whether the model lives on CPU or GPU.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    tokens = model.generate(
      **inputs,
      max_length=max_length,
      min_length=min_length,
      do_sample=True,
      top_k=k,
      num_return_sequences=num_gens,
    )
    seqs.extend([prefix, tokenizer.decode(row)] for row in tokens)
  return seqs



In [None]:
# TOP-N Experiment
# `count` is the suffix used in the name of the results file.
count = 4
# 110 random prompts x 10 sampled continuations each, with
# min_length == max_length == 100 and top-k = 40.
seqs = generate_top_k(
    model,
    tokenizer,
    prefix="",
    num_sequences=110,
    num_gens=10,
    max_length=100,
    min_length=100,
    k=40,
)

In [None]:
# Print a header line followed by every row in `seqs`, one row per line.
print('prefix, output', *seqs, sep='\n')

In [None]:
# Save the results
import pandas as pd
# Spot-check every tenth row before writing the full set to disk.
for sampled_row in seqs[::10]:
  print(sampled_row)
df = pd.DataFrame(data=seqs, columns=['prefix', 'output'])
df.to_csv(path_or_buf=f'top_n_data{count}.csv')

In [None]:
# Report how many rows are currently held in `seqs`.
print(len(seqs))

# Wiki Experiment

In [None]:
# Tokenise a sample sentence, show its token ids, and keep the decoded text
# of its first 10 tokens as `prefix`.
sample_ids = tokenizer("i am going to the mall tomorrow can you pick up the groceries please").input_ids
print(sample_ids)
prefix = tokenizer.decode(sample_ids[:10])

[74, 717, 1469, 281, 253, 28974, 10873, 476, 368, 2619, 598, 253, 45160, 447, 4496]


In [6]:
import pandas as pd
def get_csv_data(csv_file):
    """Read `csv_file` with pandas and return it as a NumPy array without its first column."""
    table = pd.read_csv(csv_file)
    values = table.to_numpy()
    # Column 0 is dropped; every remaining column is kept in order.
    return values[:, 1:]

def get_wiki_text(example_index, prefix_length, data, tokenizer):
    """Return the stored text of one example together with a token-limited prefix.

    Args:
        example_index: row of `data` to read.
        prefix_length: maximum number of tokens kept for the prefix.
        data: indexable rows whose element 1 is the example text.
        tokenizer: callable whose result has `.input_ids`, with a
            `.decode(ids)` method.

    Returns:
        `(text, prefix)`: the text exactly as stored, and the decoded first
        `prefix_length` tokens of that text with its first two characters
        removed.
    """
    text = data[example_index][1]
    # The first two characters are skipped before tokenising — presumably a
    # leading artefact in the stored text; TODO confirm against the dataset.
    token_ids = tokenizer(text[2:]).input_ids
    prefix = tokenizer.decode(token_ids[:prefix_length])
    return text, prefix

def wiki_generation(model, tokenizer, prefix, example_index, training_text, max_length=100, k=50, min_length=0, do_sample=True):
    """Generate continuations of `prefix` and pair them with the reference text.

    Args:
        model: object exposing `.device` and `.generate(**kwargs)`.
        tokenizer: callable returning a mapping of tensors for a prompt, with
            a `.decode(ids)` method.
        prefix: prompt text to continue.
        example_index: identifier copied into every output row.
        training_text: reference text; stored with its first two characters
            removed.
        max_length: forwarded to `model.generate` as `max_length`.
        k: forwarded to `model.generate` as `top_k`.
        min_length: forwarded to `model.generate` as `min_length`.
        do_sample: forwarded to `model.generate` as `do_sample`.

    Returns:
        A list of `[example_index, prefix, decoded_text, training_text[2:]]`
        rows, one per generated sequence.
    """
    inputs = tokenizer(prefix, return_tensors='pt')
    # Follow the model's own device instead of assuming CUDA, so the same code
    # runs whether or not the model has been moved to a GPU.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    tokens = model.generate(
        **inputs,
        max_length=max_length,
        min_length=min_length,
        do_sample=do_sample,
        temperature=1,
        top_k=k,
    )
    reference = training_text[2:]
    return [[example_index, prefix, tokenizer.decode(row), reference] for row in tokens]

def convert_to_csv(rows, outfile):
    """Build the wiki-results DataFrame from `rows`, write it to `outfile` as CSV, and return it.

    Each row supplies four values, matching the columns
    'Example Index', 'Prefix', 'Output' and 'Training Text'.
    """
    header = ['Example Index', 'Prefix', 'Output', 'Training Text']
    frame = pd.DataFrame(data=rows, columns=header)
    frame.to_csv(path_or_buf=outfile)
    return frame

def get_many_wiki_generations(model, tokenizer, start_index, prefix_token_length, max_token_generation_length, do_sample, outfile, end=None, data_file='shuffled_wiki_2000.csv'):
    """Generate continuations for a range of dataset examples and save them as CSV.

    Args:
        model: passed through to `wiki_generation`.
        tokenizer: passed through to `get_wiki_text` and `wiki_generation`.
        start_index: first example index to process (inclusive).
        prefix_token_length: number of tokens kept for each prompt.
        max_token_generation_length: `max_length` used for every generation.
        do_sample: whether generation samples (passed to `wiki_generation`).
        outfile: path of the CSV file to write.
        end: one past the last example index; defaults to the number of rows
            in the dataset.
        data_file: CSV file holding the examples; defaults to the file this
            notebook has always used.

    Returns:
        The DataFrame returned by `convert_to_csv` for all generated rows.
    """
    data = get_csv_data(data_file)
    if end is None:
        end = data.shape[0]
    total = end - start_index
    rows = []
    for done, i in enumerate(range(start_index, end), start=1):
        text, prefix = get_wiki_text(i, prefix_token_length, data, tokenizer)
        rows.extend(
            wiki_generation(model, tokenizer, prefix, i, text, max_length=max_token_generation_length, do_sample=do_sample)
        )
        # Progress report every 50 processed examples.
        if done % 50 == 0:
            print(f'Completed: {done}/{total}')
    return convert_to_csv(rows, outfile)


In [None]:
# Run with sampling disabled over example indices 0..504, using 20-token
# prefixes and max_length=100 for every generation.
df = get_many_wiki_generations(
    model=model,
    tokenizer=tokenizer,
    start_index=0,
    prefix_token_length=20,
    max_token_generation_length=100,
    do_sample=False,
    outfile="wiki_sample_2_8b_20_0_to_505.csv",
    end=505,
)
# 10

In [4]:
def check_for_memorization(prefix, generated, ref_text, tokenizer, verbose=False):
    """Count how many tokens after the prefix the generation shares with the reference.

    All three texts are tokenised independently. Starting at position
    `len(prefix_ids)`, the generated and reference token ids are compared
    position by position until the first mismatch or until the shorter
    sequence ends.

    NOTE: the comparison assumes that the first `len(prefix_ids)` tokens of
    both `generated` and `ref_text` correspond to the prefix; no re-alignment
    is attempted when tokenisation differs at the prefix boundary.

    Args:
        prefix: prompt text the generation started from.
        generated: full generated text (prompt included).
        ref_text: reference text to compare against.
        tokenizer: callable whose result has `.input_ids`, with a
            `.decode(ids)` method.
        verbose: when True, print token ids 20..29 of both sequences as a
            debugging aid.

    Returns:
        `(count, reference_text, generated_text)`: the number of consecutive
        matching tokens after the prefix, followed by the reference and the
        generated token sequences decoded up to the first mismatch.
    """
    prefix_ids = tokenizer(prefix).input_ids
    generated_ids = tokenizer(generated).input_ids
    ref_ids = tokenizer(ref_text).input_ids

    if verbose:
        print('__________________________')
        print(generated_ids[20:30])
        print(ref_ids[20:30])

    start_index = len(prefix_ids)
    end = min(len(generated_ids), len(ref_ids))
    count = 0
    for i in range(start_index, end):
        if ref_ids[i] != generated_ids[i]:
            # Truncate both decoded texts at the first diverging token.
            end = i
            break
        count += 1
    return count, tokenizer.decode(ref_ids[:end]), tokenizer.decode(generated_ids[:end])


def check_all_samples(csv_file, tokenizer, prefix_token_length, model_name):
    """Score every saved generation in `csv_file` for overlap with its training text.

    Args:
        csv_file: CSV whose first column is the index and whose remaining
            columns are, in order: example id, prefix, output, training text.
        tokenizer: passed through to `check_for_memorization`.
        prefix_token_length: recorded as-is in the 'Prefix Length' column.
        model_name: recorded as-is in the 'model' column.

    Returns:
        A DataFrame with the original four values plus the matching-token
        count, the matching part of the output, the matching part of the
        reference, the prefix length and the model name.
    """
    data = pd.read_csv(csv_file, index_col=0).to_numpy()
    df_data = []
    for row in data:
        # check_for_memorization returns (count, reference_text, generated_text),
        # in that order; unpack accordingly so the two text columns are not swapped.
        count, ref, gen = check_for_memorization(row[1], row[2], row[3], tokenizer)
        df_data.append([row[0], row[1], row[2], row[3], count, gen, ref, prefix_token_length, model_name])
    return pd.DataFrame(df_data, columns=['Example Id', 'Prefix', 'Output', 'Training Text', 'Matching Token Count', 'Matching Output', 'Matching Reference', 'Prefix Length', 'model'])








In [None]:
import pandas as pd
# Score the saved wiki generations, recording prefix length 20 and model '2.8b'.
df = check_all_samples('wiki_sample_2_8b_20_0_to_505.csv', tokenizer, 20, '2.8b')

In [None]:
# Sort once by example id, show the sorted frame, and keep it as `data`.
data = df.sort_values('Example Id')
display(data)

In [10]:
# Write the frame held in `data` to disk as CSV.
data.to_csv('wiki_2_8b_20.csv')