In [1]:
from transformers import AutoTokenizer
import transformers
import torch
import utils

torch.cuda.empty_cache()

In [64]:
import re

In [2]:
# Checkpoint identifier; the same string is handed to both the tokenizer and the pipeline.
model = "meta-llama/Llama-2-13b-chat-hf"
# Stand-alone tokenizer: later cells use it to count/truncate tokens and to read eos_token_id.
tokenizer = AutoTokenizer.from_pretrained(model)
#tokenizer.pad_token_id = tokenizer.eos_token_id
# Text-generation pipeline with float16 weights.
# NOTE(review): device_map="auto" presumably lets the library choose device placement — confirm.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [120]:
def prompt_llama2(prompt):
    """Send one user message to the Llama-2 chat pipeline and return its reply.

    The message is wrapped in the Llama-2 chat layout (system prompt between
    ``<<SYS>>`` markers, the whole turn enclosed in ``[INST] ... [/INST]``) and
    sampled once through the module-level ``pipeline``.

    Args:
        prompt: The user message placed after the system prompt.

    Returns:
        A ``(reply, sequences)`` tuple. ``reply`` is the generated text with the
        echoed prompt removed and surrounding whitespace stripped; ``sequences``
        is the raw list returned by the pipeline call.
    """
    # Built from plain literals so the function's own indentation does not leak
    # into the prompt (a triple-quoted string inside the body would embed it).
    system_prompt = (
        "You are a helpful, respectful and honest assistant. "
        "Always answer as helpfully as possible, while being safe.  "
        "Your answers should not include any harmful, unethical, racist, sexist, "
        "toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature. "
        "If a question does not make any sense, or is not factually coherent, "
        "explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information."
    )
    prompt_template = f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]'

    sequences = pipeline(
        prompt_template,
        do_sample=True,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        #max_length=2048,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
    )
    full_text = sequences[0]['generated_text']

    # Strip the echoed prompt by length rather than splitting on '[/INST]':
    # the marker may also occur inside the user message or inside the reply,
    # in which case split(...)[1] would return the wrong fragment.
    if full_text.startswith(prompt_template):
        reply = full_text[len(prompt_template):]
    else:
        # Prompt not echoed verbatim: keep whatever follows the last marker,
        # or the whole text when no marker is present (no IndexError).
        reply = full_text.rpartition('[/INST]')[2]
    return reply.strip(), sequences

def count_tokens(prompt):
    """Return how many tokens the module-level ``tokenizer`` splits ``prompt`` into."""
    return len(tokenizer.tokenize(prompt))

def shorten_tokens(prompt, max_tokens=4000):
    """Truncate ``prompt`` to at most ``max_tokens`` tokens.

    The text is tokenized with the module-level ``tokenizer``, cut to the first
    ``max_tokens`` tokens, and converted back to a string.

    Args:
        prompt: Text to shorten.
        max_tokens: Maximum number of tokens to keep. Defaults to 4000, the
            limit that was previously hard-coded.

    Returns:
        The string rebuilt from the kept tokens.

    Raises:
        ValueError: If ``max_tokens`` is negative (a negative slice bound would
            silently drop tokens from the end instead of limiting the length).
    """
    if max_tokens < 0:
        raise ValueError(f"max_tokens must be non-negative, got {max_tokens}")
    kept_tokens = tokenizer.tokenize(prompt)[:max_tokens]
    return tokenizer.convert_tokens_to_string(kept_tokens)

### Try to reproduce the Automatic Prompt Engineer (APE) example

![APE workflow diagram](./pictures/ape_workflow.png "APE Workflow")

In [30]:
# Demonstration inputs; paired by index with `antonyms` below.
words = ["sane", "direct", "informally", "unpopular", "subtractive", "nonresidential",
    "inexact", "uptown", "incomparable", "powerful", "gaseous", "evenly", "formality",
    "deliberately", "off"]
# Expected output for the word at the same position in `words`.
antonyms = ["insane", "indirect", "formally", "popular", "additive", "residential",
    "exact", "downtown", "comparable", "powerless", "solid", "unevenly", "informality",
    "accidentally", "on"]

# Prompt used to ask the model for candidate instructions; the [full_DEMO]
# placeholder is replaced with the formatted demonstrations in later cells.
prompt_gen_template = """I gave a friend an instruction. Based on the instruction they produced the following input-output pairs:
\n\n[full_DEMO]\n\n"""#The instruction was to [APE]"""

# Template with [PROMPT], [INPUT] and [OUTPUT] placeholders; not referenced by the cells visible here.
eval_template = 'Instruction: [PROMPT]\nInput: [INPUT]\nOutput: [OUTPUT]'

# Layout of a single demonstration; [INPUT] and [OUTPUT] are substituted per pair.
demos_template = 'Input: [INPUT]\nOutput: [OUTPUT]'

#### Generating Proposals

In [107]:
n = 5  # number of candidate instructions requested from the model in the next cell
# One formatted demonstration per word/antonym pair, separated by single
# newlines with no trailing newline after the last pair.
demo = '\n'.join(
    demos_template.replace('[INPUT]', words[idx]).replace('[OUTPUT]', antonyms[idx])
    for idx in range(len(words))
)
print(demo)

Input: sane
Output: insane
Input: direct
Output: indirect
Input: informally
Output: formally
Input: unpopular
Output: popular
Input: subtractive
Output: additive
Input: nonresidential
Output: residential
Input: inexact
Output: exact
Input: uptown
Output: downtown
Input: incomparable
Output: comparable
Input: powerful
Output: powerless
Input: gaseous
Output: solid
Input: evenly
Output: unevenly
Input: formality
Output: informality
Input: deliberately
Output: accidentally
Input: off
Output: on


In [108]:
# Request for `n` bracketed candidate instructions. Written as explicit
# literals so the significant trailing space after "For example:" is visible.
proposal_request = (
    f"Please generate {n} possible instructions and display them as a list. "
    "In the output, surround each instruction with square brackets. For example: \n"
    "Instruction 1: [this is the first instruction]\n"
    "Instruction 2: [this is the second instruction]\n"
    "..and so on"
)
# Demonstrations filled into the generation template, followed by the request.
prompt = prompt_gen_template.replace('[full_DEMO]', demo) + proposal_request
print(prompt)

I gave a friend an instruction. Based on the instruction they produced the following input-output pairs:


Input: sane
Output: insane
Input: direct
Output: indirect
Input: informally
Output: formally
Input: unpopular
Output: popular
Input: subtractive
Output: additive
Input: nonresidential
Output: residential
Input: inexact
Output: exact
Input: uptown
Output: downtown
Input: incomparable
Output: comparable
Input: powerful
Output: powerless
Input: gaseous
Output: solid
Input: evenly
Output: unevenly
Input: formality
Output: informality
Input: deliberately
Output: accidentally
Input: off
Output: on

Please generate 5 possible instructions and display them as a list. In the output, surround each instruction with square brackets. For example: 
Instruction 1: [this is the first instruction]
Instruction 2: [this is the second instruction]
..and so on


In [121]:
# prompt_llama2 returns (reply text, raw pipeline result); both are kept because
# later cells parse `out` and inspect `sequences`.
out, sequences = prompt_llama2(prompt)
print(out)

Sure, I can help you with that! Here are five possible instructions for your friend, along with their corresponding output pairs:

Instruction 1: [Reverse the given word]
Input: sane
Output: insane
Input: direct
Output: indirect
Input: informally
Output: formally
Input: unpopular
Output: popular
Input: subtractive
Output: additive
Input: nonresidential
Output: residential
Input: inexact
Output: exact
Input: uptown
Output: downtown
Input: incomparable
Output: comparable
Input: powerful
Output: powerless
Input: gaseous
Output: solid
Input: evenly
Output: unevenly
Input: formality
Output: informality
Input: deliberately
Output: accidentally
Input: off
Output: on

Instruction 2: [Replace each letter with the letter that is three positions ahead of it in the alphabet]
Input: sane
Output: yenas
Input: direct
Output: tirect
Input: informally
Output: mformally
Input: unpopular
Output: pular
Input: subtractive
Output: tadditive
Input: nonresidential
Output: nresidential
Input: inexact
Output: x

In [110]:
# Collect every non-greedy [...] span on a single line of the reply and drop
# the surrounding brackets from each match.
proposals = [bracketed[1:-1] for bracketed in re.findall(r'\[.*?\]', out)]
proposals

['Use the opposite word of the given adjective to describe the noun.',
 'Use a synonym of the given word to describe the noun.',
 'Use a word that is the opposite of the given word to describe the noun.',
 'Use a word that is related to the given word to describe the noun.',
 'Use a word that is the same as the given word, but with a different ending to describe the noun.']

#### Evaluating proposals

In [123]:
# NOTE(review): `sequences` is the second value returned by prompt_llama2 (the raw
# pipeline result). The recorded output shows it is a list without a `.scores`
# attribute, so this cell raises AttributeError as written.
sequences.scores

AttributeError: 'list' object has no attribute 'scores'

#### Resampling/Iterative Monte Carlo

In [83]:
# Ask the model to paraphrase the first proposal. Explicit literals keep the
# trailing space after the instruction sentence visible.
resample_request = (
    f"\nThe instruction was to {proposals[0]}. \n"
    "Generate a new variation of the given instruction while keeping the semantic meaning, "
    "and surround the new instruction with square brackets.\n"
    "Only print the new instruction, and do not give the original instruction back."
)
# Demonstrations filled into the generation template, followed by the request.
prompt = prompt_gen_template.replace('[full_DEMO]', demo) + resample_request
print(prompt)

I gave a friend an instruction. Based on the instruction they produced the following input-output pairs:


Input: sane
Output: insane
Input: direct
Output: indirect
Input: informally
Output: formally
Input: unpopular
Output: popular
Input: subtractive
Output: additive
Input: nonresidential
Output: residential
Input: inexact
Output: exact
Input: uptown
Output: downtown
Input: incomparable
Output: comparable
Input: powerful
Output: powerless
Input: gaseous
Output: solid
Input: evenly
Output: unevenly
Input: formality
Output: informality
Input: deliberately
Output: accidentally
Input: off
Output: on


The instruction was to Reverse the output of the previous instruction. 
Generate a new variation of the given instruction while keeping the semantic meaning, and surround the new instruction with square brackets.
Only print the new instruction, and do not give the original instruction back.


In [84]:
# prompt_llama2 returns a (reply text, raw pipeline result) tuple. Unpack it so
# `out` is the reply string that the regex extraction in the following cell
# expects; binding the whole tuple makes re.findall fail on a fresh run.
out, sequences = prompt_llama2(prompt)
print(out)

Sure! Here's a new variation of the instruction that reverses the output of the previous instruction, while keeping the semantic meaning:

[Reverse the output of the following input-output pairs, while maintaining their original meaning.]

Here are the input-output pairs:

Input: sane
Output: insane (reverse: "insane" becomes "sane")
Input: direct
Output: indirect (reverse: "indirect" becomes "direct")
Input: informally
Output: formally (reverse: "formally" becomes "informally")
Input: unpopular
Output: popular (reverse: "popular" becomes "unpopular")
Input: subtractive
Output: additive (reverse: "additive" becomes "subtractive")
Input: nonresidential
Output: residential (reverse: "residential" becomes "nonresidential")
Input: inexact
Output: exact (reverse: "exact" becomes "inexact")
Input: uptown
Output: downtown (reverse: "downtown" becomes "uptown")
Input: incomparable
Output: comparable (reverse: "comparable" becomes "incomparable")
Input: powerful
Output: powerless (reverse: "pow

In [86]:
# First [...] span in the reply with its brackets removed; indexing raises
# IndexError when the reply contains no bracketed text.
first_bracketed = re.findall(r'\[.*?\]', out)[0]
first_bracketed[1:-1]

'Reverse the output of the following input-output pairs, while maintaining their original meaning.'

### Test on a citation-aware summarization example

In [124]:
urls = ['https://cs.illinois.edu/research/areas/artificial-intelligence',
        'https://cs.illinois.edu/research/undergraduate-research',
        'https://cs.illinois.edu/corporate#research']
from trafilatura import fetch_url, extract
# Maps each URL's position in `urls` to the result of
# utils.extract_paragraphs_trafilatura for that URL.
# Alternatives left for reference: utils.extract_paragraphs(url), or
# extract(fetch_url(url), favor_precision=True).
paragraphs_dict = {
    idx: utils.extract_paragraphs_trafilatura(url)
    for idx, url in enumerate(urls)
}

In [125]:
# Join each page's extracted pieces and replace double spaces with single ones.
passage_texts = [
    ''.join(paragraphs_dict[idx]).replace('  ', ' ')
    for idx in range(len(urls))
]
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# a later cell reads it when building the demonstration.
input = ''.join(
    f'Here is passage number {number}: "{text}"\n\n'
    for number, text in enumerate(passage_texts, start=1)
)

In [126]:
# Reference summary with [Passage N] citation markers; a later cell substitutes
# it for the [OUTPUT] placeholder of the demonstration.
output = """The University of Illinois has a world-renowned computer science department that offers various research opportunities for undergraduate students [Passage 2]. Additionally, the university boasts one of the largest computer science departments in the nation, providing students with a broad range of academic and research experiences [Passage 3]. The department has a strong reputation for revolutionary technology and groundbreaking research, addressing real-world problems in various fields, including medicine, engineering, business, and science [Passage 3]. The computer science department offers various research collaborations and recruitment opportunities for companies, including the Corporate Connection program, which provides a main point of entry for partnering with the department [Passage 3]. Companies can also sponsor senior design projects, which give company representatives the opportunity to pitch their projects to all enrolled students at the start of the fall semester [Passage 3]. Furthermore, the department has a global reputation for developing innovative technology, and its faculty and students are bringing their expertise to bear on many of society's most challenging problems [Passage 3]. Overall, the University of Illinois's computer science department is a great place for companies to find talented and highly educated individuals for recruitment and research collaborations. With its strong reputation, cutting-edge research, and various opportunities for student involvement, the department offers a unique and valuable experience for both companies and students [Passage 3]."""
output

"The University of Illinois has a world-renowned computer science department that offers various research opportunities for undergraduate students [Passage 2]. Additionally, the university boasts one of the largest computer science departments in the nation, providing students with a broad range of academic and research experiences [Passage 3]. The department has a strong reputation for revolutionary technology and groundbreaking research, addressing real-world problems in various fields, including medicine, engineering, business, and science [Passage 3]. The computer science department offers various research collaborations and recruitment opportunities for companies, including the Corporate Connection program, which provides a main point of entry for partnering with the department [Passage 3]. Companies can also sponsor senior design projects, which give company representatives the opportunity to pitch their projects to all enrolled students at the start of the fall semester [Passage

In [127]:
n = 3  # number of candidate instructions to request
# Single demonstration: the concatenated passages as input, the reference summary as output.
demo = demos_template.replace('[INPUT]', input).replace('[OUTPUT]', output)
# Task hint plus the request for `n` bracketed instructions. Explicit literals
# keep the trailing space after "For example:" visible.
summary_request = (
    "The overall task from the input to output is to summarize and provide citations "
    "to the original passages. "
    f"Please generate {n} possible instructions and display them as a list. "
    "In the output, surround each instruction with square brackets. For example: \n"
    "Instruction 1: [this is the first instruction]\n"
    "Instruction 2: [this is the second instruction]\n"
    "..and so on"
)
prompt = prompt_gen_template.replace('[full_DEMO]', demo) + summary_request
print(prompt)

I gave a friend an instruction. Based on the instruction they produced the following input-output pairs:



Here is passage number 2: "Apply to UIUC Make a Gift Undergraduates at Illinois Computer Science are an important part of our world-renowned research. From summer programs to paid research positions with faculty, there are multiple ways for our students to contribute to high impact research early in their careers. No events found SE 290 - Jeffrey Mikulina 151 Loomis Mathematical and Theoretical Physics Seminar: Generalized Black Hole Entropy is Von Neumann Entropy Gautam Satischandran, Princeton University Loomis Room 464 Hard Materials Seminar - "FC-STEM Analysis-Based K-Means Clustering" Hanyu Hou (Zuo) 100 Materials Science and Engineering Building, 1304 W. Green Street Hard Materials Seminar - "Advancing Glow Plug Materials: Enhancing Silicon Nitride Oxidation Resistance with Ca2+ Stabilizer" Prapassorn (Gloy) Numkiatsakul (Kriven) PhD Final Defense – Minsoo Sung Newmark Conf

In [105]:
# prompt_llama2 returns a (reply text, raw pipeline result) tuple. Unpack it so
# `out` holds only the reply string and print() shows the text rather than the
# tuple repr.
out, sequences = prompt_llama2(prompt)
print(out)

Sure! Here are three possible instructions based on the input passages:

Instruction 1: [Summarize the key points of the passage, highlighting the university's strong reputation in computer science and the various research and recruitment
