Note: The script uses the Berkeley Neural Parser (benepar) to parse the generated instructions, and visualizes the results using Plotly.

Please make sure to install benepar following their documentation [here](https://github.com/nikitakit/self-attentive-parser#installation).

In [21]:
# One-time model download (uncomment and run once):
# import benepar
# benepar.download('benepar_en3')

In [22]:
import benepar, spacy
# Load the spaCy English pipeline that supplies the POS tags, lemmas and
# dependency arcs used by the helper functions in this notebook.
nlp = spacy.load('en_core_web_md')

# Register the benepar constituency parser. spaCy 2.x expects a component
# instance, whereas later versions take the registered factory name plus config.
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

# Parse the sample text only AFTER the pipeline is complete; creating the doc
# before add_pipe (as before) leaves it without any benepar annotations.
doc = nlp("The time for action is now. It's never too late to do something.")

In [23]:
def find_root_verb_and_its_dobj(tree_root):
    """Find the top-most verb under ``tree_root`` and its direct-object noun.

    The search is depth-first and pre-order: ``tree_root`` itself is checked
    first, then each child subtree in order, and the first verb found wins.

    Args:
        tree_root: A token-like object exposing ``pos_``, ``dep_``, ``lemma_``
            and an iterable ``children`` (e.g. a spaCy token).

    Returns:
        A ``(verb_lemma, noun_lemma)`` tuple. ``noun_lemma`` is ``None`` when
        the verb has no child with ``dep_ == "dobj"`` and ``pos_ == "NOUN"``.
        ``(None, None)`` is returned when no verb exists in the subtree.
    """
    # First check whether the current node itself is the verb we are after.
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None

    # Otherwise search every child subtree. Returning the first child's result
    # unconditionally (the previous behaviour) ignored all later siblings, so a
    # verb sitting under the second or third child was never found.
    for child in tree_root.children:
        verb, noun = find_root_verb_and_its_dobj(child)
        if verb is not None:
            return verb, noun

    # No verb anywhere in this subtree.
    return None, None

def find_root_verb_and_its_dobj_in_string(s):
    """Parse ``s`` and extract the root verb / direct object of its first sentence.

    Args:
        s: The instruction text to analyze. Only the first sentence produced
            by the module-level ``nlp`` pipeline is inspected.

    Returns:
        The ``(verb_lemma, noun_lemma)`` tuple from
        ``find_root_verb_and_its_dobj`` for the first sentence's root, or
        ``(None, None)`` when ``s`` is blank or yields no sentence.
    """
    # A blank string has nothing to parse; skip the pipeline entirely.
    if not s.strip():
        return None, None
    doc = nlp(s)
    # Take the first sentence without materializing the rest; the default
    # guards against a doc with no sentences instead of raising IndexError.
    first_sent = next(iter(doc.sents), None)
    if first_sent is None:
        return None, None
    return find_root_verb_and_its_dobj(first_sent.root)

# Quick smoke test on a simple imperative instruction; as the last expression
# in the cell, its (verb, noun) result is displayed as the cell output.
find_root_verb_and_its_dobj_in_string("Write me a story about education.")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

<class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.



('write', 'story')

In [24]:
def vis_instructions(sentences, figure_path=None, num_verbs=20, nouns_per_verb=4):
    """Plot a verb -> direct-object sunburst for a collection of instructions.

    Each sentence is reduced to a (root verb, direct-object noun) pair via
    ``find_root_verb_and_its_dobj_in_string``. The most frequent verbs are
    kept, and for each of them the most frequent nouns are drawn as the outer
    ring of a Plotly sunburst chart.

    Args:
        sentences: Iterable of instruction strings.
        figure_path: If ``None`` the figure is shown interactively; otherwise
            it is written to this path with ``fig.write_image``.
        num_verbs: How many of the most frequent verbs to keep (default 20).
        nouns_per_verb: How many of the most frequent nouns to keep for each
            verb (default 4).

    Returns:
        A pandas DataFrame with columns ``verb``, ``noun`` and ``text`` holding
        every sentence for which both a verb and a noun were found.
    """
    from tqdm import tqdm
    import pandas as pd
    import plotly.express as px
    import plotly.io as pio

    raw_phrases = []
    for text in tqdm(sentences):
        try:
            verb, noun = find_root_verb_and_its_dobj_in_string(text)
        except Exception as e:
            # Best-effort: report the failing sentence and keep going so one
            # unparsable instruction does not abort the whole run.
            print(e)
            print(text)
            continue
        raw_phrases.append({
            "verb": verb,
            "noun": noun,
            "text": text,
        })

    # Fix the column set explicitly so that an empty result still yields a
    # frame with the expected columns instead of a column-less one.
    phrases = pd.DataFrame(raw_phrases, columns=["verb", "noun", "text"]).dropna()
    if phrases.empty:
        print("No verb-noun pairs found; nothing to plot.")
        return phrases

    # Keep only the rows whose verb is among the most frequent ones.
    top_verbs = phrases.groupby("verb").size().nlargest(num_verbs).index
    df = phrases[phrases["verb"].isin(top_verbs)]

    # Count every (verb, noun) pair, then keep the most frequent nouns of each
    # verb. Sorting first and taking groupby.head avoids groupby.apply, whose
    # handling of the grouping column is deprecated in recent pandas releases.
    df = df.groupby(["verb", "noun"]).size().reset_index(name="count")
    df = (
        df.sort_values(["verb", "count"], ascending=[True, False])
        .groupby("verb")
        .head(nouns_per_verb)
        .reset_index(drop=True)
    )
    print(df)

    # Disable MathJax in the kaleido exporter (commonly used to keep a MathJax
    # loading message out of exported files). `scope` is None when kaleido is
    # not installed, so guard the assignment to keep fig.show() usable.
    kaleido_scope = getattr(pio.kaleido, "scope", None)
    if kaleido_scope is not None:
        kaleido_scope.mathjax = None

    fig = px.sunburst(df, path=['verb', 'noun'], values='count')
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        font_family="Times New Roman",
        width=400,
        height=400,
    )
    if figure_path is None:
        fig.show()
    else:
        fig.write_image(figure_path)

    return phrases


In [27]:
# Select the instruction file to analyze by leaving exactly one path uncommented.
# instruction_path = "../../scripts/step1_generate_instructions/outputs/instructions.gpt-4-0314.1000.txt"
instruction_path = "../../scripts/step1_generate_instructions/outputs/instructions.gpt-35-turbo-0301.1000.txt"
# instruction_path = "../../scripts/step1_generate_instructions/outputs/instructions.llama-2-7b-chat.100.txt"
# instruction_path = "../../scripts/step1_generate_instructions/outputs/instructions.llama-2-70b-chat.100.txt"

# Read the file as JSON Lines: one JSON object per line with an "instruction" field.
import json
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and skip blank lines (e.g. a trailing newline) that would
# otherwise make json.loads fail.
with open(instruction_path, "r", encoding="utf-8") as f:
    list_sentences = [json.loads(line)["instruction"] for line in f if line.strip()]
# randomly select 100
# import random
# random.seed(0)
# list_sentences = random.sample(list_sentences, 100)
print(len(list_sentences))
vis_instructions(list_sentences)

100



<class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.

100%|██████████| 100/100 [00:03<00:00, 27.65it/s]


        verb          noun  count
0    analyze   correlation      1
1   describe       process      5
2   describe   development      2
3   describe       dataset      1
4   describe     structure      1
5    explain  significance      4
6    explain        reason      2
7    explain  relationship      2
8    explain          role      2
9    provide      protocol     18
10   provide      analysis     13
11   provide   description     11
12   provide      overview      3


Unnamed: 0,verb,noun,text
0,provide,explanation,Provide a detailed explanation of the importan...
1,provide,protocol,Provide a detailed protocol for expanding and ...
2,provide,protocol,Provide a detailed protocol for measuring the ...
3,provide,description,Provide a detailed description of the study de...
4,provide,description,Provide a detailed description of the improvem...
...,...,...,...
95,describe,process,Describe the process of optimizing the culture...
96,explain,importance,Explain the importance of secondary metabolite...
97,explain,method,Explain the method of deconvolution and how it...
98,explain,concept,Explain the concept of efficient dendritic lea...
