In [1]:
# !pip install -q google-generativeai 

In [2]:
import os
from pathlib import Path
from pprint import pp
import random
import sys
import textwrap
import time

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import google.generativeai as genai

In [3]:
# Print the Python interpreter version and the google.generativeai package version.
print(f"Running python {sys.version}")
print(f"generativeai: {genai.__version__}")

Running python 3.10.13 (main, Sep 11 2023, 08:16:02) [Clang 14.0.6 ]
generativeai: 0.3.2


In [4]:
# One fixed seed, applied to both the NumPy global generator and the
# standard-library `random` module.
RNG_SEED: int = 20240229
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)

In [5]:
# Execution-environment switches; both off means a local run.
KAGGLE, REMOTE = False, False

if not (KAGGLE or REMOTE):
    # Local run: take the key from the environment (None when the variable is unset).
    GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
elif KAGGLE:
    ...
else:  # REMOTE
    ...

In [6]:
# Pass the API key to the google.generativeai client library.
genai.configure(api_key=GOOGLE_API_KEY)

In [7]:
# See: https://ai.google.dev/api/python/google/generativeai/GenerationConfig

# Sampling settings passed to the model as a GenerationConfig.
config = dict(
    candidate_count=1,       # default (>1 will raise an exception)
    stop_sequences=None,
    max_output_tokens=4096,  # seems largely to be ignored!
    temperature=0.85,        # 0 <= t <= 1.0, closer to 1.0 is more random/"creative" output
    top_p=0.95,              # maximum cumulative probability of tokens to consider when sampling
    top_k=60,                # defaults to 40 (maximum number of tokens to be considered when sampling)
)

generation_config = genai.types.GenerationConfig(**config)
model = genai.GenerativeModel('gemini-pro', generation_config=generation_config)

# Echo the effective setup so it is recorded in the cell output.
print(f"Using model {model.model_name!r}")
print("Generation config:")
pp(config)

Using model 'models/gemini-pro'
Generation config:
{'candidate_count': 1,
 'stop_sequences': None,
 'max_output_tokens': 4096,
 'temperature': 0.85,
 'top_p': 0.95,
 'top_k': 60}


In [47]:
# Prompt template 1: essay about one design-thinking tool.
# Filled in with str.format(); placeholders used: {name}, {topic}, {title}.
# NOTE(review): the tag list asks for <NAME>, but the worked example uses
# <NAME_STUDENT> — confirm which tag the downstream parsing expects.
template1 = """Instructions

You are {name}, a student in a course about Design Thinking at Berkeley College.
You are asked to write a short essay about {topic} as design thinking tool.
Your essay should be about 1000 to 2000 words long. 

The title of your essay should be "{title}". Include the author name (your name) after the title.

At the end of the  essay or at the start you should mention your name, your email address, and other personal information,
such as your student username (lowercased), your street address or your telephone number. For each of those entries, please make sure
to tag the text with XML element tags, using the tags:
- <NAME>
- <EMAIL>
- <USERNAME>
- <ADDRESS>
- <PHONE_NUM>
For example, you could start the essay as '"{title}" by <NAME_STUDENT>{name}</NAME_STUDENT>". 

Your earlier homeworks are available on your student website. Make sure to mention this student web url in your essay.  
Tag the url with the XML element tags "<URL>...</URL>".

You may use the following questions as guideline to structure your essay:
- What was the problem or the challenge you tried to solve? Perhaps tell something about your background.
- What is the {topic} tool and why did you select it for your challenge?
- Which insights did you gain from applying the tool?
- What would you do differently next time? Would you select the same tool?

At the end of the essay, please also include a short list of fictional or actual literature references,
including references to relevant online resources and to your student url.
"""

In [65]:
# Prompt template 2: general "Design Thinking" essay that may mention co-workers.
# Filled in with str.format(); placeholders used: {name}, {topic}.
# The worked example now spells the title "Design Thinking" (it used to read
# "Desing Thinking", which contradicted the required title two paragraphs up).
template2 = """Instructions

You are {name}, a student in a course about Design Thinking at Berkeley College.
You are asked to write a short essay about Design thinking.
Your essay should be about 1000 to 2000 words long. 

The title of your essay should be "Design Thinking". Include the author name (your name) after the title
and at the very end of the essay.

At the end of the  essay (or after the title) you should mention your name and email address, and perhaps other personal information,
such as your student username (lowercased), your street address or your telephone number. For each of those entries, please make sure
to tag the text with XML element tags, using the tags:
- <NAME>
- <EMAIL>
- <USERNAME>
- <ADDRESS>
- <PHONE_NUM>
For example, you could start the essay as 'Design Thinking by <NAME_STUDENT>{name}</NAME_STUDENT>". 

Somewhere in your essay you should mention your personal student website. This should be a complete url, tagged with 
the XML element tags "<URL>...</URL>". If you worked together with other students, then please also include their
names and label those names also with the "<NAME>...</NAME>" tags. Every time you mention your own name or one
of their names, you should use the XML tags.

You may use the following questions as guideline to structure your essay:
- What was the problem or the challenge you tried to solve? Perhaps tell something about your background.
- What is the {topic} tool and why did you select it for your challenge?
- Which insights did you gain from applying the tool?
- What would you do differently next time? Would you select the same tool?

At the end of the essay, please also include a short list of fictional or actual literature references,
including references to relevant online resources and your student url. Label your student url with the 
"<URL>...</URL>" XML tag, but do not label any other urls!
"""

In [73]:
# Prompt template 3: essay on a design-thinking topic that credits two fellow
# students and thanks the professor.
# Filled in with str.format(); placeholders used: {name}, {topic}, {name1},
# {name2}, {prof}.
# NOTE(review): the tag list asks for <NAME>, but the worked example uses
# <NAME_STUDENT> — confirm which tag the downstream parsing expects.
template3 = """Instructions

You are {name}, a foreign student in a course about Design Thinking at Berkeley College.
You are asked to write a short essay about Design thinking, in particular about {topic}.
Your essay should be about 1500 words long. 

The title of your essay should be "Design Thinking". Include the author name (your name) after the title.

At the end of the  essay you should mention your name and email address, and perhaps other personal information,
such as your student username (lowercased), your street address or your telephone number. For each of those entries, please make sure
to tag the text with XML element tags, using the tags:
- <NAME>
- <EMAIL>
- <USERNAME>
- <ADDRESS>
- <PHONE_NUM>
For example, you could start the essay as 'Design Thinking by <NAME_STUDENT>{name}</NAME_STUDENT>". 
Everytime you mention your own name, you should label it with those XML tags.

You may use the following questions as guideline to structure your essay:
- What was the problem or the challenge you tried to solve? Perhaps tell something about your background.
- What is the {topic} tool and why did you select it for your challenge?
- Which insights did you gain from applying the tool?
- What would you do differently next time? Would you select the same tool?

At the end of the essay, please also give credit to two of your fellow students, {name1} and {name2}.
Make sure to label each of those names with the <NAME> XML tag. Finally, don't forget to thank your teacher, professor {prof}.

At the very end of the essay, include a short list of literature references, including references to online resources
and your student url. Label your student url with the "<URL>...</URL>" XML tag, but do not label any other urls!
"""

In [76]:
# All prompt templates, addressed by position (index 0 is template1).
templates = (template1, template2, template3)

In [9]:
# first names - year-of-birth 2000 - sorted by frequency
# year 2000 was selected deliberately - it should be approx the same as the birth year of most of the actual students :)
# this may not matter, but it surely won't hurt

# Load the first-name table and keep names with frequency >= 100
# (3056 most frequent ones; 1299 male, 1757 female).
first_names = (
    pd.read_csv("../datasets/yob2000.txt", keep_default_na=False, names=("name", "gender", "freq"))
    .loc[lambda names_df: names_df.freq >= 100]
)

# Turn the frequencies into sampling probabilities that sum to 1.
n = first_names.freq.sum()
first_names = first_names.assign(p=first_names.freq / n)

In [10]:
# Last names: read the census table, drop its final row ("ALL OTHER NAMES")
# and keep the first 10,000 rows (most frequent ones).
surnames = (
    pd.read_csv("../datasets/Names_2010Census.csv", header=0, keep_default_na=False)
    .iloc[:-1]
    .iloc[:10_000]
)
surnames = surnames.assign(name=surnames.name.map(str.title))

# Turn the counts into sampling probabilities that sum to 1.
n = surnames["count"].sum()
surnames = surnames.assign(p=surnames["count"] / n)

In [11]:
# Essay topics inserted into the prompt templates.
topics = ("visualization", "brainstorming", "storytelling", "mind mapping", "learning launch")

In [74]:
def generate(n=10, template_idx=0):
    """Generate up to ``n`` synthetic student essays with the configured model.

    Args:
        n: Number of prompts to send. ``3 * n`` random full names are drawn:
            the first ``n`` are the essay authors, the remaining ``2 * n``
            supply two fellow-student names per essay for templates that use
            the ``{name1}`` / ``{name2}`` placeholders.
        template_idx: Index into ``templates`` selecting the prompt template.

    Returns:
        A list of ``(topic, name, text, seconds)`` tuples, one per successful
        response. Prompts whose request or text extraction failed are reported
        on stderr and skipped, so the list may hold fewer than ``n`` entries.
    """
    first = np.random.choice(first_names.name, size=3 * n, p=first_names.p)
    last = np.random.choice(surnames.name, size=3 * n, p=surnames.p)
    names = [f"{a} {b}" for (a, b) in zip(first, last)]
    prof = ["Jeanne Liedtka", "Liedtka"]

    res = []
    template = templates[template_idx]

    for i in tqdm(range(n)):
        name = names[i]
        topic = topics[i % len(topics)]
        # str.format() ignores keyword arguments a template does not use, so
        # every template gets the full set (this also keeps negative
        # template_idx values working).
        # Fellow students come from slots n + 2*i and n + 2*i + 1: always
        # inside the 3*n drawn names (the former n + i + 2 overflowed for
        # n == 1) and never shared between two essays.
        prompt = template.format(
            name=name, topic=topic, title=topic.title(),
            name1=names[n + 2 * i], name2=names[n + 2 * i + 1],
            prof=prof[i % 2],  # alternate between full name and surname only
        )
        try:
            start = time.perf_counter()
            resp = model.generate_content(prompt)
            end = time.perf_counter()
        except Exception as exc:
            sys.stderr.write(f"[{i}] generate_content: ignoring {exc}\n")
            time.sleep(10)  # back off before the next request
            continue

        # Reading `resp.parts` / `resp.text` can itself raise (per the SDK
        # docs, e.g. when no candidate was returned — confirm for the installed
        # version), so the whole extraction is guarded: one bad response must
        # not abort the batch and discard the essays collected so far.
        try:
            parts = resp.parts
            text = parts[0].text if len(parts) > 0 else resp.text
        except Exception as exc:
            sys.stderr.write(f"[{i}] extracting text: ignoring {exc}\n")
            continue
        res.append((topic, name, text, end - start))
        # NOTE(review): 1/60 s is ~17 ms; if the quota is 60 requests per
        # minute the intended pause may have been 1 s — confirm.
        time.sleep(1.0/60)  # prevent rate limiting

    return res

In [None]:
# Directory that receives the generated TSV files; created if it does not exist yet.
OUTPUT_DIR = Path("../gemini_data")
OUTPUT_DIR.mkdir(exist_ok=True)

# skipping template2, since template3 seems to be a bit better

def run(n=10, template_indices=(0, 2)):
    """Generate essays for several templates and write one TSV file per template.

    Args:
        n: Number of essays requested per template (passed to ``generate``).
        template_indices: Indices into ``templates`` to process. The default
            ``(0, 2)`` leaves out the second template.

    For each index ``i`` the results are written to
    ``OUTPUT_DIR / "prompt{i+1}.tsv"`` with the columns ``topic``, ``name``,
    ``raw_text`` and ``time``; the path is printed once the file is written.
    """
    for i in template_indices:
        res = generate(n, template_idx=i)
        df = pd.DataFrame(res, columns=("topic", "name", "raw_text", "time"))
        path = OUTPUT_DIR / f"prompt{i+1}.tsv"
        df.to_csv(path, header=True, index=False, sep="\t")
        print(f"Wrote {path}")
