# Lab 7 Data Generation: Unstructured Data Review

This notebook generates the synthetic dataset for Lab 7. We use Python for the structured fields and Google's Gemini Flash model to generate messy, realistic unstructured notes.

In [9]:
%run ../../code/imports.ipynb

from google import genai
import json
import random
import string

Set LOG_LEVEL="INFO" before running the import file to get moar output.
Set LOG_FORMAT to change log format.
Numpy (np): 2.4.2
Scipy (sp, stats): 1.17.0
Pandas (pd): 3.0.0
MatPlotLib (mpl, plt): 3.10.8
Seaborn (sns): 0.13.2
Scikit-Learn (sk): 1.8.0
Statsmodels (sm): 0.14.6
Patsy (pt): 1.0.2
SQLAlchemy (sa): 2.0.46
ROOT_DIR: 5150_Analytics
CODE_DIR: 5150_Analytics\code
DATA_DIR: 5150_Analytics\data
FIG_DIR: 5150_Analytics\labs_hw\week7_unstructured-review\figures


In [2]:
# 1. Setup Gemini API
# Adjust the path logic if your folder structure differs.
# `client` is bound to None up front so the name always exists after this cell,
# even when setup fails; later code can test `client is None` instead of
# tripping over a NameError.
client = None
try:
    # Navigate five directories up from the current working directory
    # (labs_hw/week7... up to Documents/), then down to the key file.
    key_file = Path(".").absolute().parent.parent.parent.parent.parent / "Programming/Python/gemini/gemini_api_key.json"
    with open(key_file) as f:
        keys = json.load(f)
    # The key file stores a list under "GEMINI_API_KEYS"; the second entry is used.
    os.environ["GEMINI_API_KEY"] = keys["GEMINI_API_KEYS"][1]
except Exception as e:
    print(f"Failed to load API key: {e}")
else:
    # Client construction is reported separately so a client-side failure is
    # not misattributed to the key file.
    try:
        # Called without arguments; presumably it reads GEMINI_API_KEY from the
        # environment set just above -- confirm against the google-genai docs.
        client = genai.Client()
        print("Gemini Client initialized.")
    except Exception as e:
        print(f"Failed to initialize Gemini client: {e}")

Gemini Client initialized.


In [None]:
# generate a uid
def generate_uid():
    """Return an 8-character identifier made of uppercase ASCII letters and digits."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=8)
    return "".join(picked)

'i\x05\x19BAcz/\x0c\x1f_'

In [119]:
# 2. Define Data Generators for Structured Columns
def generate_random_date(num_dates=1, start_year=2023, end_year=2025):
    """Draw random calendar dates formatted as ``YYYY-MM-DD`` strings.

    Dates are drawn uniformly from January 1 of ``start_year`` through
    December 31 of ``end_year``, both ends inclusive.

    Args:
        num_dates: How many dates to draw.
        start_year: First year of the range.
        end_year: Last year of the range.

    Returns:
        A single string when ``num_dates == 1``; otherwise a list of
        ``num_dates`` strings (an empty list for ``num_dates == 0``).

    Raises:
        ValueError: If ``end_year`` is earlier than ``start_year`` (raised by
            ``random.randrange`` for the empty range).
    """
    start = dt.datetime(start_year, 1, 1)
    end = dt.datetime(end_year, 12, 31)
    # +1 because randrange excludes its upper bound; without it the final day
    # (Dec 31 of end_year) could never be produced.
    span_days = (end - start).days + 1

    def _one_date():
        offset = random.randrange(span_days)
        return (start + dt.timedelta(days=offset)).strftime("%Y-%m-%d")

    if num_dates == 1:
        return _one_date()
    return [_one_date() for _ in range(num_dates)]

def get_random_string(length, chars=string.ascii_uppercase + string.digits):
    """Build a random string of exactly ``length`` symbols, each picked from ``chars``.

    A non-positive ``length`` yields the empty string.
    """
    picks = []
    for _ in range(length):
        picks.append(random.choice(chars))
    return ''.join(picks)

def generate_sku(prefix, length):
    """Return ``prefix`` padded with random characters up to ``length`` characters in total.

    The padding comes from ``get_random_string`` and is ``length - len(prefix)``
    characters long.
    """
    suffix_length = length - len(prefix)
    suffix = get_random_string(suffix_length)
    return f"{prefix}{suffix}"

def generate_skus(prefix, num_skus, length):
    """Generate a list of unique SKUs.

    Example: `generate_skus(prefix='L855', num_skus=10, length=9)`

    Args:
        prefix: Leading text shared by every SKU.
        num_skus: Number of distinct SKUs to return.
        length: Total SKU length, prefix included.

    Returns:
        A list of ``num_skus`` distinct SKU strings, in the order they were
        first generated (so a seeded RNG gives a reproducible list).

    Raises:
        ValueError: If ``num_skus`` exceeds the number of distinct SKUs that
            can exist for this ``prefix``/``length``; without this check the
            loop below would never terminate.
    """
    # generate_sku fills the remaining positions via get_random_string's
    # default alphabet (uppercase letters + digits); a non-positive remainder
    # leaves exactly one possible SKU (the bare prefix).
    alphabet_size = len(string.ascii_uppercase + string.digits)
    capacity = alphabet_size ** max(length - len(prefix), 0)
    if num_skus > capacity:
        raise ValueError(
            f"Cannot generate {num_skus} unique SKUs: only {capacity} exist "
            f"for prefix {prefix!r} and length {length}."
        )

    # A dict doubles as an insertion-ordered set: duplicates collapse, order is kept.
    seen = {}
    while len(seen) < num_skus:
        seen[generate_sku(prefix, length)] = None
    return list(seen)

def generate_customer_id(num=1, prefix='C', min=1000, max=9999, num_unique=None):
    """Generate random customer ids of the form ``<prefix><number>``.

    Args:
        num: How many ids to return.
        prefix: Text placed in front of the number.
        min: Smallest number (inclusive). Shadows the builtin inside this function.
        max: Largest number (inclusive). Shadows the builtin inside this function.
        num_unique: When given and smaller than ``num``, first draw a pool of
            ``num_unique`` ids and then sample ``num`` ids from that pool with
            replacement, so ids repeat. The pool itself is drawn independently,
            so it may contain duplicates. Ignored when ``num == 1``.

    Returns:
        A single string when ``num == 1``; otherwise a list of ``num`` strings.
    """
    def _one_id():
        return f"{prefix}{random.randint(min, max)}"

    if num == 1:
        return _one_id()
    if num_unique is None or num_unique >= num:
        return [_one_id() for _ in range(num)]
    # Build the pool as a list explicitly. Recursing with num=num_unique would
    # return a bare string when num_unique == 1, and random.choices would then
    # sample individual characters of that string.
    pool = [_one_id() for _ in range(num_unique)]
    return random.choices(pool, k=num)

In [45]:
# 3. Generate Unstructured Notes using Gemini

def generate_messy_notes(n=10, model="gemini-flash-lite-latest"):
    """Ask the Gemini model for ``n`` free-text note entries and split them apart.

    Uses the module-level ``client`` created in the setup cell.

    Args:
        n: Number of entries to request.
        model: Model name passed to ``client.models.generate_content``.

    Returns:
        A list of stripped, non-empty entries obtained by splitting the
        response on ``"|||"``. The model is not guaranteed to return exactly
        ``n`` entries; a warning is printed when the count differs. If the
        request or parsing fails, the error is printed and a list of ``n``
        copies of ``"Error generating note"`` is returned instead.
    """
    # Note the doubled backslash in "(\\n)": inside this non-raw f-string a
    # single "\n" would become a real line break and garble that instruction.
    prompt = f"""
    Generate {n} distinct entries for a 'Notes' field in a business dataset.
    Each entry acts as a raw log for a transaction.

    Desired Information fields in each note separate on new lines:
    1. Internal Account Representative Name
    2. General Notes (Free text, can have colons and newlines, but no pipes (|) since we will use that as an entry delimiter)

    CRITICAL Rules for formatting:
    - Each entry MUST be separated by a massive delimiter: "|||"
    - Inside an entry, separate the fields with newlines (\\n).
    - Each field must have the label, either "Rep:" or "Notes:", followed by the content.
    - Introduce VARIANCE and REAL-WORLD MESSINESS. Examples:
        - "Invoice #:" vs "Inv #:" vs "Inv#:" vs just the number.
        - Typos in names, sometimes middle names, some times initials, etc.
        - The 'General Notes' section might contain newlines itself. IF it does, ensure it is the LAST part of the note so it doesn't break parsing of the other fields too badly.
    - Do NOT number the entries (1., 2.). Just raw text separated by |||.

    Example Output Style (follow my lord of the rings and hobbit style of writing & naming, with a touch of medieval business jargon):
    ```
    Rep: Gandalf the Grey
    Notes: Some damage due to orc attack in transit.
    |||
    Rep: Frodo Baggins
    Notes: Call back to intro contact and follow up on invoice.
    |||
    Rep: The Balrog
    Notes: Checked inventory with Legolas on: 2023-11-30.
    Everything arrived undamaged.
    |||
    ...
    ```
    """

    try:
        response = client.models.generate_content(
            model=model,
            contents=prompt
        )
        # Split by the delimiter
        raw_text = response.text
        items = raw_text.split("|||")
        # Clean up whitespace and drop empty fragments (e.g. after a trailing delimiter)
        clean_items = [item.strip() for item in items if item.strip()]
        if len(clean_items) != n:
            # Surface over/under-delivery so the caller can decide what to do with it.
            print(f"Warning: requested {n} notes but parsed {len(clean_items)}.")
        return clean_items
    except Exception as e:
        print(f"Gemini Error: {e}")
        return ["Error generating note"] * n

In [48]:
# 4. Assemble the Dataset

# Target dataset size and how many notes are requested per API call.
NUM_ROWS = 320
BATCH_SIZE = 20 # Generate notes in batches to avoid token limits

# Keep batches already collected in this kernel session; otherwise start empty.
if "all_notes" not in globals():
    all_notes = []

In [50]:
# Request the notes batch by batch. A batch counts as done when `all_notes`
# already holds an entry at its position, so re-running this cell resumes after
# the last stored batch instead of querying the API again.
DO_IT_LIVE = True  # manual switch: set to False to walk the schedule without calling the API

for i in range(0, NUM_ROWS, BATCH_SIZE):
    batch_no = i // BATCH_SIZE
    if batch_no < len(all_notes):
        print(f"Batch {batch_no} already generated, skipping...")
        continue
    year = 2022 + batch_no // 5 # Increment year every 5 batches to add some temporal variety
    print(f"Requesting {year} batch {i} - {i + BATCH_SIZE} / {NUM_ROWS}")
    if DO_IT_LIVE:
        notes_batch = generate_messy_notes(BATCH_SIZE)
        # Each stored entry is (row offset, year, list of notes for that batch).
        all_notes.append((i, year, notes_batch))
        time.sleep(1) # Be nice to the API

if len(all_notes):
    # len(all_notes) counts batches, not individual notes.
    print(len(all_notes), "batches generated. First batch:")
    print(all_notes[0])

Requesting 2022 batch 0 - 20 / 320
Requesting 2022 batch 20 - 40 / 320
Requesting 2022 batch 40 - 60 / 320
Requesting 2022 batch 60 - 80 / 320
Requesting 2022 batch 80 - 100 / 320
Requesting 2023 batch 100 - 120 / 320
Requesting 2023 batch 120 - 140 / 320
Requesting 2023 batch 140 - 160 / 320
Requesting 2023 batch 160 - 180 / 320
Requesting 2023 batch 180 - 200 / 320
Requesting 2024 batch 200 - 220 / 320
Requesting 2024 batch 220 - 240 / 320
Requesting 2024 batch 240 - 260 / 320
Requesting 2024 batch 260 - 280 / 320
Requesting 2024 batch 280 - 300 / 320
Requesting 2025 batch 300 - 320 / 320
16 notes generated. Sample note:
(0, 2022, ['Rep: Elrond Half-elven\nNotes: Initial contact made regarding overdue tithe payment for the Shire holdings. Seems the hobbit delegation was slow to remit. Advised immediate transfer.', 'Rep: Aragorn Elessar\nNotes: Reviewed land deeds for Westmarch sector. Found discrepancy in boundary markers near the Fangorn outskirts. Requires physical inspection by Su

In [51]:
# Preview the first batch: join its notes with a "---" marker on a new line
# and show only the first 580 characters.
_first_batch_notes = all_notes[0][2]
_preview = '\n---'.join(_first_batch_notes)
print(_preview[:580])

Rep: Elrond Half-elven
Notes: Initial contact made regarding overdue tithe payment for the Shire holdings. Seems the hobbit delegation was slow to remit. Advised immediate transfer.
---Rep: Aragorn Elessar
Notes: Reviewed land deeds for Westmarch sector. Found discrepancy in boundary markers near the Fangorn outskirts. Requires physical inspection by Surveyor Samwise.
---Rep: Galadriel Light of Eärendil
Notes: Signed off on the high-value contract for Mithril shipment ETA Q1 next year. Terms are exceptionally favorable for Gondor. Note: Must confirm exchange rate stability 


In [106]:
# Manual switch: `not "overwrite"` is always False, so the cached JSON file is
# left untouched as written. Edit the condition to actually rewrite the cache.
if not "overwrite":
    with open("generated_fake_data.json", "w") as cache_file:
        cache_file.write(json.dumps(all_notes, indent=2))

In [108]:
# Fall back to the cached JSON only when this kernel has no `all_notes` yet.
if "all_notes" not in globals():
    with open("generated_fake_data.json") as f:
        all_notes = json.loads(f.read())

In [139]:
# One record per note: a random date inside the batch's year, plus the note
# text with every 201x/202x year-like number replaced by that year.
data = [
    {
        'date': generate_random_date(start_year=year, end_year=year),
        'notes_raw': re.sub(r'20[12]\d', str(year), note),
    }
    for _, year, notes in all_notes
    for note in notes
]

# Sort, number the rows as `id`, and convert the date strings to datetimes.
# `S` is not defined in this notebook; presumably the shared imports notebook
# provides it to turn 'date notes_raw' into a list of column names -- confirm.
df = (
    pd.DataFrame(data)
    .sort_values(S('date notes_raw'))
    .reset_index(drop=True)
    .reset_index(names='id')
    .assign(date=lambda frame: pd.to_datetime(frame.date))
)
df.head()

Unnamed: 0,id,date,notes_raw
0,0,2022-01-02,Rep: Smeagol G.\nNotes: Notes: He called three...
1,1,2022-01-08,Rep: Meriadoc Brandybuck\nNotes: Rep: Merry B....
2,2,2022-01-10,Rep: Elrond Half-elven\nNotes: Discussed long-...
3,3,2022-01-13,Rep: Gríma Wormtongue\nNotes: Attempted contac...
4,4,2022-01-21,Rep: Boromir of Gondor\nNotes: Customer reques...


In [140]:
# Every raw note must carry at least one of the expected labels before we split on them.
assert df.notes_raw.str.contains('(?:rep|notes):', case=False, regex=True).all(), "Some notes entries do not contain the expected 'Rep:' or 'Notes:' fields."
# Split on the labels (case-insensitive), drop blank pieces, and strip the kept
# ones: the pattern consumes whitespace *before* a label but not the space that
# follows its colon, so unstripped pieces would start with a stray space.
df['note_parts'] = df.notes_raw.str.split(re.compile(r'\s*(?:rep|notes):', re.I)).apply(lambda xs: [x.strip() for x in xs if x.strip()])

In [141]:
# The first split piece is the representative, the second the free-text notes;
# a missing piece is stored as None.
for _column, _position in (('rep', 0), ('general_notes', 1)):
    df[_column] = df.note_parts.apply(
        lambda pieces, k=_position: pieces[k] if k < len(pieces) else None
    )

In [142]:
# Add the structured columns. Target layout: id, date, customer ID, UPC, number, total, notes
# Fewer unique customers than rows (pool size is len(df) // 5) so that customer ids repeat.
df['customer_id'] = generate_customer_id(num=len(df), num_unique=len(df) // 5)

# Generate UPCs with 3 prefixes
# Each prefix maps to (min items, max items, base unit price); the first two
# values bound the random item count and the third scales the total below.
prefix = {'ARMS': (1, 20, 1000), 'VICT': (100, 2000, .5), 'TREAS': (1, 100, 500)}
# Assign every row one of the prefixes at random.
df['upcp'] = [random.choice(list(prefix.keys())) for _ in range(len(df))]
# Per prefix group, generate one SKU per row (unique within the group, 9 characters long).
for _p,_g in df.groupby('upcp'):
    df.loc[_g.index, 'upc'] = generate_skus(prefix=_p, num_skus=len(_g), length=9)

# Generate random numbers and totals
for _p,_g in df.groupby('upcp'):
    # Item count per row, drawn between the group's min and max (inclusive).
    df.loc[_g.index, 'number_items'] = [random.randint(*prefix[_p][:2]) for _ in range(len(_g))]
    # Total = items * base price * (1 +/- up to 20%).
    # NOTE(review): random.uniform is evaluated once per group, so every row of a
    # prefix shares the same price factor -- confirm that per-row variance was not intended.
    df.loc[_g.index, 'total'] = df.loc[_g.index, 'number_items'] * prefix[_p][2] * (1 + random.uniform(-0.2, 0.2))

In [None]:
# Recreate the messy `notes` text from its parts, each with a randomly chosen label spelling.
#    Line order: invoice, rep, follow-up date (optional), general notes
# Invoice line: "INV<year of the row's date>-<number between 1000 and 99999, zero-padded to 5 digits>".
inv_choices = ['Invoice #', 'Invoice#', 'Inv #', 'Inv#']
df['notes1'] = df.apply(lambda row: f"{random.choice(inv_choices)}: INV{row.date.year}-{random.randint(1000, 99999):05d}\n", axis=1)

# Representative line, reusing the name parsed into `rep`.
rep_choices = ['Rep', 'Representative', 'Account Rep', 'ARep']
df['notes2'] = df.apply(lambda row: f"{random.choice(rep_choices)}: {row.rep}\n", axis=1)

# Follow-up line: kept when randrange(10) < 6 (about 60% of rows), dated 1-90 days
# after the row's date; otherwise an empty string so the line is simply absent.
date_choices = ['Follow-up Date', 'Follow up Date', 'Follow-up On', 'Follow-up By', 'Follow-up']
df['notes3'] = df.apply(lambda row: f"{random.choice(date_choices)}: {row.date + pd.Timedelta(days=random.randint(1, 90)):%Y-%m-%d}\n" if random.randrange(10) < 6 else "", axis=1)

# General notes come last and have no trailing newline. 'Notes' is listed three
# times so the singular 'Note' label is picked about one time in four.
note_choices = ['Notes', 'Notes', 'Notes', 'Note']
df['notes4'] = df.apply(lambda row: f"{random.choice(note_choices)}: {row.general_notes}", axis=1)

# Concatenate the four pieces into the final free-text field.
df['notes'] = df['notes1'] + df['notes2'] + df['notes3'] + df['notes4']

In [165]:
# Export frame: keep only the published columns, then rewrite `id` and `date`.
dfout = df.loc[:, ['id', 'date', 'customer_id', 'upc', 'number_items', 'total', 'notes']].assign(
    # `id` restarts at 0 within each calendar year; computed while `date` is still a datetime.
    id=lambda frame: frame.groupby(frame.date.dt.year).cumcount(),
    # Afterwards reduce the timestamps to plain date objects.
    date=lambda frame: frame.date.dt.date,
)

In [167]:
# Show the first five rows of the export frame.
dfout.head(n=5)

Unnamed: 0,id,date,customer_id,upc,number_items,total,notes
0,0,2022-01-02,C7249,TREASGMK0,13,6578.0,Invoice #: INV2022-12248\nRep: Smeagol G.\nFo...
1,1,2022-01-08,C2947,VICTHRONU,1066,533.036,Invoice #: INV2022-35079\nARep: Meriadoc Bran...
2,2,2022-01-10,C2913,TREASIVCL,100,50600.0,Invoice #: INV2022-35726\nRep: Elrond Half-el...
3,3,2022-01-13,C9242,ARMSL9YQ8,18,19613.0,Invoice #: INV2022-97055\nARep: Gríma Wormton...
4,4,2022-01-21,C5121,ARMSBKMDG,5,5448.0,Inv#: INV2022-02471\nRep: Boromir of Gondor\n...


In [None]:
# 5. Save to Excel
# Write the export frame to the workbook, leaving out the DataFrame index.
dfout.to_excel(excel_writer="lab7_data_OG.xlsx", index=False)

In [171]:
import pandas as pd
import re
# Load the lab workbook for the reference solution.
# NOTE(review): this reads "lab7_data.xlsx", while the export cell writes
# "lab7_data_OG.xlsx" -- presumably a separately prepared copy; confirm.
dfsolution = pd.read_excel(io="lab7_data.xlsx")
dfsolution.head(n=5)

Unnamed: 0,id,date,customer_id,upc,number_items,total,notes
0,0,2022-01-02,C7249,TREASGMK0,13,6578.0,Invoice #: INV2022-12248\nRep: Smeagol G.\nFo...
1,1,2022-01-08,C2947,VICTHRONU,1066,533.036,Invoice #: INV2022-35079\nARep: Meriadoc Bran...
2,2,2022-01-10,C2913,TREASIVCL,100,50600.0,Invoice #: INV2022-35726\nRep: Elrond Half-el...
3,3,2022-01-13,C9242,ARMSL9YQ8,18,19613.0,Invoice #: INV2022-97055\nARep: Gríma Wormton...
4,4,2022-01-21,C5121,ARMSBKMDG,5,5448.0,Inv#: INV2022-02471\nRep: Boromir of Gondor\n...


In [176]:

# Reference solution: pull each labelled field out of the free-text `notes` column.
# All patterns are case-insensitive and capture the value after "<label>:".

# invoice:  r"(?:Inv\w*\s*#?):\s*([^\n]+)"
dfsolution['Invoice_Number'] = dfsolution['notes'].str.extract(r"(?:Inv\w*\s*#?):\s*([^\n]+)", flags=re.I)
# contact:  r"(?:A[count ]*)?Rep(?:[^:]*):\s*([^\n]+)"
dfsolution['Contact_Name'] = dfsolution['notes'].str.extract(r"(?:A[count ]*)?Rep(?:[^:]*):\s*([^\n]+)", flags=re.I)
# followUp:  r"^[ \t]*Follow[^:\n]*:\s*([^\n]+)"  with re.M
# Anchored to the start of a line and confined to that line, so the word
# "follow" inside the free-text notes followed by some later colon is not
# mistaken for the follow-up label.
dfsolution['Follow_Up_Date'] = dfsolution['notes'].str.extract(r"^[ \t]*Follow[^:\n]*:\s*([^\n]+)", flags=re.I | re.M)
# notes:  r"Notes?:\s*(.+)"  with re.S
# re.S lets "." cross newlines; without it multi-line general notes would be
# cut off at the end of their first line.
dfsolution['Clean_Notes'] = dfsolution['notes'].str.extract(r"Notes?:\s*(.+)", flags=re.I | re.S)

# Non-null count per column: shows how many rows each pattern matched.
display(dfsolution.count().to_frame().T)
dfsolution.head()

Unnamed: 0,id,date,customer_id,upc,number_items,total,notes,Invoice_Number,Contact_Name,Follow_Up_Date,Clean_Notes
0,321,321,321,321,321,321,321,321,321,186,321


Unnamed: 0,id,date,customer_id,upc,number_items,total,notes,Invoice_Number,Contact_Name,Follow_Up_Date,Clean_Notes
0,0,2022-01-02,C7249,TREASGMK0,13,6578.0,Invoice #: INV2022-12248\nRep: Smeagol G.\nFo...,INV2022-12248,Smeagol G.,2022-03-11,He called three times about the price of fish ...
1,1,2022-01-08,C2947,VICTHRONU,1066,533.036,Invoice #: INV2022-35079\nARep: Meriadoc Bran...,INV2022-35079,Meriadoc Brandybuck,2022-02-04,Merry B. Initial contact concerning supply cha...
2,2,2022-01-10,C2913,TREASIVCL,100,50600.0,Invoice #: INV2022-35726\nRep: Elrond Half-el...,INV2022-35726,Elrond Half-elven,2022-04-01,Discussed long-term alliance structure. Key de...
3,3,2022-01-13,C9242,ARMSL9YQ8,18,19613.0,Invoice #: INV2022-97055\nARep: Gríma Wormton...,INV2022-97055,Gríma Wormtongue,2022-02-22,Attempted contact regarding the maintenance co...
4,4,2022-01-21,C5121,ARMSBKMDG,5,5448.0,Inv#: INV2022-02471\nRep: Boromir of Gondor\n...,INV2022-02471,Boromir of Gondor,2022-02-24,Customer requested a rush delivery of 100 bush...
