# Synthetic Data Generation


In [1]:
import json
import sys
import csv
sys.path.append('..')


import tinytroupe
from tinytroupe.agent import TinyPerson
from tinytroupe.environment import TinyWorld, TinySocialNetwork
from tinytroupe.factory import TinyPersonFactory
from tinytroupe.extraction import default_extractor as extractor
from tinytroupe.extraction import ResultsReducer
import tinytroupe.control as control


!!!!
DISCLAIMER: TinyTroupe relies on Artificial Intelligence (AI) models to generate content. 
The AI models are not perfect and may produce inappropriate or inacurate results. 
For any serious or consequential use, please review the generated content before using it.
!!!!

Looking for default config on: d:\windsurf_ai\TinyTroupe\examples\..\tinytroupe\config.ini
Found custom config on: d:\windsurf_ai\TinyTroupe\examples\config.ini

Current TinyTroupe configuration 
[OpenAI]
api_type = openai
azure_api_version = 2023-05-15
model = gpt-4o-mini
max_tokens = 4000
temperature = 1.5
freq_penalty = 0.3
presence_penalty = 0.0
timeout = 60
max_attempts = 5
waiting_time = 2
exponential_backoff_factor = 5
embedding_model = text-embedding-3-small
cache_api_calls = False
cache_file_name = openai_api_cache.pickle
max_content_display_length = 1024

[Simulation]
rai_harmful_content_prevention = True
rai_copyright_infringement_prevention = True

[Logging]
loglevel = ERROR



Let's create the specific types of agents we need to collect data.

In [2]:
# Persona factory configured with a free-text description of the kind of agent to generate;
# it is used below via factory.generate_person(...).
factory = TinyPersonFactory("A random knowledge worker in a company providing marketing services.")

In [3]:
# Generate two agents from the factory, printing each one's mini-biography as it is created.
# NOTE(review): temperature=1.6 is presumably a sampling temperature meant to diversify
# the generated personas — confirm against the TinyPersonFactory documentation.
people = []
for i in range(2):
    person = factory.generate_person(temperature=1.6)
    print(person.minibio())
    people.append(person)

# Trailing expression: the notebook displays how many agents were collected.
len(people)

Elena Martinez is a 29 year old Digital Marketing Specialist, Spanish, currently living in Spain. Elena is a creative individual who thrives on collaboration, often finding inspiration in brainstorming sessions with her team. While she enjoys the social aspects of her job, she also cherishes her alone time, using it to recharge and reflect. Outside of work, she has a passion for photography, capturing moments during her travels, and enjoys experimenting with diverse recipes in the kitchen. On weekends, she likes to hike, which helps her disconnect from the fast-paced demands of her profession and manage her anxiety about deadlines.
Lucas Thompson is a 34 year old Marketing Analyst, American, currently living in United States. Lucas Thompson is not only dedicated to his role as a Marketing Analyst but also possesses a keen analytical mind that thrives on solving complex problems. His introverted nature leads him to prefer intimate gatherings, where he can engage in meaningful conversati

2

In [4]:
# Environment named "Some Corp Inc." that holds the generated agents.
company = TinyWorld("Some Corp Inc.", people)

In [5]:
# NOTE(review): presumably lets every agent in the environment reach every other agent,
# so they can talk to each other — confirm against the TinyWorld documentation.
company.make_everyone_accessible()

In [6]:
# Send the same instruction to the agents in the environment; it shows up later in the
# extracted data as the first entry, authored by "USER".
company.broadcast("Get some work done together, help each other.")

In [7]:
# Run the simulation. NOTE(review): the argument is presumably the number of steps — confirm.
company.run(5)

We can now extract the conversations, which form the synthetic corpus we wanted.

In [8]:
# Inspect the first agent's interactions. NOTE(review): "pp" presumably stands for
# pretty-print — confirm against the TinyPerson documentation.
people[0].pp_current_interactions()

In [9]:
# Reducer that will hold the per-event extraction rules registered below.
reducer = ResultsReducer()

def aux_extract_content(focus_agent: TinyPerson, source_agent:TinyPerson, target_agent:TinyPerson, kind:str, event: str, content: str, timestamp:str):
    """
    Reduce a single TALK or CONVERSATION event to an ``(author, content)`` pair.

    For a TALK event the author is the focus agent. For a CONVERSATION event the
    author is the source agent, or the literal "USER" when no source agent is given.
    ``target_agent``, ``kind`` and ``timestamp`` are accepted but not used here.
    The resulting pair is printed and then returned.

    Raises:
        ValueError: if ``event`` is neither "TALK" nor "CONVERSATION".
    """
    # Reject unsupported events up front so the remaining logic only deals with the two known kinds.
    if event not in ("TALK", "CONVERSATION"):
        raise ValueError(f"Unknown event: {event}")

    if event == "TALK":
        speaker = focus_agent.name
    elif source_agent is None:
        # A CONVERSATION event without a source agent is attributed to the user.
        speaker = "USER"
    else:
        speaker = source_agent.name

    record = (speaker, content)
    print(record)
    return record
    


# Register the same extraction function for both event types it knows how to handle.
reducer.add_reduction_rule("TALK", aux_extract_content)
reducer.add_reduction_rule("CONVERSATION", aux_extract_content)

Finally, we obtain a dataframe with the extracted data and save it to a `.csv` file, for later use in other applications.

In [10]:
# Reduce the first agent to a dataframe using the registered rules; the two column names
# line up with the (author, content) pairs produced by aux_extract_content.
df = reducer.reduce_agent_to_dataframe(people[0], column_names=["author", "content"])
# Trailing expression: display the dataframe in the notebook.
df

('USER', 'Get some work done together, help each other.')
('Elena Martinez', "Let's get some work done together and help each other out! What do you think we should start with?")
('Lucas Thompson', 'I think we should start with analyzing the latest campaign performance data. It will give us a good foundation to work from. What do you think?')
('Elena Martinez', "I agree, starting with the campaign performance data is a great idea. Let's dive into it and see what insights we can gather!")
('Lucas Thompson', "Great! Let's start by looking at the key metrics from the last campaign. I have some data ready that we can analyze together.")
('Elena Martinez', "That sounds perfect! Let's take a look at the key metrics together. What do you have prepared?")
('Lucas Thompson', "I have prepared a summary of the campaign's performance, including key metrics like conversion rates and engagement levels. Let's go through them together!")
('Elena Martinez', "That sounds great! Let's go through the summ

Unnamed: 0,author,content
0,USER,"Get some work done together, help each other."
1,Elena Martinez,Let's get some work done together and help eac...
2,Lucas Thompson,I think we should start with analyzing the lat...
3,Elena Martinez,"I agree, starting with the campaign performanc..."
4,Lucas Thompson,Great! Let's start by looking at the key metri...
5,Elena Martinez,That sounds perfect! Let's take a look at the ...
6,Lucas Thompson,I have prepared a summary of the campaign's pe...
7,Elena Martinez,That sounds great! Let's go through the summar...
8,Lucas Thompson,Let's start by looking at the conversion rates...
9,Elena Martinez,Let's dive into the conversion rates! What tre...


In [11]:
# Persist the extracted corpus as CSV; index=False keeps the dataframe's row index out of the file.
df.to_csv("../data/extractions/synthetic_data_generation.out.csv", index=False)