# Fake WhatsApp data generator
This notebook can be used to generated fake WhatsApp data that can be used to interact with What's Viz without having to use own data. Except for the number of particpants in a group that follow a pareto distrubtion and the word distribution that follows english vocab distribution, all other data is randomly generated.

Structure of the data:

````
{
    "messages":{
        <message_id>: {
            "chat": <chat_id>,
            "sent-by": <contact-id>,
            "message": <message-content>,
            "id": <message_id>,
            "timestamp": "yyyy-dd-mm hh:mm:ss +time zone",
        },
        ...
    },
    "contacts":{
        <contact_id>:{
            "status": <contact status>,
            "registered": <true: if user in registred in your contacts, ow. false>,
            "name": <>,
            "avatar": <contact picture link>,
        },
        ...
    },
    "groups": {
        "name": <group name>,
        "topic": <group description>,
        "participants": [<contact_id>, <contact_id>, ...],
        "owner_id": <contact_id>,
        "avatar": <group picture link>,
    }
}
````


In [27]:
# load csv file with two column name, occurences

import pandas as pd
import numpy as np

df = pd.read_csv('data_for_dummy_generator/unigram_freq.csv')
df.head()

total_word_occurences = df['count'].sum()

def sample_word():
    word = np.random.choice(df['word'], p=df['count']/total_word_occurences)
    return str(word)

def sample_sentence(max_length=20):
    sentence = ""
    random = np.random.randint(3, max_length)
    for i in range(random):
        sentence += sample_word() + " "
    return sentence


In [28]:
sample_sentence()

'vacation online that cancels act '

In [29]:
from tqdm import tqdm

# Create json with fake data with abobve structure
import random
import json

def generate_contacts(data, total, num_registred):
    print("Start generating contacts...")

    # load json with actors data
    with open('data_for_dummy_generator/actor_nodes.json') as f:
        actors = json.load(f)

    # load json with actors data
    with open('data_for_dummy_generator/actors_stats.json') as f:
        actors_stats = json.load(f)

    random.shuffle(actors)

    for i in range(total):
        actor = actors[i % len(actors)] 
        actor_stats = actors_stats[actor["id"]]
        data["contacts"][f"0{i}"] = {
            "status": sample_sentence(10),
            "registered": num_registred > i,
            "name": actor["id"],
            "avatar": f'https://image.tmdb.org/t/p/w200/{actor_stats["profile_path"]}' if "profile_path" in actor_stats else '',
        }
        
    print(f"Done generating {total} contacts\n")

def generate_groups(data, total, own_id):
    print("Start creating groups...")

    from scipy.stats import pareto

    distribution = pareto(1.16, loc=5, scale=1)
    num_participants = [int(i) for i in distribution.rvs(total)]

    contact_ids = list(data["contacts"].keys())

    proportion_of_people_in_group = random.random() * 0.20 + 0.75 # Between 0.75 and 0.95
    print(f"Proportion of people in group: {proportion_of_people_in_group}")
    num_people_in_group = int(len(contact_ids) * proportion_of_people_in_group)
    people_in_group = list(np.random.choice(contact_ids, size=num_people_in_group, replace=False))

    # Split people in groups according to num_participants
    sum_participants = 0
    particpants_ids_per_group = []
    for i, num in enumerate(num_participants):
        if sum_participants + num < num_people_in_group:
            particpants_ids_per_group.append(people_in_group[sum_participants:sum_participants + num])
            sum_participants += num
        else:
            break
            
    print(f"Propotion group filled without overlap: {len(particpants_ids_per_group) / total}")
    # Fill remaining groups with people assigned to other groups
    while len(particpants_ids_per_group) < total:
        num = num_participants[len(particpants_ids_per_group)]
        current_group = []
        while len(current_group) < num:
            group_id = np.random.randint(0, len(particpants_ids_per_group))
            group_participants = particpants_ids_per_group[group_id]
            filtered_participants = [contact_id for contact_id in group_participants if contact_id not in current_group]
            selected_contact_ids = np.random.choice(filtered_participants, size=min(min(len(group_participants)//2, len(filtered_participants)), num-len(current_group)), replace=False)
            current_group+=list(selected_contact_ids)
        particpants_ids_per_group.append(current_group)

    # Create groups
    for i, participants in enumerate(particpants_ids_per_group):
        participants.append(own_id)

        data["groups"][f"1{i}"] = {
            "name": sample_word(),
            "topic": sample_sentence(10),
            "participants": participants,
            "owner_id": participants[0],
            "avatar": ''
        }
    print(f"Done generating {total} groups\n")

def generate_messages(data, total, own_id):
    print("Start generating messages...")
    count_message = 0
    def send_message(chat_id, sender, id):
        data["messages"][f"2{id}"] = {
            "chat": chat_id,
            "sent-by": sender,
            "message": sample_sentence(20),
            "id": f"2{i}",
            "timestamp": f"202{np.random.randint(0,3)}-{np.random.randint(1,29)}-{np.random.randint(1,13)} {np.random.randint(0,23)}:{np.random.randint(0,59)}:{np.random.randint(0,59)} +0200 UTC+2",
        }

    group_ids = list(data["groups"].keys())
    contact_ids = list(data["contacts"].keys())

    proportion_own_messages = random.random() * 0.25 + 0.25 # Between 0.25 and 0.5
    num_own_messages = int(total * proportion_own_messages)

    people_in_group = set()
    for group_id in group_ids:
        people_in_group = people_in_group.union(data["groups"][group_id]["participants"])
    people_in_group = list(people_in_group)

    proportion_people_with_PM = random.random() * 0.25 + 0.15 # Between 0.15 and 0.4
    target_num_people_with_PM = int(len(contact_ids) * proportion_people_with_PM)
    
    people_not_in_group = [contact_id for contact_id in contact_ids if contact_id not in people_in_group]
    people_with_PM = people_not_in_group
    diff = target_num_people_with_PM - len(people_with_PM)
    
    print("People not in a group", len(people_not_in_group)/len(contact_ids))
    print(f"Proportion of people with PM: {proportion_people_with_PM}")
    
    if diff > 0:
        people_in_group_with_PM = list(np.random.choice(people_in_group, size=diff, replace=False))
        people_with_PM += people_in_group_with_PM

    print(f"Proportion of people with PM that are in a group: {0 if diff <= 0 else len(people_in_group_with_PM) / target_num_people_with_PM}")

    # Generate own messages
    print(f"Generate own message: {num_own_messages}")
    for i in tqdm(range(num_own_messages)):
        if random.random() < 0.5:
            chat_id = random.choice(group_ids)
        else:
            chat_id = random.choice(people_with_PM)
        send_message(chat_id, own_id, count_message)
        count_message += 1

    # Generate messages from other people
    print(f"Generate message from other: {total - num_own_messages}")
    for i in tqdm(range(total - num_own_messages)):
        # If group, pick a contact at random
        if random.random() < 0.85:
            chat_id = random.choice(group_ids)
            sender = random.choice(data["groups"][chat_id]["participants"])
        # If contact, sender is either the contact or the user
        else:
            chat_id = random.choice(people_with_PM)
            sender = chat_id 
        
        send_message(chat_id, sender, count_message)
        count_message += 1

    #for i in range(total):
        # # If group, pick a contact at random
        # if random.random() < 0.5:
        #     chat_id = random.choice(list(data["groups"].keys()))
        #     sender = random.choice(data["groups"][chat_id]["participants"])
        # # If contact, sender is either the contact or the user
        # else:
        #     chat_id = random.choice(list(data["contacts"].keys()))
        #     if random.random() < 0.5:
        #         sender = chat_id 
        #     else:
        #         sender = own_id

   

In [30]:
data = {
    "messages": {},
    "contacts": {},
    "groups": {}
}

generate_contacts(data, 200, 200)
own_id = random.choice(list(data["contacts"].keys()))
generate_groups(data, 40, own_id)
# This might take a few minutes depending on the number of messages (~5 minutes for 4000 messages on my machine)
generate_messages(data, 4000, own_id)

dummy_data_path = "../wa-visualization/public/dummy-data.json"

# save json
with open(dummy_data_path, 'w') as outfile:
    json.dump(data, outfile)

print(f"Saved file: {dummy_data_path}")


Start generating contacts...
Done generating 200 contacts

Start creating groups...
Proportion of people in group: 0.7510832926046972
Propotion group filled without overlap: 0.525
Done generating 40 groups

Start generating messages...
People not in a group 0.255
Proportion of people with PM: 0.17251631863234548
Proportion of people with PM that are in a group: 0
Generate own message: 1890


100%|██████████| 1890/1890 [02:08<00:00, 14.73it/s]


Generate message from other: 2110


100%|██████████| 2110/2110 [01:56<00:00, 18.17it/s]


Saved file: wa-visualization/public/dummy-data.json
