# Fake WhatsApp data generator
This notebook can be used to generate fake WhatsApp data for interacting with What's Viz without having to use your own data. Except for the number of participants in a group, which follows a Pareto distribution, and the word distribution, which follows the English vocabulary distribution, all other data is randomly generated.

Structure of the data:

````
{
    "messages":{
        <message_id>: {
            "chat": <chat_id>,
            "sent-by": <contact-id>,
            "message": <message-content>,
            "id": <message_id>,
            "timestamp": "yyyy-dd-mm hh:mm:ss +time zone",
        },
        ...
    },
    "contacts":{
        <contact_id>:{
            "status": <contact status>,
            "registered": <true if the user is registered in your contacts, otherwise false>,
            "name": <contact name>,
            "avatar": <contact picture link>,
        },
        ...
    },
    "groups": {
        <group_id>: {
            "name": <group name>,
            "topic": <group description>,
            "participants": [<contact_id>, <contact_id>, ...],
            "owner_id": <contact_id>,
            "avatar": <group picture link>,
        },
        ...
    }
}
````


In [8]:
# Load the unigram frequency CSV file (two columns: word, count)

import pandas as pd
import numpy as np

# Unigram frequency table; the sampling helpers rely on its 'word' and
# 'count' columns.
# keep_default_na=False stops pandas from converting literal tokens such as
# "null", "nan" or "NA" into missing values, which would otherwise be sampled
# later as the string "nan".
# NOTE(review): assumes every 'count' cell is populated, so the column still
# parses as numeric - confirm against the CSV.
df = pd.read_csv(
    'data_for_dummy_generator/unigram_freq.csv',
    keep_default_na=False,
)

# Normalisation constant that turns raw counts into sampling probabilities.
total_word_occurences = df['count'].sum()

def sample_word():
    """Draw a single word at random, weighted by its corpus frequency.

    Reads the module-level ``df`` ('word' and 'count' columns) and
    ``total_word_occurences``; each word's probability is its count divided
    by that total.

    Returns:
        str: the sampled word, converted with ``str()``.
    """
    probabilities = df['count'] / total_word_occurences
    picked = np.random.choice(df['word'], p=probabilities)
    return str(picked)

def sample_sentence(max_length=20):
    """Build a random sentence out of frequency-weighted words.

    The number of words is drawn uniformly from 3 to ``max_length - 1``
    (``np.random.randint`` excludes its upper bound). Words are sampled from
    the module-level ``df`` with probability ``count / total_word_occurences``.

    Args:
        max_length: exclusive upper bound on the number of words; must be
            greater than 3.

    Returns:
        str: the sampled words joined by single spaces, with no trailing
        space.

    Raises:
        ValueError: if ``max_length`` is not greater than 3 (raised by
            ``np.random.randint``).
    """
    # Named word_count rather than `random` so the stdlib module of the same
    # name is never shadowed inside this function.
    word_count = np.random.randint(3, max_length)

    # Draw every word in one call: the probability vector over the whole
    # vocabulary is then built and validated once per sentence instead of
    # once per word.
    words = np.random.choice(
        df['word'],
        size=word_count,
        p=df['count'] / total_word_occurences,
    )
    return " ".join(str(word) for word in words)


In [9]:
# Sanity check: display one randomly generated sentence.
sample_sentence()

'women an megaworks '

In [10]:
# Create JSON with fake data following the structure described above
import random
import json

# Root container of the generated dataset: one independent, initially empty
# mapping per top-level section, filled in by the generate_* functions.
data = {section: {} for section in ("messages", "contacts", "groups")}

def generate_contacts(total, num_registred):
    """Populate ``data["contacts"]`` with ``total`` fake contacts.

    Contact ids are ``"0<index>"``. Names and avatars come from the actor
    JSON files; the shuffled actor list is reused cyclically when ``total``
    exceeds the number of actors.

    Args:
        total: number of contacts to create.
        num_registred: contacts whose index is below this value are flagged
            as registered; the remaining ones are not.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly instead of
    # relying on the platform default.
    with open('data_for_dummy_generator/actor_nodes.json', encoding='utf-8') as f:
        actors = json.load(f)

    # Per-actor details, looked up below by the actor's "id".
    with open('data_for_dummy_generator/actors_stats.json', encoding='utf-8') as f:
        actors_stats = json.load(f)

    random.shuffle(actors)

    for i in range(total):
        actor = actors[i % len(actors)]
        actor_stats = actors_stats[actor["id"]]

        # A missing, null or empty profile_path all mean "no picture"; a key
        # that is present but null must not end up as ".../w200/None".
        profile_path = actor_stats.get("profile_path")
        avatar = f'https://image.tmdb.org/t/p/w200/{profile_path}' if profile_path else ''

        data["contacts"][f"0{i}"] = {
            "status": sample_sentence(10),
            "registered": num_registred > i,
            "name": actor["id"],
            "avatar": avatar,
        }

def generate_groups(total, own_id):
    """Populate ``data["groups"]`` with ``total`` fake groups.

    Group ids are ``"1<index>"``. Group sizes are drawn from
    ``pareto(1.16, loc=5, scale=1)`` and truncated to integers; members are
    sampled without replacement from ``data["contacts"]``, and ``own_id`` is
    appended to every group exactly once.

    Args:
        total: number of groups to create.
        own_id: contact id of the notebook's "own" user.
    """
    from scipy.stats import pareto

    distribution = pareto(1.16, loc=5, scale=1)
    num_participants = distribution.rvs(total)

    # own_id is appended to every group below, so keep it out of the sampling
    # pool to avoid listing it twice.
    candidates = [contact_id for contact_id in data["contacts"] if contact_id != own_id]

    for i in range(total):
        # The Pareto draw is unbounded, so it can exceed the number of
        # available contacts; sampling without replacement would then raise.
        size = min(int(num_participants[i]), len(candidates))
        if size > 0:
            drawn = np.random.choice(candidates, size=size, replace=False)
            # Convert NumPy string scalars to plain str.
            participants = [str(contact_id) for contact_id in drawn]
        else:
            participants = []
        participants.append(own_id)

        data["groups"][f"1{i}"] = {
            "name": sample_sentence(5),
            "topic": sample_sentence(10),
            "participants": participants,
            "owner_id": participants[0],
            "avatar": ''
        }

def generate_messages(total, own_id):
    """Populate ``data["messages"]`` with ``total`` fake messages.

    Message ids are ``"2<index>"``. Each message goes, with equal
    probability, to a random group (sender: a random participant) or to a
    random contact chat (sender: that contact or ``own_id``, equally likely).
    When no groups exist, every message goes to a contact chat.

    Timestamps are zero-padded and keep the year-day-month field order the
    notebook documents ("yyyy-dd-mm hh:mm:ss"), with a fixed "+0200 UTC+2"
    suffix.

    Args:
        total: number of messages to create.
        own_id: contact id of the notebook's "own" user.
    """
    # The id lists do not change while messages are generated; build them once.
    group_ids = list(data["groups"].keys())
    contact_ids = list(data["contacts"].keys())

    for i in range(total):
        # If group, pick a participant at random
        if group_ids and random.random() < 0.5:
            chat_id = random.choice(group_ids)
            sender = random.choice(data["groups"][chat_id]["participants"])
        # If contact, sender is either the contact or the user
        else:
            chat_id = random.choice(contact_ids)
            sender = chat_id if random.random() < 0.5 else own_id

        # np.random.randint excludes its upper bound, so the bounds below are
        # one past the largest value wanted (hour 23, minute/second 59).
        year = 2020 + np.random.randint(0, 3)
        day = np.random.randint(1, 29)
        month = np.random.randint(1, 13)
        hour = np.random.randint(0, 24)
        minute = np.random.randint(0, 60)
        second = np.random.randint(0, 60)
        timestamp = (
            f"{year}-{day:02d}-{month:02d} "
            f"{hour:02d}:{minute:02d}:{second:02d} +0200 UTC+2"
        )

        data["messages"][f"2{i}"] = {
            "chat": chat_id,
            "sent-by": sender,
            "message": sample_sentence(20),
            "id": f"2{i}",
            "timestamp": timestamp,
        }

In [11]:
# Seed both random sources used by the generators (stdlib `random` and
# NumPy's global state) so that re-running this cell produces the same
# dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

generate_contacts(200, 200)
# The "own" user is one of the generated contacts, picked at random.
own_id = random.choice(list(data["contacts"].keys()))
print("Done contacts")
generate_groups(30, own_id)
print("Done groups")
# This might take a few minutes depending on the number of messages (~5 minutes for 4000 messages on my machine)
generate_messages(4000, own_id)
print("Done messages")

dummy_data_path = "wa-visualization/public/dummy-data.json"

# save json
with open(dummy_data_path, 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile)

print(f"Saved file: {dummy_data_path}")


Done contacts
Done groups
Done messages
Saved file: wa-visualization/public/dummy-data.json
