# Event Generator

### This notebook generates an event stream

In [1]:
import sys
!{sys.executable} -m pip install faker



In [2]:
# Execute config.py inside this notebook's namespace.
# NOTE(review): STREAM_CONFIGS, CONTAINER and v3io_client are used by later cells but are
# never defined in this notebook -- presumably they are created by config.py; confirm.
%run config.py

In [3]:
# Key used to look up the target stream's settings in STREAM_CONFIGS.
OUTPUT_STREAM = 'generated-stream'

In [4]:
# Pull the output stream's 'path' and 'shard_count' settings (in that order)
# out of its STREAM_CONFIGS entry.
STREAM_PATH, SHARDS_COUNT = (
    STREAM_CONFIGS.get(OUTPUT_STREAM).get(setting)
    for setting in ('path', 'shard_count')
)

In [5]:
import json
from random import randint, random
import math
from faker import Faker
import uuid
from datetime import datetime, timedelta

def gen_postcode(is_churn):
    """Return a random integer postcode whose remainder modulo 3 encodes the churn label.

    The postcode is a random multiple of 3 plus a small offset:
      * is_churn truthy  -> offset is 0 or 1, so ``postcode % 3`` is 0 or 1
      * is_churn falsy   -> offset is 0 or 2, so ``postcode % 3`` is 0 or 2

    This deliberately leaks label information into the postcode so that an ML
    model trained on the generated data has a signal to learn.
    """
    multiple_of_three = 3 * randint(3334, 33333)
    bucket = randint(0, 1)
    offset = bucket if is_churn else 2 * bucket
    return multiple_of_three + offset

# event functions
def new_registration(fake, id, event_time, is_churn):
    """Build a 'registration' event dict for one user.

    Args:
        fake: object supplying name(), date(), street_address(), city(),
            country(), image_url() and ean8() (a Faker instance in this notebook).
        id: user identifier stored under 'user_id'.
        event_time: timestamp value stored under 'event_time'.
        is_churn: churn flag forwarded to gen_postcode to pick the postcode.

    Returns:
        dict with the user id, event type/time and the generated profile fields.
    """
    event = dict(user_id=id, event_type='registration', event_time=event_time)
    # profile fields are filled in the same order the fake provider is consulted
    event['name'] = fake.name()
    event['date_of_birth'] = fake.date()
    event['street_address'] = fake.street_address()
    event['city'] = fake.city()
    event['country'] = fake.country()
    event['postcode'] = gen_postcode(is_churn)
    event['affiliate_url'] = fake.image_url()
    event['campaign'] = fake.ean8()
    return event

def new_purchase(fake, id, event_time):
    """Build a 'purchase' event dict.

    'amount' is whatever ``fake.randomize_nb_elements(number=50)`` returns.
    """
    amount = fake.randomize_nb_elements(number=50)
    return dict(user_id=id, event_type='purchase', event_time=event_time, amount=amount)

def new_bet(fake, id, event_time):
    """Build a 'bet' event dict.

    'bet_amount' is whatever ``fake.randomize_nb_elements(number=10)`` returns.
    """
    bet_amount = fake.randomize_nb_elements(number=10)
    return dict(user_id=id, event_type='bet', event_time=event_time, bet_amount=bet_amount)
    
def new_win(fake, id, event_time):
    """Build a 'win' event dict.

    'win_amount' is whatever ``fake.randomize_nb_elements(number=200)`` returns.
    """
    win_amount = fake.randomize_nb_elements(number=200)
    return dict(user_id=id, event_type='win', event_time=event_time, win_amount=win_amount)

def gen_event_date(is_churn, prev_event_date=None):
    """Return the timestamp string for a user's next event.

    Args:
        is_churn: when truthy the user never "comes back the next day"; every
            follow-up event lands 5-100 seconds after the previous one.
        prev_event_date: timestamp string of the user's previous event, or
            None to generate the user's first event.

    Returns:
        A string formatted as '%Y-%m-%d %H:%M:%S.%f'. The fractional part is
        always present, so the result can always be fed back in as
        ``prev_event_date`` (``str(datetime)`` drops it when microseconds are 0,
        which made the next parse fail).

    Raises:
        ValueError: if ``prev_event_date`` matches neither the full format nor
            the same format without fractional seconds.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'

    if prev_event_date is None:
        # first event: somewhere between 48 and 96 hours ago
        first_dt = datetime.now() - timedelta(hours=randint(48, 96))
        return first_dt.strftime(time_format)

    try:
        prev_dt = datetime.strptime(prev_event_date, time_format)
    except ValueError:
        # tolerate timestamps produced by str(datetime) with zero microseconds
        prev_dt = datetime.strptime(prev_event_date, '%Y-%m-%d %H:%M:%S')

    # A retained user may return the following day with probability 0.005,
    # but only while the previous event is more than 30 hours in the past.
    if prev_dt + timedelta(hours=30) < datetime.now() and not is_churn and randint(1, 1000) <= 5:
        next_dt = prev_dt + timedelta(hours=randint(15, 24))
    else:
        next_dt = prev_dt + timedelta(seconds=randint(5, 100))
    return next_dt.strftime(time_format)
        
def _select_generator(events_dist, draw):
    """Return the generator whose cumulative-probability bucket contains ``draw``, else None."""
    cumulative = 0
    for entry in events_dist:
        cumulative += entry['probability']
        if draw <= cumulative:
            return entry['generator']
    return None


def generate_events(fake, user_ids, events_dist, num_events, is_churn):
    """Generate a registration event plus follow-up events for every user.

    Args:
        fake: provider object handed to the event generator functions.
        user_ids: iterable of user identifiers (a one-shot generator is fine).
        events_dist: list of dicts with keys 'probability' and 'generator';
            each round one generator is picked by comparing a uniform draw
            against the running sum of the probabilities.
        num_events: number of rounds per user after registration.
        is_churn: churn flag; stored as the integer 'label' on the registration
            event and forwarded to the timestamp/postcode generation.

    Returns:
        Flat list of event dicts: for each user the registration event followed
        by that user's generated events in chronological order.
    """
    events = []
    for user_id in user_ids:
        # register
        event_time = gen_event_date(is_churn)
        reg_event = new_registration(fake, user_id, event_time, is_churn)
        reg_event['label'] = int(is_churn)
        events.append(reg_event)

        for _ in range(num_events):
            generator = _select_generator(events_dist, random())
            if generator is None:
                # the draw fell beyond the summed probabilities: no event this round
                continue
            event_time = gen_event_date(is_churn, event_time)
            events.append(generator(fake, user_id, event_time))
    return events


# Cohort sizes: group 1 makes up 70% of all users (1400 of 2000)
NUM_USERS_GROUP1 = 1_400
NUM_USERS_GROUP2 = 600
NUM_USERS = NUM_USERS_GROUP1 + NUM_USERS_GROUP2

# number of event-generation rounds per user after registration
EVENTS_PER_USER = 1_000

# Per-round event mix for each cohort; the two differ only in the bet/win split
GROUP1_EVENTS_DIST = [
    dict(probability=0.1, generator=new_purchase),
    dict(probability=0.89, generator=new_bet),
    dict(probability=0.01, generator=new_win),
]

GROUP2_EVENTS_DIST = [
    dict(probability=0.1, generator=new_purchase),
    dict(probability=0.85, generator=new_bet),
    dict(probability=0.05, generator=new_win),
]


## Generate Events

In [6]:
# NOTE(review): no random seed is set here and event timestamps are derived from
# datetime.now(), so every run of this cell produces different data.
fake = Faker()

# Generator expressions: ids are created lazily while generate_events iterates over
# them, and each generator can be consumed only once.
group1_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP1))
group2_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP2))

# Group 1 is generated with is_churn=True, group 2 with is_churn=False.
group1_events = generate_events(fake, group1_user_ids, GROUP1_EVENTS_DIST, EVENTS_PER_USER, True)
group2_events = generate_events(fake, group2_user_ids, GROUP2_EVENTS_DIST, EVENTS_PER_USER, False)


print(f'Events generated: {len(group1_events)+len(group2_events)}')
# The slice starts at index 1, so the preview skips the first user's registration event.
print(f'Events preview: {group1_events[1:5]}')

Events generated: 2002000
Events preview: [{'user_id': 'ce89f471-a2ef-436c-be7b-532cff0bc9f1', 'event_type': 'bet', 'event_time': '2020-07-30 16:44:26.331422', 'bet_amount': 10}, {'user_id': 'ce89f471-a2ef-436c-be7b-532cff0bc9f1', 'event_type': 'bet', 'event_time': '2020-07-30 16:44:57.331422', 'bet_amount': 8}, {'user_id': 'ce89f471-a2ef-436c-be7b-532cff0bc9f1', 'event_type': 'bet', 'event_time': '2020-07-30 16:46:27.331422', 'bet_amount': 7}, {'user_id': 'ce89f471-a2ef-436c-be7b-532cff0bc9f1', 'event_type': 'bet', 'event_time': '2020-07-30 16:46:43.331422', 'bet_amount': 7}]


## Write generated events to V3IO Stream

#### Transform the event to stream records

In [7]:
# Wrap each event (group 1 first, then group 2) in a record whose 'data' field
# holds the JSON-encoded event.
records = [{'data': json.dumps(event)} for event in group1_events + group2_events]

#### Ingest in small batches to V3IO Stream

In [8]:
# Send the records in consecutive slices of at most batch_size records, one put_records call per slice.
batch_size = 1000
for i in range(0, len(records), batch_size):
    # NOTE(review): resp is overwritten on every iteration and never inspected --
    # confirm whether put_records raises on failure or whether the response must be checked.
    resp = v3io_client.put_records(container=CONTAINER, path=STREAM_PATH, records=records[i:i+batch_size])