# Event Generator

### This notebook generates an event stream

In [1]:
import os
import json
from random import randint, random
import math
import v3io.dataplane
from faker import Faker
import uuid
from datetime import datetime, timedelta


# Access key and user name come from the environment; os.getenv returns None
# when a variable is not set.
V3IO_ACCESS_KEY = os.getenv('V3IO_ACCESS_KEY')
V3IO_USERNAME = os.getenv('V3IO_USERNAME')
# Data container that holds the stream.
CONTAINER = 'users'
# Stream location inside the container, prefixed with the user name.
# NOTE(review): os.path.join raises TypeError when V3IO_USERNAME is unset —
# confirm the variable is always provided in the target environment.
STREAM_PATH = os.path.join(V3IO_USERNAME, 'examples/rapid-prototype/generated-stream')
# Shard count passed to create_stream when the stream is created.
SHARDS_COUNT = 8

def gen_postcode(is_churn):
    """Return a random integer postcode that encodes the churn label.

    The postcode is a random multiple of 3 (between 10002 and 99999) plus a
    small offset, so that ``postcode % 3`` carries a signal an ML model can
    learn:

    * churned users     -> remainder 0 or 1
    * non-churned users -> remainder 0 or 2

    Args:
        is_churn: truthy for a churned user, falsy otherwise.

    Returns:
        int: the generated postcode.
    """
    multiple_of_three = randint(3334, 33333) * 3
    coin = randint(0, 1)
    # Churned users add 0/1, the others add 0/2.
    offset = coin if is_churn else coin * 2
    return multiple_of_three + offset

# event functions
def new_registration(fake, id, event_time, is_churn):
    """Build a 'registration' event with a fake user profile.

    Args:
        fake: object providing the Faker-style methods used below
            (name, date, street_address, city, country, image_url, ean8).
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.
        is_churn: forwarded to gen_postcode so the postcode encodes the label.

    Returns:
        dict: the registration event.
    """
    event = {'user_id': id,
             'event_type': 'registration',
             'event_time': event_time}
    # The provider calls stay in this exact order so that the sequence of
    # random draws (and the key order of the event) is unchanged.
    event['name'] = fake.name()
    event['date_of_birth'] = fake.date()
    event['street_address'] = fake.street_address()
    event['city'] = fake.city()
    event['country'] = fake.country()
    event['postcode'] = gen_postcode(is_churn)
    event['affiliate_url'] = fake.image_url()
    event['campaign'] = fake.ean8()
    return event

def new_purchase(fake, id, event_time):
    """Build a 'purchase' event for user ``id`` at ``event_time``.

    The amount is whatever ``fake.randomize_nb_elements(number=50)`` returns.

    Returns:
        dict with keys 'user_id', 'event_type', 'event_time', 'amount'.
    """
    amount = fake.randomize_nb_elements(number=50)
    return dict(user_id=id,
                event_type='purchase',
                event_time=event_time,
                amount=amount)

def new_bet(fake, id, event_time):
    """Build a 'bet' event for user ``id`` at ``event_time``.

    The bet amount is whatever ``fake.randomize_nb_elements(number=10)`` returns.

    Returns:
        dict with keys 'user_id', 'event_type', 'event_time', 'bet_amount'.
    """
    bet_amount = fake.randomize_nb_elements(number=10)
    return dict(user_id=id,
                event_type='bet',
                event_time=event_time,
                bet_amount=bet_amount)
    
def new_win(fake, id, event_time):
    """Build a 'win' event for user ``id`` at ``event_time``.

    The win amount is whatever ``fake.randomize_nb_elements(number=200)`` returns.

    Returns:
        dict with keys 'user_id', 'event_type', 'event_time', 'win_amount'.
    """
    win_amount = fake.randomize_nb_elements(number=200)
    return dict(user_id=id,
                event_type='win',
                event_time=event_time,
                win_amount=win_amount)

def gen_event_date(is_churn, prev_event_date=None):
    """Generate the timestamp of a user's next event.

    Args:
        is_churn: truthy for a churned user. Churned users never get the
            "comes back the following day" jump.
        prev_event_date: timestamp string of the user's previous event
            ('YYYY-MM-DD HH:MM:SS' with an optional '.ffffff' fraction), or
            None to generate the user's first event.

    Returns:
        str: timestamp formatted as 'YYYY-MM-DD HH:MM:SS.ffffff'. The
        fractional part is always present, so the result can be fed back in
        as ``prev_event_date``.

    Raises:
        ValueError: if ``prev_event_date`` matches neither accepted layout.
    """
    # Format explicitly instead of using str(datetime): str() omits the
    # fraction when microsecond == 0, which the '%f' parse below would reject
    # on the following call.
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    now = datetime.now()

    if prev_event_date is None:
        # First event: somewhere between 48 and 96 hours ago.
        first_dt = now - timedelta(hours=randint(48, 96))
        return first_dt.strftime(time_format)

    try:
        prev_dt = datetime.strptime(prev_event_date, time_format)
    except ValueError:
        # Tolerate timestamps written without a fractional part.
        prev_dt = datetime.strptime(prev_event_date, '%Y-%m-%d %H:%M:%S')

    if prev_dt + timedelta(hours=30) < now and not is_churn and randint(1, 1000) <= 5:
        # If the user is not churned and there is room before "now", jump to
        # the following day with probability 0.005.
        next_dt = prev_dt + timedelta(hours=randint(15, 24))
    else:
        next_dt = prev_dt + timedelta(seconds=randint(5, 100))
    return next_dt.strftime(time_format)
        
def generate_events(fake, user_ids, events_dist, num_events, is_churn):
    """Generate a registration event plus follow-up events for each user.

    Args:
        fake: Faker-style object forwarded to the event generator functions.
        user_ids: iterable of user identifiers; it is iterated once.
        events_dist: sequence of dicts with a 'probability' (float) and a
            'generator' (callable taking ``(fake, user_id, event_time)``).
        num_events: number of random draws per user after registration.
        is_churn: whether the users are churned; stored as the integer
            'label' on the registration event and forwarded to the date and
            registration helpers.

    Returns:
        list of event dicts, grouped by user in chronological order.

    Note:
        Each draw yields at most one event. A draw that falls above the sum
        of all probabilities produces no event.
    """
    # Cumulative upper bound of each event type. The distribution does not
    # change between draws, so compute the bounds once up front.
    thresholds = []
    acc_prob = 0
    for event_dist in events_dist:
        acc_prob += event_dist['probability']
        thresholds.append((acc_prob, event_dist['generator']))

    events = []
    for user_id in user_ids:
        # register
        event_time = gen_event_date(is_churn)
        reg_event = new_registration(fake, user_id, event_time, is_churn)
        reg_event['label'] = int(is_churn)
        events.append(reg_event)
        for _ in range(num_events):
            # Pick the first event type whose cumulative bound covers the draw.
            rand = random()
            for upper_bound, generator in thresholds:
                if rand <= upper_bound:
                    event_time = gen_event_date(is_churn, event_time)
                    events.append(generator(fake, user_id, event_time))
                    break
    return events


# 70% churn users: group 1 (1400 users) is generated with is_churn=True,
# group 2 (600 users) with is_churn=False
NUM_USERS_GROUP1 = 1400
NUM_USERS_GROUP2 = 600 
NUM_USERS = NUM_USERS_GROUP1+NUM_USERS_GROUP2

# Number of random event draws per user after the registration event
EVENTS_PER_USER = 1000

# Event mix for group 1 (churned users): 10% purchases, 89% bets, 1% wins
GROUP1_EVENTS_DIST = [{'probability': 0.1, 'generator': new_purchase}, 
                      {'probability': 0.89, 'generator': new_bet}, 
                      {'probability': 0.01, 'generator': new_win}]

# Event mix for group 2 (non-churned users): 10% purchases, 85% bets, 5% wins
GROUP2_EVENTS_DIST = [{'probability': 0.1, 'generator': new_purchase}, 
                      {'probability': 0.85, 'generator': new_bet},
                      {'probability': 0.05, 'generator': new_win}]


## Create V3IO Client

In [2]:
# Data-plane client pointed at the v3io web API endpoint; the access key read
# from the environment is passed explicitly.
v3io_client = v3io.dataplane.Client(endpoint='http://v3io-webapi:8081', access_key=V3IO_ACCESS_KEY)

## Create V3IO Stream

In [3]:
# Create the target stream at STREAM_PATH with SHARDS_COUNT shards.
resp = v3io_client.create_stream(container=CONTAINER,
                           path=STREAM_PATH,
                           shard_count=SHARDS_COUNT)
# Show the status code of the request (204 in the recorded run).
resp.status_code

204

## Generate Events

In [4]:
fake = Faker()

# Lazy generators of random UUID strings; each one is consumed exactly once by
# the generate_events call below.
group1_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP1))
group2_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP2))

# Group 1 is generated as churned users (is_churn=True), group 2 as non-churned.
group1_events = generate_events(fake, group1_user_ids, GROUP1_EVENTS_DIST, EVENTS_PER_USER, True)
group2_events = generate_events(fake, group2_user_ids, GROUP2_EVENTS_DIST, EVENTS_PER_USER, False)


# Index 0 is the first user's registration event, so the preview shows the
# four events that follow it.
print(f'Events generated: {len(group1_events)+len(group2_events)}')
print(f'Events preview: {group1_events[1:5]}')

Events generated: 2002000
Events preview: [{'user_id': 'cd5f38bc-b7d4-48ee-a578-a78bb0cdc902', 'event_type': 'purchase', 'event_time': '2020-07-26 21:29:48.697929', 'amount': 52}, {'user_id': 'cd5f38bc-b7d4-48ee-a578-a78bb0cdc902', 'event_type': 'bet', 'event_time': '2020-07-26 21:31:24.697929', 'bet_amount': 12}, {'user_id': 'cd5f38bc-b7d4-48ee-a578-a78bb0cdc902', 'event_type': 'bet', 'event_time': '2020-07-26 21:32:05.697929', 'bet_amount': 11}, {'user_id': 'cd5f38bc-b7d4-48ee-a578-a78bb0cdc902', 'event_type': 'bet', 'event_time': '2020-07-26 21:33:25.697929', 'bet_amount': 8}]


## Write generated events to V3IO Stream

#### Transform the events into stream records

In [5]:
# Wrap every event as a stream record whose 'data' field is the JSON-encoded
# event. Group 1 events come first, then group 2. Iterating the two lists in
# turn avoids building a temporary concatenated copy of all events.
records = [{'data': json.dumps(event)}
           for event_group in (group1_events, group2_events)
           for event in event_group]


#### Ingest in small batches to V3IO Stream

In [9]:
# Send the records to the stream in slices of batch_size per put_records call.
# NOTE(review): resp is overwritten on every iteration and never inspected —
# confirm whether failed put_records calls need explicit handling here.
batch_size = 1000
for i in range(0, len(records), batch_size):
    resp = v3io_client.put_records(container=CONTAINER, path=STREAM_PATH, records=records[i:i+batch_size])


2020-07-29 14:58:55,941 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 7}
2020-07-29 14:58:55,962 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 0}
2020-07-29 14:58:55,977 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 1}
2020-07-29 14:58:55,992 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 2}
2020-07-29 14:58:56,006 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 3}
2020-07-29 14:58:56,020 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-07-29 14:58:56,034 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 5}
2020-07-29 14:58:56,048 [info] Remote disconnected while waiting for response: {'retries_left': 1, 'connection_idx': 6}


## Delete the stream

In [5]:
# Clean-up: delete the stream at STREAM_PATH.
resp = v3io_client.delete_stream(container=CONTAINER, path=STREAM_PATH)
# Show the status code of the request (204 in the recorded run).
resp.status_code

204