# Event Generator

### This notebook generates an event stream

In [11]:
import os
import json
from random import randint, random
import math
import v3io.dataplane
from faker import Faker
import uuid
from datetime import datetime, timedelta

# V3IO Settings
# Credentials are read from the environment. Fail fast with an explicit message:
# with an unset variable the assignments to os.environ and the os.path.join call
# below would otherwise raise an opaque TypeError on None.
V3IO_ACCESS_KEY = os.getenv('V3IO_ACCESS_KEY')
V3IO_USERNAME = os.getenv('V3IO_USERNAME')
if not V3IO_ACCESS_KEY or not V3IO_USERNAME:
    raise EnvironmentError(
        'V3IO_ACCESS_KEY and V3IO_USERNAME environment variables must be set')
# Write the values back to the process environment (kept from the original cell).
os.environ['V3IO_ACCESS_KEY'] = V3IO_ACCESS_KEY
os.environ['V3IO_USERNAME'] = V3IO_USERNAME
# Container and per-user path of the stream this notebook creates and fills.
CONTAINER = 'users'
STREAM_PATH = os.path.join(V3IO_USERNAME, 'examples/rapid-churn/generated-stream')
# Shard count passed to create_stream.
SHARDS_COUNT = 8

# Generator settings
# Bounds consumed by the event generator functions defined in this cell.
ACTIVITIES = 6  # activity ids are drawn with randint(0, ACTIVITIES), i.e. both ends included
MAX_PLAY_DURATION = 10  # activity duration is drawn with randint(0, MAX_PLAY_DURATION)
MAX_SCORE = 100  # activity score is drawn with randint(0, MAX_SCORE)
MAX_PURCHASE_AMOUNT = 1000  # passed as `number` to Faker's randomize_nb_elements for purchase amounts

def gen_postcode(is_churn):
    """Return a random postcode whose remainder modulo 3 encodes the churn label.

    The postcode is a random multiple of 3 plus a small offset:
      * churn users (is_churn truthy): offset 0 or 1 -> postcode % 3 is 0 or 1
      * other users: offset 0 or 2 -> postcode % 3 is 0 or 2
    This hides label information in the postcode for the ML model to learn.
    """
    multiple_of_three = 3 * randint(3334, 33333)
    coin = randint(0, 1)
    offset = coin if is_churn else coin * 2
    return multiple_of_three + offset

'''
User information:
    Base (created from new user event):
    User ID
    Full name
    Birthdate 
    City (time-zone)
    Registration date
    Affiliate / Banner details
    Campaign ID
    Affiliate ID / Source
    Creative ID
'''
def new_registration(fake, id, event_time, is_churn):
    """Build the registration event for a new user.

    Args:
        fake: Faker instance used to generate the profile fields.
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.
        is_churn: forwarded to gen_postcode so the postcode encodes the label.

    Returns:
        dict describing the registration event.
    """
    event = {'user_id': id,
             'event_type': 'registration',
             'event_time': event_time}

    # Profile fields, drawn from Faker in a fixed order.
    event['name'] = fake.name()
    event['date_of_birth'] = fake.date()
    event['street_address'] = fake.street_address()
    event['city'] = fake.city()
    event['country'] = fake.country()

    # Affiliate / campaign details.
    event['affiliate_url'] = fake.image_url()
    event['campaign'] = fake.ean8()
    event['creative'] = fake.ean13()

    # Postcode carries the churn signal.
    event['postcode'] = gen_postcode(is_churn)
    return event

'''
In-App purchases log:
    User ID
    Session ID
    Date
    Amount 
    Product/service (category)
'''
def new_purchase(fake, id, event_time):
    """Build a purchase event for the given user.

    Args:
        fake: Faker instance supplying the random values.
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.

    Returns:
        dict with a random 'product_id' in 0..5 and an 'amount' produced by
        Faker's randomize_nb_elements(number=MAX_PURCHASE_AMOUNT).
    """
    product_id = fake.random.randint(0, 5)
    amount = fake.randomize_nb_elements(number=MAX_PURCHASE_AMOUNT)
    return {
        'user_id': id,
        'event_type': 'purchase',
        'event_time': event_time,
        'product_id': product_id,
        'amount': amount,
    }

'''
User activity log:
    User ID
    Session ID
    Activity (category)
    Start Date
    Duration (e.g. game duration) 
    # Percent Completed (e.g. left after 25% or completed 100%)
    Score
    Is success (e.g. won the game)
    # Game Level after activity (game level was updated)
'''
def new_activity(fake, id, event_time):
    """Build an activity event with uniformly random attributes.

    Args:
        fake: Faker instance whose `random` generator supplies the values.
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.

    Returns:
        dict with 'activity' in 0..ACTIVITIES, 'duration' in
        0..MAX_PLAY_DURATION, 'score' in 0..MAX_SCORE and 'is_win' in {0, 1}
        (all bounds inclusive).
    """
    draw = fake.random.randint
    event = {'user_id': id, 'event_type': 'activity', 'event_time': event_time}
    event['activity'] = draw(0, ACTIVITIES)
    event['duration'] = draw(0, MAX_PLAY_DURATION)
    event['score'] = draw(0, MAX_SCORE)
    event['is_win'] = draw(0, 1)
    return event

def new_win(fake, id, event_time):
    """Build an activity event that represents a win.

    Starts from a random activity event, then forces the score into the top
    half of the range and sets 'is_win' to 1.

    Args:
        fake: Faker instance supplying the random values.
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.

    Returns:
        dict describing the winning activity.
    """
    activity = new_activity(fake, id, event_time)

    # Keep score in the top half and win=1.
    # Integer division: randint() needs integer bounds (float bounds are
    # deprecated since Python 3.10 and raise TypeError from Python 3.12).
    half_max_score = MAX_SCORE // 2
    if activity['score'] <= half_max_score:
        activity['score'] = fake.random.randint(half_max_score, MAX_SCORE)
    activity['is_win'] = 1
    return activity

def new_lose(fake, id, event_time):
    """Build an activity event that represents a loss.

    Starts from a random activity event, then forces the score into the bottom
    half of the range and sets 'is_win' to 0.

    Args:
        fake: Faker instance supplying the random values.
        id: user identifier stored under 'user_id'.
        event_time: timestamp stored under 'event_time'.

    Returns:
        dict describing the losing activity.
    """
    activity = new_activity(fake, id, event_time)

    # Keep score in the bottom half and win=0.
    # Integer division: randint() needs integer bounds (float bounds are
    # deprecated since Python 3.10 and raise TypeError from Python 3.12).
    half_max_score = MAX_SCORE // 2
    if activity['score'] >= half_max_score:
        activity['score'] = fake.random.randint(0, half_max_score)
    activity['is_win'] = 0
    return activity

'''
Membership Info (from update membership event):
    Start date 
    Expiration date 
    Payment method 
    Membership level / price
    Cancelation date
    Auto renew 
'''

def gen_event_date(is_churn, prev_event_date=None):
    """Return the timestamp string of a user's next event.

    Args:
        is_churn: when truthy the user never skips to the following day.
        prev_event_date: timestamp string of the previous event, or None to
            generate the user's first event.

    Returns:
        Timestamp formatted as 'YYYY-MM-DD HH:MM:SS.ffffff'.
        * First event: 48-96 hours before now.
        * Otherwise: 5-100 seconds after the previous event, except that a
          non-churn user whose previous event is more than 30 hours old jumps
          15-24 hours ahead with probability 0.005.
    """
    # Format explicitly instead of using str(datetime): str() omits the
    # fractional part when microsecond == 0, which the '%f' parse below
    # would then reject with ValueError on the next call.
    time_format = '%Y-%m-%d %H:%M:%S.%f'

    if prev_event_date is None:
        # generate first event date
        first_dt = datetime.now() - timedelta(hours=randint(48, 96))
        return first_dt.strftime(time_format)

    try:
        prev_dt = datetime.strptime(prev_event_date, time_format)
    except ValueError:
        # Also accept timestamps written without a fractional part.
        prev_dt = datetime.strptime(prev_event_date, '%Y-%m-%d %H:%M:%S')

    # The 30 hour margin keeps a jump of at most 24 hours in the past.
    if prev_dt + timedelta(hours=30) < datetime.now() and not is_churn and randint(1, 1000) <= 5:
        next_dt = prev_dt + timedelta(hours=randint(15, 24))
    else:
        next_dt = prev_dt + timedelta(seconds=randint(5, 100))
    return next_dt.strftime(time_format)
        
def generate_events(fake, user_ids, events_dist, num_events, is_churn):
    """Generate the full event list for a group of users.

    For every user a registration event is emitted first, followed by up to
    `num_events` events whose type is sampled from `events_dist`.

    Args:
        fake: Faker instance handed to the event generators.
        user_ids: iterable of user identifiers.
        events_dist: list of dicts with a 'probability' and a 'generator'
            callable taking (fake, user_id, event_time).
        num_events: number of sampling rounds per user.
        is_churn: churn flag forwarded to the date and registration generators.

    Returns:
        list of event dicts, grouped by user in generation order.
    """
    events = []
    for user_id in user_ids:
        # register
        event_time = gen_event_date(is_churn)
        events.append(new_registration(fake, user_id, event_time, is_churn))
        for _ in range(num_events):
            # Pick the first entry whose cumulative probability covers the
            # draw. If the probabilities sum to less than the draw, no event
            # is emitted for this round.
            acc_prob = 0
            rand = random()
            for event_dist in events_dist:
                if rand <= event_dist['probability'] + acc_prob:
                    event_time = gen_event_date(is_churn, event_time)
                    events.append(event_dist['generator'](fake, user_id, event_time))
                    break
                acc_prob += event_dist['probability']
    return events


# 70% churn users
# Group 1 (1400 of 2000 users) is generated with is_churn=True,
# group 2 with is_churn=False.
NUM_USERS_GROUP1 = 1400
NUM_USERS_GROUP2 = 600 
NUM_USERS = NUM_USERS_GROUP1+NUM_USERS_GROUP2

# Number of event sampling rounds per user, on top of the registration event.
EVENTS_PER_USER = 1000

''' Set user groups to better model the behaviour and churn 
- Group 1: Losers
- Group 2: Winners
'''
# Each entry pairs an event generator with the probability of choosing it in
# one sampling round; the probabilities of each list add up to 1.
# Group 1 wins 1% of the time.
GROUP1_EVENTS_DIST = [{'probability': 0.1, 'generator': new_purchase}, 
                      {'probability': 0.89, 'generator': new_lose}, 
                      {'probability': 0.01, 'generator': new_win}]

# Group 2 wins 5% of the time.
GROUP2_EVENTS_DIST = [{'probability': 0.1, 'generator': new_purchase}, 
                      {'probability': 0.85, 'generator': new_lose},
                      {'probability': 0.05, 'generator': new_win}]

## Create V3IO Client

In [13]:
# Data-plane client pointed at the in-cluster web API endpoint; the access key
# read from the environment is passed as the credential.
v3io_client = v3io.dataplane.Client(endpoint='http://v3io-webapi:8081', access_key=V3IO_ACCESS_KEY)

## Create V3IO Stream

In [14]:
# Create the target stream at STREAM_PATH in CONTAINER with SHARDS_COUNT shards.
resp = v3io_client.create_stream(container=CONTAINER,
                           path=STREAM_PATH,
                           shard_count=SHARDS_COUNT)
# Show the HTTP status code returned by the create call.
resp.status_code

204

## Generate Events

In [12]:
fake = Faker()

# One fresh UUID string per user. Both iterators are lazy and are consumed
# exactly once by generate_events.
group1_user_ids = map(str, (uuid.uuid4() for _ in range(NUM_USERS_GROUP1)))
group2_user_ids = map(str, (uuid.uuid4() for _ in range(NUM_USERS_GROUP2)))

# Group 1 is generated as churn users (is_churn=True), group 2 as non-churn.
group1_events = generate_events(
    fake, group1_user_ids, GROUP1_EVENTS_DIST, EVENTS_PER_USER, True)
group2_events = generate_events(
    fake, group2_user_ids, GROUP2_EVENTS_DIST, EVENTS_PER_USER, False)


print(f'Events generated: {len(group1_events)+len(group2_events)}')
# The slice starts at 1, so the registration event at index 0 is not shown.
print(f'Events preview: {group1_events[1:5]}')

Events generated: 2002000
Events preview: [{'user_id': '67229d4a-f6a4-43cb-973a-5a5a510013fc', 'event_type': 'activity', 'event_time': '2020-07-26 12:16:02.291167', 'activity': 0, 'duration': 10, 'score': 47, 'is_win': 0}, {'user_id': '67229d4a-f6a4-43cb-973a-5a5a510013fc', 'event_type': 'activity', 'event_time': '2020-07-26 12:17:29.291167', 'activity': 1, 'duration': 7, 'score': 29, 'is_win': 0}, {'user_id': '67229d4a-f6a4-43cb-973a-5a5a510013fc', 'event_type': 'activity', 'event_time': '2020-07-26 12:18:09.291167', 'activity': 0, 'duration': 5, 'score': 3, 'is_win': 0}, {'user_id': '67229d4a-f6a4-43cb-973a-5a5a510013fc', 'event_type': 'activity', 'event_time': '2020-07-26 12:19:06.291167', 'activity': 5, 'duration': 10, 'score': 41, 'is_win': 0}]


## Write generated events to V3IO Stream

#### Transform the event to stream records

In [15]:
# Wrap every event as a stream record: the JSON-encoded event goes under 'data'.
# Group 1 events come first, followed by group 2 events.
records = [{'data': json.dumps(event)} for event in group1_events + group2_events]


#### Ingest in small batches to V3IO Stream

In [16]:
# Number of records sent per put_records call.
batch_size = 1000
# STREAM_PATH from the settings cell is reused here instead of being rebuilt,
# so the ingest target cannot drift from the path the stream was created at.
# NOTE(review): the response of each put_records call is not inspected, so
# rejected records would go unnoticed - confirm whether that is acceptable.
for i in range(0, len(records), batch_size):
    resp = v3io_client.put_records(container=CONTAINER, path=STREAM_PATH, records=records[i:i+batch_size])


## Delete the stream

In [None]:
# Clean-up: delete the generated stream at STREAM_PATH in CONTAINER.
resp = v3io_client.delete_stream(container=CONTAINER, path=STREAM_PATH)
# Show the body of the delete response.
resp.body

In [2]:
%env V3IO_ACCESS_KEY

'<redacted>'