In [24]:
# Essential libraries
import pandas as pd
import datetime
import time
import uuid
import random
import numpy as np


In [2]:
# Load the stage-1 events (the raw, combined input data) into a DataFrame.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# usually means the CSV was saved together with its index. Consider
# pd.read_csv(ifile, index_col=0) if that column is not wanted downstream -- confirm.
ifile = "../data/events-combined-stage1.csv"
dfi = pd.read_csv(ifile)


In [27]:
# Sanity check: preview the first 10 rows of the loaded data
dfi[:10]

Unnamed: 0,id,productStyle,age,gender,region,node,stageNum,stageName,timestamp
0,1,style1,under-30,male,us,1,1,Order Received,11/1/17 17:27
1,2,style1,under-30,male,asia,2,1,Order Received,11/1/17 4:03
2,3,style1,under-30,male,asia,2,1,Order Received,11/1/17 15:20
3,4,style1,under-30,male,asia,2,1,Order Received,11/1/17 16:33
4,5,style1,under-30,male,europe,1,1,Order Received,11/1/17 15:40
5,6,style1,under-30,male,europe,2,1,Order Received,11/1/17 0:18
6,7,style1,under-30,female,us,1,1,Order Received,11/1/17 7:32
7,8,style1,under-30,female,us,1,1,Order Received,11/1/17 17:51
8,9,style1,under-30,female,us,1,1,Order Received,11/1/17 20:15
9,10,style1,under-30,female,us,2,1,Order Received,11/1/17 13:00


In [3]:
# Write the loaded data to a local stage-1 CSV file (without the index).
# The later cells read and rewrite this working copy.
dfi.to_csv("./data_stage1.csv", index=False)

In [4]:
# function to add a unique ID to the data
# This is like assigning an ID before the order goes into the manufacturing cycle
def add_id(data):
    """Give every row of a CSV file a fresh UUID4 ``id`` and rewrite the file.

    Args:
        data: Path of the CSV file. The file is read, modified and then
            overwritten in place (without the DataFrame index).

    Returns:
        The same path that was passed in.

    An existing ``id`` column is replaced and keeps its column position;
    otherwise ``id`` is appended as a new column. The IDs are random, so
    every call assigns different values.
    """
    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    # Build the whole column in one assignment instead of iterating with
    # iterrows() and writing one cell at a time through .loc.
    df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]
    df.to_csv(data, index=False)
    print('Output datafile: ', data)
    return data

In [12]:
# function to add a type field to data
# Here we can do some magic like indicate the result of manufacturing
def add_type(data, type):
    """Add a ``type`` column with a randomly chosen label for every row.

    Args:
        data: Path of the CSV file. The file is read, modified and then
            overwritten in place (without the DataFrame index).
        type: Non-empty sequence of labels to pick from. The parameter name
            shadows the built-in ``type`` inside this function; it is kept
            so that existing keyword callers continue to work.

    Returns:
        The same path that was passed in.

    Labels are drawn with ``random.choice`` once per row, in row order, so
    seeding the ``random`` module beforehand makes the result reproducible.
    """
    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    # One column assignment replaces the per-row iterrows()/.loc writes and
    # the throw-away "NA" placeholder column.
    df["type"] = [random.choice(type) for _ in range(len(df))]
    df.to_csv(data, index=False)
    print('Output datafile: ', data)
    return data

In [9]:
# Change the date format to ISO

def date_to_iso(data, in_format="%m/%d/%y %H:%M"):
    """Rewrite the ``timestamp`` column of a CSV file in ISO 8601 format.

    Args:
        data: Path of the CSV file. The file is read, converted and then
            overwritten in place (without the DataFrame index).
        in_format: ``strptime`` format of the incoming timestamps. Defaults
            to the ``month/day/2-digit-year hour:minute`` layout of the raw
            data.

    Returns:
        The same path that was passed in.

    Raises:
        ValueError: If a timestamp matches neither ``in_format`` nor the ISO
            layout ``%Y-%m-%dT%H:%M:%S``.

    Values that are already in the ISO layout are left unchanged, so
    re-running the cell on a converted file is safe instead of failing.
    """
    iso_format = "%Y-%m-%dT%H:%M:%S"

    def _is_iso(value):
        # True when the value already has the target layout.
        try:
            datetime.datetime.strptime(value, iso_format)
        except ValueError:
            return False
        return True

    def _to_iso(value):
        # Convert one raw timestamp; pass already-converted values through.
        try:
            return datetime.datetime.strptime(value, in_format).isoformat()
        except ValueError:
            if _is_iso(value):
                return value
            raise

    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    out_filename_string = data
    # Convert the column in one pass instead of iterrows() + per-cell .loc.
    df["timestamp"] = [_to_iso(ts) for ts in df["timestamp"]]
    # Do not write the index or it'll keep adding one to each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string


In [10]:
# Convert the timestamps in the stage-1 file to ISO format (the file is rewritten in place)
date_to_iso("./data_stage1.csv")

Starting with the datafile:  ./data_stage1.csv
Output datafile:  ./data_stage1.csv


'./data_stage1.csv'

In [13]:
# Add a randomly chosen type label to every row of the stage-1 file.
# The list is named `order_types` rather than `type` so the built-in type()
# is not shadowed for the rest of the kernel session.
order_types = ["atype", "btype", "ctype", "dtype", "etype", "ftype"]
add_type("./data_stage1.csv", order_types)

Starting with the datafile:  ./data_stage1.csv
Output datafile:  ./data_stage1.csv


'./data_stage1.csv'

In [14]:
# Assign a unique ID to every row of the stage-1 file (the file is rewritten in place)
add_id("./data_stage1.csv")

Starting with the datafile:  ./data_stage1.csv
Output datafile:  ./data_stage1.csv


'./data_stage1.csv'

In [15]:
# Take data through the manufacturing stage
# Each unique ID event would be transformed to indicate the result of manufacturing

def process(data, stage = 2, stageName='Manufacturing', process_time=3600):
    """Move every event in a CSV file to the next stage and write a new file.

    Each row gets ``stageNum`` and ``stageName`` set to the given stage, and
    its ``timestamp`` is advanced by a fixed processing time.

    Args:
        data: Path of the input CSV file. Its ``timestamp`` column must be in
            the ``%Y-%m-%dT%H:%M:%S`` layout.
        stage: Stage number written to ``stageNum``; also used to build the
            output file name.
        stageName: Label written to ``stageName``.
        process_time: Seconds added to every timestamp.

    Returns:
        Path of the file that was written, ``./data_stage<stage>.csv``
        (relative to the current working directory).

    Raises:
        ValueError: If a timestamp does not match the expected layout.
    """
    iso_format = "%Y-%m-%dT%H:%M:%S"
    print('Starting with the datafile: ', data)
    print('Processing stage: ', stage)
    # Read the file
    df = pd.read_csv(data)
    # Name for the output file
    out_filename_string = "./data_stage" + str(stage) + ".csv"
    # The offset is the same for every row, so compute it once.
    elapsed = datetime.timedelta(seconds = process_time)
    # Whole-column assignments replace the iterrows() loop with per-cell .loc
    # writes; they also avoid writing a scalar of a different dtype into an
    # existing column one cell at a time.
    df["stageNum"] = stage
    df["stageName"] = stageName
    df["timestamp"] = [
        (datetime.datetime.strptime(ts, iso_format) + elapsed).isoformat()
        for ts in df["timestamp"]
    ]
    # Do not write the index or it'll keep adding one to each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string


In [34]:
# Take data through the manufacturing stage
# Each unique ID event would be transformed to indicate the result of manufacturing
# Also we add a new timestamp corresponding to the latest stage

def process2(data, stage = 2, stageName='Manufacturing', process_time=1, scale=3):
    """Move every event to the next stage, recording a new per-stage timestamp.

    Each row gets ``stageNum`` and ``stageName`` set to the given stage. The
    original ``timestamp`` column is kept, and a new ``timestamp<stage>``
    column holds ``timestamp`` plus a random number of whole hours.

    Args:
        data: Path of the input CSV file. Its ``timestamp`` column must be in
            the ``%Y-%m-%dT%H:%M:%S`` layout.
        stage: Stage number written to ``stageNum``; also used to build the
            output file name and the name of the new timestamp column.
        stageName: Label written to ``stageName``.
        process_time: Mean of the processing time, in hours.
        scale: Standard deviation of the processing time, in hours
            (previously hard-coded to 3).

    Returns:
        Path of the file that was written, ``./data_stage<stage>.csv``
        (relative to the current working directory).

    Raises:
        ValueError: If a timestamp does not match the expected layout.

    The hours are drawn from NumPy's global random state, so call
    ``np.random.seed(...)`` first for reproducible output. Each draw is
    truncated toward zero with ``int()``. Draws can be negative, in which
    case the stage timestamp precedes the original timestamp.
    """
    iso_format = "%Y-%m-%dT%H:%M:%S"
    print('Starting with the datafile: ', data)
    print('Processing stage: ', stage)
    # Read the file
    df = pd.read_csv(data)
    # Name for the output file
    out_filename_string = "./data_stage" + str(stage) + ".csv"
    t_name = "timestamp" + str(stage)
    # Whole-column assignments replace the iterrows() loop with per-cell .loc writes.
    df["stageNum"] = stage
    df["stageName"] = stageName
    # Draw the processing time for every row in a single call.
    hours = np.random.normal(loc=process_time, scale=scale, size=len(df))
    df[t_name] = [
        (
            datetime.datetime.strptime(ts, iso_format)
            + datetime.timedelta(hours = int(drawn))
        ).isoformat()
        for ts, drawn in zip(df["timestamp"], hours)
    ]
    # Do not write the index or it'll keep adding one to each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string


In [35]:
# Run the manufacturing stage on the stage-1 file; the result is written to ./data_stage2.csv
process2("./data_stage1.csv")


Starting with the datafile:  ./data_stage1.csv
Processing stage:  2
Output datafile:  ./data_stage2.csv


'./data_stage2.csv'

In [36]:
# Add a delay for certain types
def add_delay(data, delay=5, stage=2, region="europe", productStyle="style1"):
    """Push back the stage timestamp for one region / product-style subset.

    Rows whose ``region`` and ``productStyle`` both match get ``delay`` hours
    added to their ``timestamp<stage>`` value; all other rows are untouched.

    Args:
        data: Path of the CSV file. The file is read, modified and then
            overwritten in place (without the DataFrame index).
        delay: Hours to add to the matching rows.
        stage: Stage number selecting the ``timestamp<stage>`` column.
        region: Value of ``region`` that receives the delay (previously
            hard-coded to ``"europe"``).
        productStyle: Value of ``productStyle`` that receives the delay
            (previously hard-coded to ``"style1"``).

    Returns:
        The same path that was passed in.

    Raises:
        ValueError: If a matching row's timestamp is not in the
            ``%Y-%m-%dT%H:%M:%S`` layout.

    The delay is applied to whatever the file currently holds, so calling
    this twice on the same file delays the subset twice.
    """
    iso_format = "%Y-%m-%dT%H:%M:%S"
    print('Starting with the datafile: ', data)
    # Read the file
    df = pd.read_csv(data)
    # Name for the output file. We'll just rewrite the file
    out_filename_string = data
    t_name = "timestamp" + str(stage)
    # Select the subset with a boolean mask instead of testing row by row.
    affected = (df["region"] == region) & (df["productStyle"] == productStyle)
    if affected.any():
        shift = datetime.timedelta(hours = delay)
        df.loc[affected, t_name] = [
            (datetime.datetime.strptime(ts, iso_format) + shift).isoformat()
            for ts in df.loc[affected, t_name]
        ]
    # Do not write the index or it'll keep adding one to each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string

In [37]:
# Add the delay to the stage-2 file (rewritten in place).
# Note: re-running this cell delays the same rows again, because the delay is
# applied to whatever timestamps the file currently holds.
add_delay("./data_stage2.csv")


Starting with the datafile:  ./data_stage2.csv
Output datafile:  ./data_stage2.csv


'./data_stage2.csv'

In [25]:
# Scratch check: draw one sample from a normal distribution with standard deviation 3
# (unseeded, so the value changes on every run).
np.random.normal(loc=0.0, scale=3, size=1)

array([1.44884882])

In [26]:
# Scratch check repeated: another unseeded draw to get a feel for the spread.
np.random.normal(loc=0.0, scale=3, size=1)

array([7.32780301])