In [1]:
# Essential libraries
import pandas as pd
import datetime
import time
import uuid
import random


In [2]:
# Read the stage-1 input (raw event data) into a DataFrame
ifile = "../data/events-2017.11.csv"
dfi = pd.read_csv(filepath_or_buffer=ifile)


In [3]:
# Sanity check: display the first ten rows of the loaded frame
dfi.head(10)

Unnamed: 0,id,productStyle,age,gender,region,node,stageNum,stageName,timestamp
0,1,style1,under-30,male,asia,1,1,Order Received,11-01-2017 18:09:43
1,2,style1,under-30,male,asia,1,1,Order Received,11-01-2017 18:46:50
2,3,style1,under-30,male,europe,1,1,Order Received,11-01-2017 05:55:44
3,4,style1,under-30,male,europe,1,1,Order Received,11-01-2017 22:29:42
4,5,style1,under-30,male,europe,2,1,Order Received,11-01-2017 05:19:35
5,6,style1,under-30,female,us,1,1,Order Received,11-01-2017 00:58:56
6,7,style1,under-30,female,asia,1,1,Order Received,11-01-2017 00:28:13
7,8,style1,under-30,female,asia,1,1,Order Received,11-01-2017 06:11:01
8,9,style1,under-30,female,asia,2,1,Order Received,11-01-2017 10:27:13
9,10,style1,under-30,female,europe,1,1,Order Received,11-01-2017 00:28:00


In [4]:
# Save the loaded frame as the stage-1 CSV file (row index not written)
dfi.to_csv(path_or_buf="./data_stage1.csv", index=False)

In [5]:
# Function to add a unique ID to the data.
# This is like assigning an ID before the order goes into the manufacturing cycle.
def add_id(data):
    """Give every row of a CSV file its own freshly generated UUID.

    The file is read, the ``id`` column is replaced (or created if absent)
    with one ``uuid.uuid4()`` value per row, and the result is written back
    to the same path without the DataFrame index.

    Note that the file is rewritten in place: calling this again on the same
    file discards the previous IDs and generates new ones.

    Parameters
    ----------
    data : str
        Path of the CSV file to update in place.

    Returns
    -------
    str
        The path that was passed in.
    """
    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    # Build the whole column in one pass (one uuid4() call per row) instead
    # of assigning cell by cell through ``.loc`` inside an ``iterrows`` loop.
    df = df.assign(id=[uuid.uuid4() for _ in range(len(df))])
    df.to_csv(data, index=False)
    print('Output datafile: ', data)
    return data

In [6]:
# Function to add a type field to the data.
# Here we can do some magic like indicate the result of manufacturing.
def add_type(data, type):
    """Tag every row of a CSV file with a randomly chosen type label.

    The file is read, the ``type`` column is replaced (or created if absent)
    with one ``random.choice(type)`` draw per row, and the result is written
    back to the same path without the DataFrame index.

    The draws come from the module-level ``random`` generator, so the result
    is only reproducible if the caller seeds it first. The file is rewritten
    in place, so calling this again re-draws every label.

    Parameters
    ----------
    data : str
        Path of the CSV file to update in place.
    type : sequence
        Candidate labels to draw from. The parameter name shadows the
        built-in ``type`` inside this function; it is kept so existing
        keyword callers continue to work.

    Returns
    -------
    str
        The path that was passed in.
    """
    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    # One draw per row, in row order, built as a single column instead of a
    # placeholder column that is then overwritten cell by cell.
    df = df.assign(type=[random.choice(type) for _ in range(len(df))])
    df.to_csv(data, index=False)
    print('Output datafile: ', data)
    return data

In [7]:
# Label every stage-1 record with one of these types, picked at random.
# NOTE(review): the global name `type` shadows the Python builtin for the rest
# of the notebook; it is kept here in case other cells refer to it.
type = [
    "atype",
    "btype",
    "ctype",
    "dtype",
    "etype",
    "ftype",
]
add_type(data="./data_stage1.csv", type=type)

Starting with the datafile:  ./data_stage1.csv
Output datafile:  ./data_stage1.csv


'./data_stage1.csv'

In [8]:
# Stamp every stage-1 record with its own unique ID
add_id(data="./data_stage1.csv")

Starting with the datafile:  ./data_stage1.csv
Output datafile:  ./data_stage1.csv


'./data_stage1.csv'

In [12]:
# Take data through the manufacturing stage
# Each unique ID event would be transformed to indicate the result of manufacturing
def process(data, stage = 2, stageName='Manufacturing', process_time=3600):
    """Move every record of a stage file into the next processing stage.

    Reads the CSV at ``data``, sets ``stageNum`` and ``stageName`` on every
    row, advances each ``timestamp`` by ``process_time`` seconds, and writes
    the result to ``./data_stage<stage>.csv`` in the current working
    directory (without the DataFrame index, so no extra column accumulates
    from stage to stage).

    Input timestamps may be in the stage-1 format ``MM-DD-YYYY HH:MM:SS`` or
    in the ISO-8601 format this function itself writes, so the output of one
    call can be used as the input of the next stage.

    Parameters
    ----------
    data : str
        Path of the input CSV file. It is not modified unless it is also the
        output path for ``stage``.
    stage : int, default 2
        Stage number written to ``stageNum`` and used in the output filename.
    stageName : str, default 'Manufacturing'
        Label written to ``stageName``.
    process_time : int or float, default 3600
        Seconds added to every timestamp.

    Returns
    -------
    str
        Path of the CSV file that was written.

    Raises
    ------
    ValueError
        If a timestamp matches none of the accepted formats.
    """
    print('Starting with the datafile: ', data)
    print('Processing stage: ', stage)
    df = pd.read_csv(data)
    out_filename_string = "./data_stage" + str(stage) + ".csv"

    elapsed = datetime.timedelta(seconds=process_time)
    # Stage-1 format first, then the ISO forms produced by isoformat()
    # (the fractional variant appears when process_time is not whole seconds).
    accepted_formats = (
        "%m-%d-%Y %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
    )

    def advance(timestamp):
        # Parse with the first matching format, shift, and emit ISO-8601.
        for fmt in accepted_formats:
            try:
                parsed = datetime.datetime.strptime(timestamp, fmt)
            except ValueError:
                continue
            return (parsed + elapsed).isoformat()
        raise ValueError(
            "timestamp %r does not match any accepted format" % (timestamp,)
        )

    # Whole-column updates instead of per-cell ``.loc`` writes in a row loop.
    df["stageNum"] = stage
    df["stageName"] = stageName
    df["timestamp"] = [advance(ts) for ts in df["timestamp"]]

    # Do not write the index or it'll keep adding one column at each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string


In [13]:
# Run the stage-1 file through the manufacturing stage (default arguments)
process(data="./data_stage1.csv")


Starting with the datafile:  ./data_stage1.csv
Processing stage:  2
Output datafile:  ./data_stage2.csv


'./data_stage2.csv'

In [20]:
# Add a delay for certain types
def add_delay(data, delay=7200, region="europe", productStyle="style1"):
    """Push back the timestamp of a subset of records in a stage file.

    Reads the CSV at ``data``, adds ``delay`` seconds to the ``timestamp`` of
    every row whose ``region`` and ``productStyle`` columns equal the given
    values, and writes the result back to the same path without the
    DataFrame index. Rows outside the subset are left exactly as read.

    The file is rewritten in place, so calling this again on the same file
    adds the delay a second time.

    Parameters
    ----------
    data : str
        Path of the CSV file to update in place. Timestamps of the selected
        rows must be in ``YYYY-MM-DDTHH:MM:SS`` form.
    delay : int or float, default 7200
        Seconds added to the selected timestamps.
    region : str, default "europe"
        Value of the ``region`` column that selects a row.
    productStyle : str, default "style1"
        Value of the ``productStyle`` column that selects a row.

    Returns
    -------
    str
        The path that was passed in.

    Raises
    ------
    ValueError
        If a selected timestamp is not in the expected format.
    """
    print('Starting with the datafile: ', data)
    df = pd.read_csv(data)
    # We just rewrite the input file
    out_filename_string = data

    shift = datetime.timedelta(seconds=delay)
    # Rows that belong to the delayed subset
    affected = (df["region"] == region) & (df["productStyle"] == productStyle)

    def delayed(timestamp):
        # Parse the ISO timestamp, add the delay, and emit ISO-8601 again.
        parsed = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S")
        return (parsed + shift).isoformat()

    # Rebuild the column in one pass; untouched rows keep their original value.
    df["timestamp"] = [
        delayed(ts) if hit else ts
        for ts, hit in zip(df["timestamp"], affected)
    ]

    # Do not write the index or it'll keep adding one column at each stage
    df.to_csv(out_filename_string, index=False)
    print('Output datafile: ', out_filename_string)
    return out_filename_string

In [21]:
# Apply the default delay to the stage-2 file (rewritten in place)
add_delay(data="./data_stage2.csv")


Starting with the datafile:  ./data_stage2.csv
Output datafile:  ./data_stage2.csv


'./data_stage2.csv'