In [1]:
# Required libraries
import datetime as dt
import matplotlib.pyplot as plt

import os
import time
import numpy as np
import pandas as pd

import random

In [None]:
# Constants
# Relative path of the folder that holds the input workbook and receives the
# generated CSV file.
my_data = "../data"

In [2]:
# Load data into Pandas dataframe
# The workbook path is the data folder joined with the file name.
datafile = "Online Retail.xlsx"
full_path = os.path.join(my_data, datafile)
df_sales = pd.read_excel(full_path)
# Show (rows, columns) as a quick sanity check of what was loaded.
print(df_sales.shape)

(541909, 8)


# Create list of objects

## Create list of items

In [3]:
# Genuine item IDs begin with a digit; rows with any other stock code are discarded.
code_starts_with_digit = df_sales['StockCode'].astype(str).str[:1].str.isnumeric()

# Only lines with a strictly positive quantity and a strictly positive unit price are kept.
has_positive_quantity = df_sales['Quantity'] > 0
has_positive_price = df_sales['UnitPrice'] > 0
df_sales = df_sales[code_starts_with_digit & has_positive_quantity & has_positive_price]

# The item list is every distinct (code, description, price) combination.
df_items = (
    df_sales[['StockCode', 'Description', 'UnitPrice']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_items.head()

Unnamed: 0,StockCode,Description,UnitPrice
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2.55
1,71053,WHITE METAL LANTERN,3.39
2,84406B,CREAM CUPID HEARTS COAT HANGER,2.75
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39
4,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39


## Create a frequency table of ordered items

This table is used to randomly select the items put into a customer sales order, so that the sampling is realistic.

In [4]:
# Items frequency table: for each stock code, the number of sales lines
# (with a non-null description) that reference it.
df_items_frequency = (
    df_sales
    .groupby('StockCode')['Description']
    .count()
    .reset_index(name='Occurences')
    .rename(index=str)
)
df_items_frequency.head()

Unnamed: 0,StockCode,Occurences
0,10002,71
1,10080,22
2,10120,30
3,10125,94
4,10133,198


## Create a frequency table of quantities

In [5]:
# Quantities frequency table: for each ordered quantity, the number of sales
# lines (with a non-null description) that use it.
df_qty_frequency = (
    df_sales
    .groupby('Quantity')['Description']
    .count()
    .reset_index(name='Occurences')
    .rename(index=str)
)
df_qty_frequency.head()

Unnamed: 0,Quantity,Occurences
0,1,146369
1,2,81363
2,3,36825
3,4,38328
4,5,11651


## Create a list of dummy customers

In [6]:
# Dummy customer names, 'customer1' through 'customer29'.
customers = ['customer' + str(number) for number in range(1, 30)]



## Create a distribution of the number of lines in an order

This distribution is used as a weight to randomly select the number of lines a new order should have.

In [7]:
# Per invoice, count its lines; the invoice number itself is dropped afterwards,
# leaving one row of counts per invoice.
df_order_lines = (
    df_sales
    .groupby('InvoiceNo')[['StockCode', 'Description']]
    .count()
    .reset_index(drop=True)
)

# Group invoices by their line count: 'Lines' is the number of lines and
# 'Occurences' is how many invoices have exactly that many lines.
df_order_histo = (
    df_order_lines
    .groupby('StockCode')
    .count()
    .reset_index()
    .rename(index=str, columns={'StockCode': 'Lines', 'Description': 'Occurences'})
)

df_order_histo.head(10)

Unnamed: 0,Lines,Occurences
0,1,1492
1,2,816
2,3,694
3,4,676
4,5,669
5,6,618
6,7,597
7,8,600
8,9,624
9,10,542


In [8]:
# Empty the sales dataframe while keeping its columns.
# A positional empty slice is used instead of the label slice .loc[0:-1], which
# only produced an empty frame because the index is sorted and has no label -1.
df_sales = df_sales.iloc[0:0]

In [9]:
# Seeded from system entropy, so every run generates different invoices.
# Pass an integer to random.seed() to make the generated file reproducible.
random.seed()

# Generation parameters
NB_INVOICES = 100
FIRST_INVOICE_ID = 70000
START_DATE = dt.date(2018, 1, 1)

# random.choices() indexes its population by integer position, so it is given
# plain lists instead of pandas Series (whose [] operator is label-based).
line_counts = df_order_histo['Lines'].tolist()
line_weights = df_order_histo['Occurences'].tolist()
stock_codes = df_items_frequency['StockCode'].tolist()
stock_weights = df_items_frequency['Occurences'].tolist()
quantities = df_qty_frequency['Quantity'].tolist()
quantity_weights = df_qty_frequency['Occurences'].tolist()

# Lookup built once: stock code -> (description, unit price) of the first row
# of df_items carrying that code (a code may appear on several rows).
item_catalog = {}
for code, description, unit_price in zip(df_items['StockCode'],
                                         df_items['Description'],
                                         df_items['UnitPrice']):
    item_catalog.setdefault(code, (description, unit_price))

# Create several invoices. Rows are collected in a list and turned into a
# DataFrame once at the end: DataFrame.append no longer exists in pandas 2.x and
# growing a frame row by row is quadratic.
records = []
for i in range(NB_INVOICES):
    # Choose randomly a customer, a date and a number of lines for the order
    customer = random.choice(customers)
    invoice_date = START_DATE + dt.timedelta(random.random()*365)
    nb_lines = random.choices(line_counts, weights=line_weights)[0]
    invoice_id = FIRST_INVOICE_ID + i
    print('Customer {0} - date {1} - lines {2}'.format(customer, invoice_date, nb_lines))
    # Create line items for each order ID
    for _ in range(nb_lines):
        stock_code = random.choices(stock_codes, weights=stock_weights)[0]
        item_desc, item_price = item_catalog[stock_code]
        quantity = random.choices(quantities, weights=quantity_weights)[0]
        records.append({'InvoiceNo': invoice_id,
                        'StockCode': stock_code,
                        'Description': item_desc,
                        'Quantity': quantity,
                        'UnitPrice': item_price,
                        'InvoiceDate': invoice_date,
                        'CustomerID': customer,
                        'Country': 'United Kingdom'})

# df_sales was emptied beforehand, so it only contributes its column order here
# (existing columns first, then any new ones). Rebuilding the frame instead of
# appending also means re-running this cell does not duplicate invoice IDs.
df_generated = pd.DataFrame(records)
df_sales = df_generated.reindex(
    columns=df_sales.columns.union(df_generated.columns, sort=False))


Customer customer25 - date 2018-04-19 - lines 4
Customer customer28 - date 2018-05-15 - lines 3
Customer customer21 - date 2018-08-28 - lines 3
Customer customer18 - date 2018-03-26 - lines 35
Customer customer24 - date 2018-02-02 - lines 22
Customer customer27 - date 2018-02-21 - lines 15
Customer customer2 - date 2018-08-31 - lines 3
Customer customer3 - date 2018-09-28 - lines 32
Customer customer7 - date 2018-08-29 - lines 32
Customer customer24 - date 2018-03-17 - lines 2
Customer customer19 - date 2018-12-21 - lines 9
Customer customer1 - date 2018-05-16 - lines 15
Customer customer22 - date 2018-02-22 - lines 25
Customer customer24 - date 2018-03-18 - lines 55
Customer customer21 - date 2018-09-18 - lines 1
Customer customer21 - date 2018-03-07 - lines 3
Customer customer16 - date 2018-08-02 - lines 4
Customer customer20 - date 2018-05-08 - lines 5
Customer customer5 - date 2018-06-09 - lines 20
Customer customer2 - date 2018-11-04 - lines 25
Customer customer3 - date 2018-07-28

In [10]:
# Store invoice numbers as 64-bit integers
df_sales = df_sales.astype({'InvoiceNo': 'int64'})

In [11]:
# Save file to CSV
# The output goes to the data folder; `datafile` and `full_path` are overwritten
# with the output file name and path.
datafile = "datazon_random_invoices.csv"
full_path = os.path.join(my_data, datafile)
# index=False: the DataFrame's row index is not written to the file.
df_sales.to_csv(full_path, encoding="utf-8", index=False)
