In [2]:
# data processing
import numpy as np
import pandas as pd
from datetime import date

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# directory that holds the Olist CSV extracts
# NOTE: this is a machine-specific absolute path -- to run the notebook
# elsewhere, this is the only line that has to change
DATA_DIR = 'C:/Users/short/Desktop/python_vscode/631_Assignment3/data'

# full path of the orders dataset
orders_file = f'{DATA_DIR}/olist_orders_dataset.csv'

# full path of the customer dataset
cust_file = f'{DATA_DIR}/olist_customers_dataset.csv'

In [4]:
def read_olist_data(file1, file2, use_cols, cutoff_date, verbose=False):
    """Read the orders file, attach customer attributes, and trim by date.

    Parameters
    ----------
    file1 : str or path-like
        CSV with the orders. It must provide the ``customer_id`` and
        ``order_purchase_timestamp`` columns used below.
    file2 : str or path-like
        CSV with the customers. Only ``customer_id``,
        ``customer_unique_id`` and ``customer_state`` are loaded from it.
    use_cols : list of str
        Columns to keep in the result. It has to contain
        ``customer_unique_id`` and ``order_purchase_timestamp`` because the
        result is sorted on those two columns after the selection.
    cutoff_date : datetime.date
        Last purchase date to keep (inclusive); later orders are dropped.
    verbose : bool, default False
        When truthy, print the row counts of both inputs and of the result.

    Returns
    -------
    pandas.DataFrame
        The merged orders restricted to ``use_cols``, sorted by
        ``customer_unique_id`` and then ``order_purchase_timestamp``.
        Besides the columns read from the files, these derived columns are
        available for selection: ``order_purchase_date`` (Python ``date``
        objects), ``order_date`` (the purchase timestamp truncated to
        midnight) and ``weekday`` (English day name).
    """
    # read the orders data
    orders = pd.read_csv(file1)
    if verbose:
        print (f'{len(orders):,} read from the orders file.')

    # parse the purchase timestamps once and derive everything from them
    purchase_ts = pd.to_datetime(orders['order_purchase_timestamp'])
    orders['order_purchase_timestamp'] = purchase_ts
    # calendar date as Python date objects (compared with cutoff_date below)
    orders['order_purchase_date'] = purchase_ts.dt.date
    # same calendar date as a datetime column: truncate the time to midnight
    # instead of re-parsing the date objects with a format string
    orders['order_date'] = purchase_ts.dt.normalize()
    # name of the weekday on which the order was placed
    orders['weekday'] = purchase_ts.dt.day_name()

    # read the file that contains the unique customer identifier;
    # only these three columns are needed
    cust = pd.read_csv(file2, usecols = ['customer_id', 'customer_unique_id', 'customer_state'])
    if verbose:
        print (f'{len(cust):,} read from the customer file.')

    # merge orders and cust dataframes using an inner join on customer_id
    orders_out = pd.merge(orders, cust, on = 'customer_id', how = 'inner')

    # discard (incomplete) data -- orders placed after cutoff_date;
    # orders placed on cutoff_date itself are kept
    mask = orders_out['order_purchase_date'] <= cutoff_date

    # keep the TRUE rows and the requested columns, then sort the output
    # by purchase timestamp within each customer
    orders_out = orders_out.loc[mask, use_cols].sort_values(['customer_unique_id', 'order_purchase_timestamp'])
    if verbose:
        print (f'{len(orders_out):,} records in the output  file.')
    return orders_out

In [5]:
# columns retained in the loaded orders frame for this exercise
use_cols = [
    'customer_unique_id',
    'order_id',
    'order_purchase_timestamp',
    'order_date',
    'weekday',
    'customer_state',
]
# (incomplete) data is discarded using this date as the loader's cutoff
cutoff_date = date(2018, 8, 22)
# load and merge both files; verbose=1 prints the row counts
orders = read_olist_data(
    file1=orders_file,
    file2=cust_file,
    use_cols=use_cols,
    cutoff_date=cutoff_date,
    verbose=1,
)

99,441 read from the orders file.
99,441 read from the customer file.
98,906 records in the output  file.


In [6]:
# get the state names from wikipedia
# NOTE: this cell needs network access, and it assumes the first table on
# the page lists the ISO code and the state name in its first two columns
# -- re-check if the page layout changes
states = pd.read_html('https://en.wikipedia.org/wiki/ISO_3166-2:BR')

# states[0] is the table we want from the list of tables read above;
# keep all rows and the first two columns (positions 0 and 1).
# .copy() makes br_states an independent frame, so adding a column below
# does not write into a slice of states[0]
br_states = states[0].iloc[:, 0:2].copy()
br_states.columns = ['cust_state', 'state_name']

# strip the literal 'BR-' country prefix so the codes match the
# customer_state values in the orders data
br_states['customer_state'] = br_states['cust_state'].str.replace('BR-', '', regex = False)
br_states = br_states.drop('cust_state', axis = 1).reset_index(drop = True)

# make this a dictionary we can use as a lookup table: code -> state name
repl = br_states.set_index('customer_state')['state_name'].to_dict()

# replace the state abbrevs with the names of each state -- only in the
# customer_state column, so no other column can be altered by accident;
# codes that are missing from the lookup are left unchanged
orders2 = orders.assign(customer_state = orders['customer_state'].replace(repl))

In [7]:
# preview the first rows of orders2 to check the customer_state replacement
orders2.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp,order_date,weekday,customer_state
52798,0000366f3b9a7992bf8c76cfdf3221e2,e22acc9c116caa3f2b7121bbb380d08e,2018-05-10 10:56:27,2018-05-10,Thursday,São Paulo
73889,0000b849f77a49e4a4ce2b2a4ca5be3f,3594e05a005ac4d06a72673270ef9ec9,2018-05-07 11:11:27,2018-05-07,Monday,São Paulo
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,2017-03-10 21:05:03,2017-03-10,Friday,Santa Catarina
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,2017-10-12 20:29:41,2017-10-12,Thursday,Pará
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,2017-11-14 19:45:42,2017-11-14,Tuesday,São Paulo
