In [None]:
def get_contingency_matrix(order_details, verbose=0):
    """Build a product co-occurrence (contingency) matrix from order lines.

    Parameters
    ----------
    order_details : pandas.DataFrame
        Order lines; the 'order_id' and 'product_id' columns are read.
    verbose : int, default 0
        Any value above 0 prints progress and timing messages.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose cell (a, b) counts the orders that contain both
        product a and product b; the diagonal counts the orders containing
        each product. Rows and columns are ordered by descending column total.
    """
    n_orders = order_details['order_id'].nunique()
    # Report progress roughly every 0.1% of orders. Clamp to 1 so the modulo
    # below never divides by zero when there are fewer than 1000 orders.
    progress_step = max(n_orders // 1000, 1)
    combo_list = []
    start = time.time()

    # loop through order history by order ID
    for i, (_, order_products) in enumerate(order_details.groupby('order_id')):

        if verbose > 0 and i > 0 and i % progress_step == 0:
            print('Building list... processing orders ({:.1f}%)'.format(100*(i+1)/n_orders), end='\r')

        # Each product counts once per order, however many lines it has
        product_ids = list(order_products['product_id'].unique())

        # Every ordered pair, including (a, a), so the crosstab is symmetric
        # and its diagonal holds per-product order counts
        combo_list.extend(itertools.product(product_ids, product_ids))

    # Convert list of product combos to dataframe
    item_list = pd.DataFrame(combo_list, columns=['a', 'b'])

    # Create crosstab matrix from dataframe
    if verbose > 0:
        print('Performing cross tabulation...', ' '*50, end='\r')
    matrix = pd.crosstab(item_list.a, item_list.b)

    # Sort column values by purchase quantity (in descending order)
    sorted_names = matrix.sum().sort_values(ascending=False).index.tolist()

    if verbose > 0:
        print('Complete ({:.0f}s)'.format(time.time()-start), ' '*50)

    # return a sorted contingency matrix
    return matrix.loc[sorted_names, sorted_names]

In [None]:
def get_contingency_matrix(order_details, verbose=0):
    """Build a product co-occurrence (contingency) matrix from order lines.

    Unordered product pairs are tallied once per order under a frozenset key
    and then mirrored into both (a, b) and (b, a) cells, so the per-order
    work and the intermediate storage do not hold every ordered pair.

    Parameters
    ----------
    order_details : pandas.DataFrame
        Order lines; the 'order_id' and 'product_id' columns are read.
    verbose : int, default 0
        Any value above 0 prints progress and timing messages.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose cell (a, b) counts the orders that contain both
        product a and product b; the diagonal counts the orders containing
        each product. Rows and columns are ordered by descending column total.
    """
    from collections import Counter

    n_orders = order_details['order_id'].nunique()
    # Report progress roughly every 0.1% of orders. Clamp to 1 so the modulo
    # below never divides by zero when there are fewer than 1000 orders.
    progress_step = max(n_orders // 1000, 1)
    pair_counts = Counter()
    start = time.time()

    # loop through order history by order ID
    for i, (_, order_products) in enumerate(order_details.groupby('order_id')):

        if verbose > 0 and i > 0 and i % progress_step == 0:
            print('Building list... processing orders ({:.1f}%)'.format(100*(i+1)/n_orders), end='\r')

        # Each product counts once per order, however many lines it has
        product_ids = list(order_products['product_id'].unique())

        # frozenset takes a single iterable; (a, a) collapses to {a}, which
        # is the key for a product's own order count
        pair_counts.update(
            frozenset(pair)
            for pair in itertools.combinations_with_replacement(product_ids, 2)
        )

    # Expand each unordered pair into its symmetric (a, b) / (b, a) records
    records = []
    for pair, n in pair_counts.items():
        members = tuple(pair)
        a, b = members[0], members[-1]
        records.append((a, b, n))
        if a != b:
            records.append((b, a, n))
    item_list = pd.DataFrame(records, columns=['a', 'b', 'n'])

    # Create crosstab matrix from dataframe
    if verbose > 0:
        print('Performing cross tabulation...', ' '*50, end='\r')
    matrix = item_list.pivot(index='a', columns='b', values='n')
    # Pairs that never co-occur are missing after the pivot; they count as 0
    matrix = matrix.fillna(0).astype(int)

    # Sort column values by purchase quantity (in descending order)
    sorted_names = matrix.sum().sort_values(ascending=False).index.tolist()

    if verbose > 0:
        print('Complete ({:.0f}s)'.format(time.time()-start), ' '*50)

    # return a sorted contingency matrix
    return matrix.loc[sorted_names, sorted_names]

In [None]:
def freq(iterable):
    """Return frequency counts for items or item pairs as a Series named 'freq'.

    A pandas Series (or subclass) is counted with ``value_counts``; any other
    iterable is counted with ``collections.Counter``.
    """
    if isinstance(iterable, pd.Series):
        return iterable.value_counts().rename("freq")
    return pd.Series(Counter(iterable)).rename("freq")

    
def order_count(order_item):
    """Return the number of distinct labels in ``order_item.index``."""
    distinct_labels = {label for label in order_item.index}
    return len(distinct_labels)


def get_item_pairs(order_item):
    """Yield every 2-item combination within each order, one pair at a time.

    ``order_item`` is reset to a two-column array whose first column is the
    former index (used as the order key) and whose second column is the item.
    ``itertools.groupby`` only merges adjacent rows, so rows belonging to the
    same order must already be contiguous.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # replacement named by its deprecation notice
    rows = order_item.reset_index().values
    for _, order_rows in groupby(rows, lambda row: row[0]):
        items = [row[1] for row in order_rows]
        yield from combinations(items, 2)

In [None]:
from mlxtend.preprocessing import OnehotTransactions

In [None]:
def create_oht_df(order_details, verbose=0):
    """Encode orders as a one-hot order-by-product DataFrame.

    Parameters
    ----------
    order_details : pandas.DataFrame
        Order lines; the 'order_id' and 'product_id' columns are read.
    verbose : int, default 0
        Any value above 0 prints a progress counter while orders are
        flattened.

    Returns
    -------
    pandas.DataFrame
        One row per order id and one column per entry of the encoder's
        ``columns_``, stored as ``numpy.int8``.
    """
    order_details_flattened = []
    order_ids = []
    n_orders = order_details['order_id'].nunique()
    # Report progress roughly every 0.1% of orders. Clamp to 1 so the modulo
    # below never divides by zero when there are fewer than 1000 orders.
    progress_step = max(n_orders // 1000, 1)

    # loop through each order & flatten order to a single line
    for i, (group, data) in enumerate(order_details.groupby('order_id')):

        if verbose > 0 and i > 0 and i % progress_step == 0:
            print('{}/{} ({:.1f}%)'.format(i+1, n_orders, 100*(i+1)/n_orders), end='\r')

        # keep the order id and its product ids at the same position in
        # both lists so rows and index labels line up later
        order_ids.append(group)
        order_details_flattened.append(list(data['product_id'].values))

    # create one hot transaction
    oht = TransactionEncoder()

    # convert our flattened order data structure
    oht_arr = oht.fit(order_details_flattened).transform(order_details_flattened)

    # convert results to a dataframe and return
    return pd.DataFrame(oht_arr, columns=oht.columns_, index=order_ids, dtype=np.int8)

In [None]:
from mlxtend.preprocessing import TransactionEncoder

In [None]:
def _format_elapsed(seconds):
    """Format a duration in seconds as '<m>min <s>s', omitting zero minutes."""
    minutes = seconds // 60
    prefix = '{:.0f}min '.format(minutes) if minutes > 0 else ''
    return prefix + '{:.0f}s'.format(seconds % 60)


def product_contingency_matrix(data, dtype=np.uint32, sort_matrix=False, verbose=0):
    """Build a symmetric product co-occurrence matrix from order lines.

    Parameters
    ----------
    data : pandas.DataFrame
        Order lines; the 'order_id' and 'product_id' columns are read.
    dtype : callable numpy scalar type, default numpy.uint32
        Type of the matrix cells. It is also called on each count, so it
        must be a scalar type such as ``np.uint32`` rather than a string.
    sort_matrix : bool, default False
        When true, order rows and columns by descending diagonal value
        (the number of orders containing each product).
    verbose : int, default 0
        Any value above 0 prints progress and timing messages.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed by product id on both axes. Cell (a, b) is the
        number of orders containing both products; the diagonal is the
        number of orders containing each product.
    """
    # Function start time
    start = time.time()

    # --- Part A: Product Combo Counts ---

    # Part A start time
    start_a = time.time()

    # Number of unique orders to process
    n_orders = data['order_id'].nunique()
    # Progress is reported roughly every 0.1%; clamp to 1 so the modulo
    # never divides by zero when there are fewer than 1000 orders
    order_step = max(n_orders // 1000, 1)

    # Initialize counter
    combo_counter = Counter()

    for i, (_, order_products) in enumerate(data.groupby('order_id')):

        if verbose > 0 and i % order_step == 0:
            print('Getting product counts... ({:.1f}%)'.format(100*(i+1)/n_orders), end='\r')

        # Get unique list of products in current order
        product_ids = list(order_products['product_id'].unique())

        # Increment counter for all 2-product combinations
        for a, b in itertools.combinations(product_ids, 2):
            combo_counter[frozenset({a, b})] += 1

        # Increment counter for independent products (i.e. combined with self)
        for a in product_ids:
            combo_counter[frozenset({a})] += 1

    if verbose > 0:
        print('Product counts completed [{}]'.format(_format_elapsed(time.time()-start_a)), ' '*50)

    # --- Part B: Build Contingency Matrix ---

    # Part B start time
    start_b = time.time()

    # Number of combinations to process
    n_combos = len(combo_counter)
    combo_step = max(n_combos // 1000, 1)
    # List of unique product ids
    unique_products = data['product_id'].unique()
    # Number of unique products
    n_products = unique_products.shape[0]
    # Product id -> row/column position, so each combination is placed with
    # a constant-time lookup instead of a scan over all products
    position = {product: pos for pos, product in enumerate(unique_products)}

    # Create numpy array of zeros to populate with counts
    count_arr = np.zeros((n_products, n_products), dtype=dtype)

    for k, (p_set, n) in enumerate(combo_counter.items()):
        # Convert count to reduce memory consumption
        n = dtype(n)

        if verbose > 0 and k % combo_step == 0:
            print('Populating table... ({:.1f}%)'.format(100*(k+1)/n_combos), end='\r')

        located = [position.get(product) for product in p_set]
        # Ids that cannot be located (e.g. NaN, which never compares equal)
        # are skipped and leave their cells at zero
        if None in located:
            continue

        # A one-element set is a product paired with itself, so both
        # assignments then hit the same diagonal cell
        row, col = located[0], located[-1]
        count_arr[row, col] = n
        count_arr[col, row] = n

    # Counter no longer needed
    del combo_counter

    df = pd.DataFrame(data=count_arr, index=unique_products, columns=unique_products, dtype=dtype)

    # Numpy array no longer needed
    del count_arr

    if verbose > 0:
        print('Populating table completed [{}]'.format(_format_elapsed(time.time()-start_b)), ' '*50)

    # --- Part C: Sort Matrix (Optional) ---
    if sort_matrix:
        start_c = time.time()
        if verbose > 0:
            print('Sorting values...', end='\r')

        # Get list of sorted ids by total counts
        sorted_ids = pd.Series(np.diag(df), index=df.index).sort_values(ascending=False).index.tolist()
        # Sort dataframe by sorted ids
        df = df.loc[sorted_ids, sorted_ids]

        if verbose > 0:
            print('Sorting values completed [{}]'.format(_format_elapsed(time.time()-start_c)))

    if verbose > 0:
        print('Total Runtime: [{}]'.format(_format_elapsed(time.time()-start)))

    return df