In [1]:
# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from IPython.display import clear_output
import copy

<div class="span5 alert alert-info">
<h3>Define Useful Functions</h3>
</div>

In [2]:
#getRecommendataionsFromCategoryGroupDict
#  Take a list of orders, a unique customer id in the orders and a Category DataFrame (columns represent 
#  categories of the Orders, and each row is a Group that sums to 100% with this percentage spread out 
#  among the columns and every row is unique) and return a dictionary of product ids as keys and the cosine
#  simularity between this users orders and a row of the Category DataFrame.  
#
#  Only products that the customer has not bought before will appear in the returned dictionary, and products
#  that are repeated will be listed once with its highest simularity score.  In addition, the user passes in
#  a minimum simularity score, and any Group whose simularity with this user is below this threshold will not
#  appear in the returned recommendations
#
#  orders - a list of orders from the OL_Data dataset, either the whole thing or simplified columns
#  custUniqueId - the ID of the customer to create recommendations for
#  productColName - name of the column that contains the unique product identifier
#  customerColName - name of the column that contains the unique customer identifier
#  categoryGroupsDF - Category DataFrame as described above, columns are categories, rows are Groups
#  categoryGroupsDF_DropCols - Columns in categoryGroupsDF to ignore
#  categoryGroupsDF_CatColumnName - the column name in the orders that contains the categories in the columns
#                                   of categoryGroupsDF
#  categoryGroupsDF_groupIdName - the column name of the unique Group (row) identifier in categoryGroupsDF
#  minimumSimularity - value between 0 and 1, any simularity lower than this will be ignored
#  includeRFM (optional) - Boolean to include/exclude the RFM score in recommendation
#  RFM_ColName (optional) - Column for RFM Score, only used if includeRFM is True
def getRecommendataionsFromCategoryGroupDict(orders,
                                             custUniqueId,
                                             
                                             productColName,
                                             customerColName,
                                             
                                             categoryGroupsDF,
                                             categoryGroupsDF_DropCols,
                                             categoryGroupsDF_CatColumnName,
                                             categoryGroupsDF_groupIdName,
                                             
                                             minimumSimularity,
                                             
                                             includeRFM = False,
                                             RFM_ColName = None):

    #Remove ignored columns from CategoryGroupsDF, and set index
    catGroupsDF_Indexed = categoryGroupsDF.drop(columns=categoryGroupsDF_DropCols).set_index(categoryGroupsDF_groupIdName)
    
    #Get all categories in the Group
    all_categories = list(catGroupsDF_Indexed.columns)
    all_categories = [str(x) for x in all_categories]    
    
    #Set up the Product Recommendation Dict, empty dict returned if no user orders, or no simularity high enough
    RecommendationDict = {}    
    RecommendationDict['IdType']=categoryGroupsDF_groupIdName+"_CosSim"
    
    #Create the Row for the customer to use for cosine simularity
    thiscustSubset = orders[orders[customerColName] == custUniqueId]
    thisCustCategories = np.array([len(thiscustSubset[thiscustSubset[categoryGroupsDF_CatColumnName] == x]) for x in all_categories])
    
    #No user orders
    if(sum(thisCustCategories) == 0):
        return RecommendationDict
    
    #Normalize customer's row to percentages in categories, and create DF
    thisCustCategories = 100*thisCustCategories/sum(thisCustCategories)
    thisCustCategories.shape = (1, len(thisCustCategories))
    custRowDF = pd.DataFrame(data=thisCustCategories, columns=all_categories)    

    #Calculate simularity scores, and add to dataframe for sorting against Group ID
    simularity = cosine_similarity(catGroupsDF_Indexed, custRowDF)
    simDF = pd.DataFrame(simularity, columns=['Simularity'])
    simDF[categoryGroupsDF_groupIdName] = catGroupsDF_Indexed.reset_index()[categoryGroupsDF_groupIdName]
    
    #Get orders with product_ids this user has not bought
    thisCustomerProducts = thiscustSubset[productColName].unique()
    newProductOrders = OL_Data_Simp[~OL_Data_Simp[productColName].isin(thisCustomerProducts)]    
    
    #Filter out Group IDs that are not simular enough
    SimilarEnoughDF = simDF[simDF.Simularity>=minimumSimularity].sort_values(by='Simularity', ascending=False) 
    for index, row in SimilarEnoughDF.iterrows():
        
        prob = row['Simularity']
            
        if includeRFM and RFM_ColName != None:
            maxRFM = max(orders[RFM_ColName])
            
            #Get Product Ids, keeping the id with the highest specified RFM
            thisProdRecs = newProductOrders[newProductOrders[categoryGroupsDF_groupIdName]==row[categoryGroupsDF_groupIdName]][[productColName, RFM_ColName]]
            thisProdRecs = thisProdRecs.sort_values(by=RFM_ColName)
            thisProdRecs = thisProdRecs.drop_duplicates(subset=[productColName])
            
            #Go through collected Product Ids, replacing existing if new score is higher
            #or adding new
            for i,row in thisProdRecs.iterrows():
                adjustedProb = prob+row[RFM_ColName]/maxRFM
                
                if(row[productColName] in RecommendationDict.keys()):
                    RecommendationDict[row[productColName]] = max(RecommendationDict[row[productColName]], adjustedProb)
                else:
                    RecommendationDict[row[productColName]] = adjustedProb
        else:
            #Get product_ids of this Group, and add those not already in the Recommendations
            thisProdRecs = newProductOrders[newProductOrders[categoryGroupsDF_groupIdName] == row[categoryGroupsDF_groupIdName]][productColName].unique()

            for prod in thisProdRecs:
                if(prod in RecommendationDict.keys()):
                    continue
                
                RecommendationDict[prod] = prob
    #Return the product recommendations
    return RecommendationDict


In [3]:
#getRecommendataionsFromCategoryGroupDict
#  Take a list of orders, a unique customer id in the orders and a Category DataFrame (columns represent 
#  categories of the Orders, and each row is a Group that sums to 100% with this percentage spread out 
#  among the columns and every row is unique) and return a dictionary of product ids as keys and the cosine
#  simularity between this users orders and a row of the Category DataFrame.  
#
#  Only products that the customer has not bought before will appear in the returned dictionary, and products
#  that are repeated will be listed once with its highest simularity score.  In addition, the user passes in
#  a minimum simularity score, and any Group whose simularity with this user is below this threshold will not
#  appear in the returned recommendations
#
#  orders - a list of orders from the OL_Data dataset, either the whole thing or simplified columns
#  custUniqueId - the ID of the customer to create recommendations for
#  productColName - name of the column that contains the unique product identifier
#  customerColName - name of the column that contains the unique customer identifier
#  categoryGroupsDF - Category DataFrame as described above, columns are categories, rows are Groups
#  categoryGroupsDF_DropCols - Columns in categoryGroupsDF to ignore
#  categoryGroupsDF_CatColumnName - the column name in the orders that contains the categories in the columns
#                                   of categoryGroupsDF
#  categoryGroupsDF_groupIdName - the column name of the unique Group (row) identifier in categoryGroupsDF
#  minimumSimularity - value between 0 and 1, any simularity lower than this will be ignored
#  includeRFM (optional) - Boolean to include/exclude the RFM score in recommendation
#  RFM_ColName (optional) - Column for RFM Score, only used if includeRFM is True
def getRecommendataionsFromSimGroup(orders,
                                    custUniqueId,
                                             
                                    productColName,
                                    customerColName,
                                    
                                    SimularityDF,
                                    groupIdCol,
                                    CustGroupId,
                                    minimumSimularity,
                                    
                                    includeRFM = False,
                                    RFM_ColName = None):
    
    #Set up the Product Recommendation Dict, empty dict returned if no user orders, or no simularity high enough
    RecommendationDict = {}    
    RecommendationDict['IdType']=categoryGroupsDF_groupIdName+"_SimScoreTable"
    
    #Create the Row for the customer to use for cosine simularity
    thiscustSubset = orders[orders[customerColName] == custUniqueId]
    
    #Calculate simularity scores, and add to dataframe for sorting against Group ID
    simularity = SimularityDF[CustGroupId]
    simularity = simularity*(simularity>=minimumSimularity)
    groups = SimularityDF[groupIdCol]
    
    #Get orders with product_ids this user has not bought
    thisCustomerProducts = thiscustSubset[productColName].unique()
    newProductOrders = OL_Data_Simp[~OL_Data_Simp[productColName].isin(thisCustomerProducts)]    
    
    #Filter out Group IDs that are not simular enough
    for i in range(len(simularity)):
        if(simularity == 0):
            continue
        
        prob = simularity[i]
            
        if includeRFM and RFM_ColName != None:
            maxRFM = max(orders[RFM_ColName])
            
            #Get Product Ids, keeping the id with the highest specified RFM
            thisProdRecs = newProductOrders[newProductOrders[groupIdCol]==CustGroupId][[productColName, RFM_ColName]]
            thisProdRecs = thisProdRecs.sort_values(by=RFM_ColName)
            thisProdRecs = thisProdRecs.drop_duplicates(subset=[productColName])
            
            #Go through collected Product Ids, replacing existing if new score is higher
            #or adding new
            for i,row in thisProdRecs.iterrows():
                adjustedProb = prob+row[RFM_ColName]/maxRFM
                
                if(row[productColName] in RecommendationDict.keys()):
                    RecommendationDict[row[productColName]] = max(RecommendationDict[row[productColName]], adjustedProb)
                else:
                    RecommendationDict[row[productColName]] = adjustedProb
        else:
            #Get product_ids of this Group, and add those not already in the Recommendations
            thisProdRecs = newProductOrders[newProductOrders[groupIdCol] == CustGroupId][productColName].unique()

            for prod in thisProdRecs:
                if(prod in RecommendationDict.keys()):
                    continue
                
                RecommendationDict[prod] = prob
    #Return the product recommendations
    return RecommendationDict


In [4]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples
from IPython.display import clear_output
import copy


#getRecommendataionsFromAglomCluster
#  Take a list of orders, a unique customer id in the orders and a Category DataFrame (columns represent 
#  categories of the Orders, and each row is a Group that sums to 100% with this percentage spread out 
#  among the columns and every row is unique) and return a dictionary of product ids as keys and the silouette
#  score of the Groups in the same cluster as the customer
#
#  Only products that the customer has not bought before will appear in the returned dictionary, and products
#  that are repeated will be listed once with its highest silouette score.  In addition, the user passes in
#  a minimum simularity score, and any Group whose simularity with this user is below this threshold will not
#  appear in the returned recommendations
#
#  orders - a list of orders from the OL_Data dataset, either the whole thing or simplified columns
#  custUniqueId - the ID of the customer to create recommendations for
#  productColName - name of the column that contains the unique product identifier
#  customerColName - name of the column that contains the unique customer identifier
#  categoryGroupsDF - Category DataFrame as described above, columns are categories, rows are Groups
#  categoryGroupsDF_DropCols - Columns in categoryGroupsDF to ignore
#  categoryGroupsDF_CatColumnName - the column name in the orders that contains the categories in the columns
#                                   of categoryGroupsDF
#  categoryGroupsDF_groupIdName - the column name of the unique Group (row) identifier in categoryGroupsDF
#  minimumSimularity - value between 0 and 1, any simularity lower than this will be ignored
#  includeRFM (optional) - Boolean to include/exclude the RFM score in recommendation
#  RFM_ColName (optional) - Column for RFM Score, only used if includeRFM is True
def getRecommendataionsFromAglomCluster(orders,
                                        custUniqueId,
                                        
                                         productColName,
                                         customerColName,

                                         categoryGroupsDF,
                                         categoryGroupsDF_DropCols,
                                         categoryGroupsDF_CatColumnName,
                                         categoryGroupsDF_groupIdName,

                                         minimumSimularity,
                                         includeRFM = False,
                                         RFM_ColName = None):
    
    #Remove ignored columns from CategoryGroupsDF, and set index
    catGroupsDF_Indexed = categoryGroupsDF.drop(columns=categoryGroupsDF_DropCols).set_index(categoryGroupsDF_groupIdName).sort_values(by=categoryGroupsDF_groupIdName)
    
    #Get all categories in the Group
    all_categories = list(catGroupsDF_Indexed.columns)
    all_categories = [str(x) for x in all_categories]    
    
    #Set up the Product Recommendation Dict, empty dict returned if no user orders, or no simularity high enough
    RecommendationDict = {}    
    RecommendationDict['IdType']=categoryGroupsDF_groupIdName+"_Aglom"
    
    #Get the Row for the customer to use Agglomerative Clustering
    thiscustSubset = orders[orders[customerColName] == custUniqueId]
    thisCustCategories = np.array([len(thiscustSubset[thiscustSubset[categoryGroupsDF_CatColumnName] == x]) for x in all_categories])
    
    #No user orders
    if(sum(thisCustCategories) ==0):
        return RecommendationDict
    
    #Normalize customer's row to percentages in categories, and create DF
    thisCustCategories = 100*thisCustCategories/sum(thisCustCategories)
    thisCustCategories.shape = (1, len(thisCustCategories))
    custRowDF = pd.DataFrame(data=thisCustCategories, columns=all_categories)    
    

    #Add customer to Cat Groups for clustering,
    allDF = pd.concat([catGroupsDF_Indexed,custRowDF], ignore_index=True)
    model = AgglomerativeClustering(n_clusters=30, linkage="average").fit(allDF)
    #Get clusters for customer and others
    clusterAssignments = model.labels_
    cusCluster = clusterAssignments[-1]
    otherClusters = clusterAssignments[:-1]
    #Get silouette scores for customer and others
    scores = silhouette_samples(allDF,clusterAssignments, metric ='cosine')
    cusScore = scores[-1]
    otherScores = scores[:-1]
    
    #Add cluster assignments and silouette to Category DF
    catGroupsDF_Indexed['assignedTo'] = otherClusters
    catGroupsDF_Indexed['s_score'] = otherScores
    catGroupsDF_Indexed = catGroupsDF_Indexed.reset_index()
    
    #Get groups in same cluster of user and filter out sillouette score lower than threshold
    group = catGroupsDF_Indexed[catGroupsDF_Indexed['assignedTo']==cusCluster]
    group = group[group['s_score']>=minimumSimularity].sort_values(by='s_score')
    
    #Get orders with products the user has not bought
    thisCustomerProducts = thiscustSubset[productColName].unique()
    newProductOrders = OL_Data_Simp[~OL_Data_Simp[productColName].isin(thisCustomerProducts)]
    
    
    for i,row in group.iterrows():
        prob = row['s_score']
        
        if includeRFM and RFM_ColName != None:
            maxRFM = max(orders[RFM_ColName])
            
            #Get Product Ids, keeping the id with the highest specified RFM
            thisProdRecs = newProductOrders[newProductOrders[categoryGroupsDF_groupIdName] == row[categoryGroupsDF_groupIdName]][[productColName, RFM_ColName]]
            thisProdRecs = thisProdRecs.sort_values(by=RFM_ColName)
            thisProdRecs = thisProdRecs.drop_duplicates(subset=[productColName])
            
            #Go through collected Product Ids, replacing existing if new score is higher
            #or adding new
            for i,row in thisProdRecs.iterrows():
                adjustedProb = prob+row[RFM_ColName]/maxRFM
                
                if(row[productColName] in RecommendationDict.keys()):
                    RecommendationDict[row[productColName]] = max(RecommendationDict[row[productColName]], adjustedProb)
                else:
                    RecommendationDict[row[productColName]] = adjustedProb
        else:
            #Get product_ids of this Group, and add those not already in the Recommendations
            thisProdRecs = newProductOrders[newProductOrders[categoryGroupsDF_groupIdName] == row[categoryGroupsDF_groupIdName]][productColName].unique()

            for prod in thisProdRecs:
                if(prod in RecommendationDict.keys()):
                    continue
                
                RecommendationDict[prod] = prob
    #return recommendations
    return RecommendationDict


In [5]:
#getRecommendataionsForCustomer
#  Take a list of orders, a unique customer id in the orders and a dictionary containing Category Groups of the form:
#      key : Unique Group Identifier in Category Group
#      value : dict{
#                     'Groups': DF of Groups (rows) and Categories (columns),
#                     'DropCols': columns to ignore in the DF,
#                     'CategoryColName': column name in orders that contains values used as columns for DF
#                  }
#
#  For each Category Group, find recommendations using cosine simularity and Agglomerative clustering that 
#  are above a minimum threshold provided. 
#
#  Gather the recommendations into a DataFrame and sum them into a column called Total.  Return a DataFrame
#  of recommendations for the user sorted by the total score
#
#  orders - a list of orders from the OL_Data dataset, either the whole thing or simplified columns
#  custUniqueId - the ID of the customer to create recommendations for
#  CatgoryGroupDict - Dictionary of Category Groups, as discribed above
#  minimumSimularity - value between 0 and 1, any simularity lower than this will be ignored
#  includeRFM (optional) - Boolean to include/exclude the RFM score in recommendation
#  RFM_ColName (optional) - Column for RFM Score, only used if includeRFM is True
def getRecommendataionsForCustomer(orders,
                                   customerUniqueId,
                                    CategoryGroupDict,
                                    minimumSimularity,
                                    includeRFM = False):
    #Array to make DF from
    recomendationArray = []
    
    for i, (k,v) in enumerate(CategoryGroupDict.items()):
            #Add Cosine Simularity Recommendations to the Array
            recomendationArray.append(
                getRecommendataionsFromCategoryGroupDict(
                                        orders,
                                        customerUniqueId,
                                        'product_id',
                                        'customer_unique_id',
                                        v['Groups'],
                                        v['DropCols'],
                                        v['CategoryColName'],
                                        k,
                                        minimumSimularity,
                                        includeRFM,
                                        v['RFMCol'])
            )
            
            #Add Agglomerative Simularity Recommendations to the Array
            recomendationArray.append(
                getRecommendataionsFromAglomCluster(
                                        orders,
                                        customerUniqueId,
                                        'product_id',
                                        'customer_unique_id',
                                        v['Groups'],
                                        v['DropCols'],
                                        v['CategoryColName'],
                                        k,
                                        minimumSimularity,
                                        includeRFM,
                                        v['RFMCol'])
            )
    
    #Create Product Rec DF from array, and fill NA with 0
    recs = pd.DataFrame(recomendationArray)
    allRecs = recs.rename(columns={'IdType':'index'}).set_index('index').T.reset_index().rename(columns={'index':'product_id'}).fillna(0)
    
    #Create Total Column, a sum of all the other recommendation scores
    scoreTypes = recs.IdType.unique()
    allRecs['Total'] = np.zeros(len(allRecs))
    for scoreType in scoreTypes:
        allRecs['Total'] = allRecs['Total']+allRecs[scoreType]
    
    #Return Recommendations
    return allRecs.sort_values(by='Total', ascending=False)
        

<div class="span5 alert alert-info">
<h3>Import Data</h3>
</div>

In [6]:
#Import Order Data
OL_Data = pd.read_csv("./OL_Data_Cat_Simplified.csv")

OL_Data_RFM = pd.read_csv("./OLData_RFM_Metrics.csv")
buyerRFMDF = OL_Data_RFM[['customer_unique_id','Buyer_OverallRFMScore']].drop_duplicates()
sellerRFMDF = OL_Data_RFM[['seller_id','Seller_OverallRFMScore']].drop_duplicates()

In [7]:
#Simplify the Orders Data Frame
OL_Data_Simp = OL_Data[['order_id',
                        'customer_unique_id',
                        'product_id',
                        'product_category_name_english',
                        'seller_id',
                        'SellerGroupId',
                        'CustomerGroupId',
                        'CustomerProductGroupId']]
OL_Data_Simp = OL_Data_Simp.drop_duplicates()
OL_Data_Simp = OL_Data_Simp.merge(buyerRFMDF, how='left', on='customer_unique_id') 
OL_Data_Simp = OL_Data_Simp.merge(sellerRFMDF, how='left', on='seller_id') 

In [8]:
#Generate Category Groups Dict to Generate Recommendations
CategoryGroups = {}

#Add Seller-Category Groups
SellerCategoryGroups = pd.read_csv("./SellerCategoryUniqueGroups.csv").sort_values(by='SellerGroupId')
SellerSimularityScores = pd.read_csv("./SellerCategoryCosineSimularity.csv").sort_values(by='SellerGroupId')
SellerCatDict = {'Groups':SellerCategoryGroups, 
                 'DropCols':['size'],
                 'CategoryColName':'product_category_name_english',
                 'CatIdentifier':'seller_id',
                 'Simularity': SellerSimularityScores,
                 'RFMCol':'Seller_OverallRFMScore'}
CategoryGroups['SellerGroupId']=SellerCatDict

#Add Buyer-Category Groups
CustomerCategoryGroups = pd.read_csv("./BuyerCategoryUniqueGroups.csv").sort_values(by='CustomerGroupId')
CustomerSimularityScores = pd.read_csv("./BuyerCategoryCosineSimularity.csv").sort_values(by='CustomerGroupId')
CustomerCatDict = {'Groups':CustomerCategoryGroups, 
                 'DropCols':['size'],
                 'CategoryColName':'product_category_name_english',
                 'CatIdentifier':'customer_unique_id',
                 'Simularity': CustomerSimularityScores,
                 'RFMCol':'Buyer_OverallRFMScore'}
CategoryGroups['CustomerGroupId']=CustomerCatDict

#Add Customer-Product Groups
CustomerProductGroups = pd.read_csv("./Product_Customer_UniqueGroups.csv").sort_values(by='CustomerProductGroupId')
CustomerProductSimularityScores = pd.read_csv("./Product_Customer_CosineSimularity.csv").sort_values(by='CustomerProductGroupId')
ProductSimDict = {'Groups':CustomerProductGroups, 
                    'DropCols':['size'],
                    'CategoryColName':'product_id',
                    'CatIdentifier':'customer_unique_id',
                    'Simularity': CustomerProductSimularityScores,
                    'RFMCol':'Buyer_OverallRFMScore'}
CategoryGroups['CustomerProductGroupId']=ProductSimDict

<div class="alert alert-block alert-success">
    <h3><b>Generate Recommendations</b></h3>
</div>

#### Find common Customer Ids

In [9]:
OL_Data_Simp.customer_unique_id.value_counts().head(5)

8d50f5eadf50201ccdcedfb9e2ac8455    16
3e43e6105506432c953e165fb2acf44c    13
1b6c7548a2a1f9037c1fd3ddfed95f33     9
ca77025e7201e3b30c44b472ff346268     9
c8ed31310fc440a3f8031b177f9842c3     8
Name: customer_unique_id, dtype: int64

#### Purchases for an Example Customer

In [10]:
exampleCustomer = 'ca77025e7201e3b30c44b472ff346268'
OL_Data_Simp[OL_Data_Simp.customer_unique_id == exampleCustomer].product_category_name_english.value_counts()


Furniture_Home       4
HomeAppliance        2
CoolStuff            2
Furniture_Bedroom    1
Name: product_category_name_english, dtype: int64

### Recommendations

In [11]:
#Product Ids with Category
prodCategories = OL_Data_Simp[['product_id','product_category_name_english']].drop_duplicates()

#Standard
recs = getRecommendataionsForCustomer(OL_Data_Simp, exampleCustomer, CategoryGroups, 0.8)
recs = recs.sort_values(by='Total', ascending=False)
recs = recs.merge(prodCategories, how='left', on='product_id')

#With RFM Consideration
recs2 = getRecommendataionsForCustomer(OL_Data_Simp, exampleCustomer, CategoryGroups, 0.8, True)
recs2 = recs2.sort_values(by='Total', ascending=False)
recs2 = recs2.merge(prodCategories, how='left', on='product_id')

In [12]:
recs.head(100).product_category_name_english.value_counts()

Furniture_Home          79
HomeAppliance           11
CoolStuff                2
Furniture_Bedroom        1
Sports                   1
Furniture_LivingRoom     1
Name: product_category_name_english, dtype: int64

In [13]:
recs2.head(100).product_category_name_english.value_counts()

Furniture_Home       74
HomeAppliance         8
Toys                  5
Furniture_Bedroom     3
CoolStuff             3
MarketPlace           2
Baby                  2
Books                 1
Name: product_category_name_english, dtype: int64

<div class="span5 alert alert-info">
<h3>Update Data Based on New Orders</h3>
</div>

In [14]:
#getRecommendataionsFromCategoryGroupDict
#  Take a Category DataFrame (columns represent categories of the Orders, and each row is a Group that sums 
#  to 100% with this percentage spread out among the columns and every row is unique) and a simularity matrix
#  between the Groups of the Category DataFrame, and update them based on new orders 
#
#  This function will update the categories on a customer by customer basis, needing the column to identify
#  unique customers and the old orders the original Category DataFrame and Simularity Matrix was made from
#  to update them.  This is not the most effiecient method, as customers perchase percentages are most likely 
#  repeated, but it does the job.  Improving the performance is left for future work
#
#  categoryGroupsDF - Category DataFrame as described above, columns are categories, rows are Groups
#  categoryGroupsDF_DropCols - Columns in categoryGroupsDF to ignore
#  categoryGroupsDF_CatColumnName - the column name in the orders that contains the categories in the columns
#                                   of categoryGroupsDF
#  categoryGroupsDF_groupIdName - the column name of the unique Group (row) identifier in categoryGroupsDF
#  simularityScores - Matrix representing the simularity between every Group in the Category Groups DF Rows
#  uniqueCustIdCol - name of the column that contains the unique customer identifier
#  oldOrders - Orders the original Category Groups DataFrame and Simularity Matrix were made from
#  newOrders - Orders to add/modify the Category Groups DataFrame and Simularity Matrix
def UpdateCategoryGroupsForNewSales(categoryGroupsDF,
                                    categoryGroupsDF_CatColumnName,
                                    categoryGroupsDF_DropCols,
                                    categoryGroupsDF_groupIdName,
                                    
                                    simularityScores,
                                    uniqueCustIdCol,
                                    oldOrders,
                                    newOrders):
    
    #Gather all orders
    allOrders = pd.concat([oldOrders, newOrders])
    #Boolean to represent whether new Groups have been made
    newGroups = False
    
    #Parameters used for Tracking Progress
    ind = 0
    lengthOfNewUsers = len(newOrders[uniqueCustIdCol].unique())
    print("Updating orders of {0} users".format(lengthOfNewUsers))
    
    #Update Data Structures on a user by user basis
    for custId in newOrders[uniqueCustIdCol].unique():
        #Print update
        if(lengthOfNewUsers>100 and ind%(lengthOfNewUsers//10) ==0):
                print("{0}% ".format(ind/lengthOfNewUsers*100))
        ind= ind+1
        
        #Get old Category Group
        oldCategory = oldOrders[oldOrders[uniqueCustIdCol]==custId][categoryGroupsDF_groupIdName]
        
        #Index Category by Group Id
        catGroupsDF_Indexed = categoryGroupsDF.set_index(categoryGroupsDF_groupIdName).sort_values(by=categoryGroupsDF_groupIdName)
        if(len(oldCategory) != 0):
            #If Old Category exists, decrease count by one
            #This will be replaced if new orders do not alter the percetage distribution of orders
            oldCategory = oldCategory[0]
            catGroupsDF_Indexed.loc[oldCategory,'size'] = catGroupsDF_Indexed.loc[oldCategory,'size'] - 1

        #Remove ignored columns from CategoryGroupsDF
        catGroupsDF_Dropped = catGroupsDF_Indexed.drop(columns=categoryGroupsDF_DropCols)

        #Get all categories in the Group
        all_categories = list(catGroupsDF_Dropped.columns)
        all_categories = [str(x) for x in all_categories]        

        #Make Customer Row From Old Orders
        thiscustOldSubset = oldOrders[oldOrders[uniqueCustIdCol] == custId]
        thisCustOld = np.array([len(thiscustOldSubset[thiscustOldSubset[categoryGroupsDF_CatColumnName] == x]) for x in all_categories])
        #And Customer Row From New Orders
        thiscustNewSubset = newOrders[newOrders[uniqueCustIdCol] == custId]
        thisCustNew = np.array([len(thiscustNewSubset[thiscustNewSubset[categoryGroupsDF_CatColumnName] == x]) for x in all_categories])
        
        #Normalize Customer Row to 100%
        custTotal = thisCustOld+thisCustNew
        custTotalNorm = custTotal/sum(custTotal)*100/5.0
        custTotalNorm = np.floor(custTotalNorm)
        custTotalNorm = custTotalNorm * 5.0
        custTotalNorm.shape=(1,len(custTotalNorm))
        
        #Create Customer DataFrame
        custTotalNormDF = pd.DataFrame(data=custTotalNorm, columns=all_categories)    
        #Get similarity between Customer DataFrame and Existing Groups
        simScores = np.array(cosine_similarity(catGroupsDF_Dropped,custTotalNormDF))
        
        if(max(simScores)>=.9999):
            #Simularity is near enough to an existing Group
            
            #Create and Sort Simularity Scores
            simDF = pd.DataFrame(simScores, columns=['Simularity'])
            simDF[categoryGroupsDF_groupIdName] = catGroupsDF_Dropped.reset_index()[categoryGroupsDF_groupIdName]
            simDF = simDF[simDF.Simularity == max(simDF.Simularity)]
            
            #Find New Category for this Customer, and add 1 to the size of that group
            newCat = simDF.head(1)[categoryGroupsDF_groupIdName]
            catGroupsDF_Indexed.loc[newCat,'size'] = catGroupsDF_Indexed.loc[newCat,'size'] + 1
            
            #Reset CategroyGroup Index and set all the customer's orders to new Category
            categoryGroupsDF = catGroupsDF_Indexed.reset_index()
            allOrders.loc[allOrders[uniqueCustIdCol]==custId,categoryGroupsDF_groupIdName] = newCat
        else:
            #Matching Group Does not Exist, Create a new one
            newGroups = True
            #New Category Numer
            newCat = max(categoryGroupsDF[categoryGroupsDF_groupIdName])+1
            
            #Add Category Number and size to Customer Row DF
            custTotalNormDF[categoryGroupsDF_groupIdName] = newCat
            custTotalNormDF['size'] = 1
            #Add to Category Groups
            categoryGroupsDF = pd.concat([categoryGroupsDF,custTotalNormDF])
            
            #Add simularity Scores to Simularity Matrix
            simCols = simularityScores.set_index(categoryGroupsDF_groupIdName).columns
            length = len(simScores)
            simScores.shape = (1, length)
            
            #New Group Simularity DF Row
            custSimScoresDF = pd.DataFrame(data=simScores, columns=simCols)
            custSimScoresDF[categoryGroupsDF_groupIdName] = newCat
            custSimScoresDF[str(newCat)] = 1.0
            
            #Add new column for new scores
            simScores.shape = (length,1)
            simularityScores[str(newCat)] = simScores
            #Add New Group Row to Sim Scores
            simularityScores = pd.concat([simularityScores,custSimScoresDF])
            
            #Update all customer orders
            allOrders.loc[allOrders[uniqueCustIdCol]==custId,categoryGroupsDF_groupIdName] = newCat
        
    
    if(all(categoryGroupsDF['size'] != 0) and not newGroups):
        #If there are no groups added or removed, no need to go any further
        return (allOrders, categoryGroupsDF, simularityScores)
    
    #Set Indexes
    catGroupsDF_Indexed = categoryGroupsDF.set_index(categoryGroupsDF_groupIdName)
    simScore_Indexed = simularityScores.set_index(categoryGroupsDF_groupIdName)
    
    if(any(categoryGroupsDF['size'] <= 0)):
        #If some groups are now empty, remove them from Category Groups and Simularity Scores
        groupsToDrop = categoryGroupsDF[categoryGroupsDF.size == 0][categoryGroupsDF_groupIdName]
    
        for group in groupsToDrop:
            catGroupsDF_Indexed = catGroupsDF_Indexed.drop(group)
            simScore_Indexed = simScore_Indexed.drop(group)
            simScore_Indexed = simScore_Indexed.drop(columns=group)
    
    #Reset Indexes
    categoryGroupsDF = catGroupsDF_Indexed.reset_index()
    simularityScores = simScore_Indexed.reset_index()
    
    #Renumber all Groups, so that they are of a continuous range
    #creating dictionary from old group number to new
    groupsIndOld = categoryGroupsDF[categoryGroupsDF_groupIdName]
    groupsIndNew = list(range(len(groupsIndOld)))
    changeDict = {groupsIndOld[i] : groupsIndNew[i] for i in range(len(groupsIndOld))}
    
    #Update Orders to new Group Numbers
    allOrders.replace({categoryGroupsDF_groupIdName:changeDict})
    #Update Category Groups to new Group Numbers
    categoryGroupsDF[categoryGroupsDF_groupIdName] = groupsIndNew
    categoryGroupsDF = categoryGroupsDF.rename(columns=changeDict)
    #Update Simularity Scores to new Group Numbers
    simularityScores[categoryGroupsDF_groupIdName] = groupsIndNew
    simularityScores = simularityScores.rename(columns=changeDict)
    
    return (allOrders, categoryGroupsDF, simularityScores)
    

<div class="alert alert-block alert-success">
    <h3><b>Example Use Case</b></h3>
</div>

In [15]:
#Create Subset of groups, representing the first 5 Customer Groups
custGroups = range(0,5)
CustomerCategoryGroupsSub = CustomerCategoryGroups[CustomerCategoryGroups.CustomerGroupId.isin(custGroups)]
#Subset Of Simularity Scores
CustomerSimularityScoresSub = CustomerSimularityScores[CustomerSimularityScores.CustomerGroupId.isin(custGroups)]
columnsSubset = ['CustomerGroupId'] + [str(x) for x in custGroups]
CustomerSimularityScoresSub = CustomerSimularityScoresSub[columnsSubset]
#Subset of Orders
OrdersSub = OL_Data_Simp[OL_Data_Simp.CustomerGroupId.isin(custGroups)]

#New Orders Subset
newCustGrous = range(100,105)
newOrders = OL_Data_Simp[OL_Data_Simp.CustomerGroupId.isin(newCustGrous)]

In [16]:
#Old Category Groups
CustomerCategoryGroupsSub.loc[:, CustomerCategoryGroupsSub.any()]

Unnamed: 0,CustomerGroupId,Beauty,Furniture_Bedroom,Gardening,Office,Technology,size
306,0,0,100,0,0,0,8691
105,1,100,0,0,0,0,11332
495,2,0,0,0,100,0,3388
522,3,0,0,0,0,100,12988
414,4,0,0,100,0,0,3459


In [17]:
#Old Simularity Scores
CustomerSimularityScoresSub

Unnamed: 0,CustomerGroupId,0,1,2,3,4
0,0,1.0,0.0,0.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0,0.0
2,2,0.0,0.0,1.0,0.0,0.0
3,3,0.0,0.0,0.0,1.0,0.0
4,4,0.0,0.0,0.0,0.0,1.0


#### Update Based on New Orders

In [19]:
CategoryGroup = CategoryGroups['CustomerGroupId']
(orders, cat, sim) = UpdateCategoryGroupsForNewSales(CustomerCategoryGroupsSub,
                                                        CategoryGroup['CategoryColName'],
                                                        CategoryGroup['DropCols'],
                                                        'CustomerGroupId',
                                                        CustomerSimularityScoresSub,
                                                        'customer_unique_id',
                                                        OrdersSub,
                                                        newOrders)

Updating orders of 42 users


In [20]:
#New Categories
cat.loc[:,cat.any()]

Unnamed: 0,CustomerGroupId,Beauty,Clothing,CoolStuff,Furniture_Bedroom,Furniture_Home,Furniture_LivingRoom,Gardening,Office,Technology,Toys,Video Games,size
0,0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8691
1,1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11332
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,3388
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,12988
4,4,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,3459
5,5,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,10
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,0.0,11
7,7,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,12
8,8,0.0,0.0,0.0,0.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,8
9,9,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.0,1


In [21]:
#New Simularity
sim

Unnamed: 0,CustomerGroupId,0,1,2,3,4,5,6,7,8,9
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,1.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0
2,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,1.0,0.0,0.0,0.707107,0.707107,0.0,0.0
4,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0,0.707107,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0
6,6,0.0,0.0,0.0,0.707107,0.0,0.5,1.0,0.5,0.0,0.0
7,7,0.0,0.0,0.0,0.707107,0.0,0.0,0.5,1.0,0.0,0.0
8,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
