In [52]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import warnings
import xgboost as xgb
warnings.filterwarnings("ignore")
%matplotlib inline

# Create Recommender

In [58]:
def get_split_predictions():
    """Predict purchases with one XGBoost model per cluster, returned in user order.

    Extracts ``outputs/xgb_models_and_data.zip`` into the current working
    directory. For each cluster ``i`` it loads ``outputs/xgb_model<i>.bin`` and
    scores ``outputs/xgb_data<i>.buffer``; the resulting predictions are
    assigned, in order, to the rows of
    ``outputs/user_multi_cluster_assignments.csv`` whose ``cluster_num`` is ``i``.

    Returns:
        numpy.ndarray: one prediction per row of the assignments CSV, in the
        CSV's row order. Rows whose cluster has no model are NaN.

    Raises:
        ValueError: if the archive yields no model/data pair, or if a cluster's
            prediction count differs from its number of rows.
        Errors from zipfile / pandas / xgboost (e.g. missing files) propagate.
    """
    with ZipFile('outputs/xgb_models_and_data.zip', 'r') as archive:
        file_count = len(archive.infolist())
        archive.extractall()
    print("Found buffer zip file")
    print("Number of files", file_count)

    user_clusterdf = pd.read_csv("outputs/user_multi_cluster_assignments.csv")

    # The archive is assumed to hold exactly one model file and one data buffer
    # per cluster, numbered from 0 — TODO confirm against the code that writes it.
    n_models = file_count // 2

    per_cluster = []
    for i in range(n_models):
        bst = xgb.Booster({'nthread': 4})  # init model
        bst.load_model('outputs/xgb_model%d.bin' % i)  # load model
        dmodel = xgb.DMatrix('outputs/xgb_data%d.buffer' % i)  # load model data
        # Keep each prediction attached to the original row label instead of
        # writing a new column onto a filtered slice of the frame.
        segment_index = user_clusterdf.index[user_clusterdf['cluster_num'] == i]
        per_cluster.append(pd.Series(bst.predict(dmodel), index=segment_index))

    if not per_cluster:
        raise ValueError("No per-cluster model/data pairs found in outputs/xgb_models_and_data.zip")

    # One concat + reindex restores the CSV's row order; rows without a model become NaN.
    return pd.concat(per_cluster).reindex(user_clusterdf.index).to_numpy()
    
def get_predictions():
    """Score the combined data buffer with the single saved XGBoost model.

    Unpacks ``outputs/xgb_model_and_data.zip`` into the current working
    directory, loads ``outputs/xgb_model.bin`` and returns that model's
    predictions for ``outputs/xgb_data.buffer``.
    """
    archive_path = 'outputs/xgb_model_and_data.zip'
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall()
    print("Found buffer zip file")

    # Booster first, then its data, so library log output keeps the same order.
    booster = xgb.Booster({'nthread': 4})
    booster.load_model('outputs/xgb_model.bin')
    features = xgb.DMatrix('outputs/xgb_data.buffer')

    predictions = booster.predict(features)
    return predictions
    
def _category_purchase_totals(cluster, category_columns=('p_cat_1', 'p_cat_2', 'p_cat_3')):
    """Sum ``pur_pred`` per product category over the given category columns.

    Returns a dict mapping category -> summed ``pur_pred``. Keys start with the
    sorted categories of the first column, followed by any category that only
    occurs in a later column.
    """
    # Seed from the first column so the output row order is stable. A 0 in the
    # first column stays as a key with total 0 because 0 is never summed below.
    seed_cats = cluster[category_columns[0]].dropna().unique()
    seed_cats.sort()
    totals = {cat: 0 for cat in seed_cats}

    for column in category_columns:
        present = cluster[column].dropna().unique()
        # 0 is skipped — presumably it marks "no category"; confirm against the data prep.
        present = present[present != 0]
        present.sort()
        for prod_cat in present:
            column_total = cluster[cluster[column] == prod_cat]['pur_pred'].sum()
            # .get() instead of +=: a category may occur only in a later column,
            # in which case it has no seeded entry yet.
            totals[prod_cat] = totals.get(prod_cat, 0) + column_total
    return totals


def create_cluster_pred_pur(split_models = False):
    """Write per-cluster predicted purchase totals by product category to CSV.

    Obtains one purchase prediction per user (from the per-cluster models when
    ``split_models`` is True, otherwise from the single combined model), joins
    them positionally onto the matching cluster-assignment CSV, and for every
    cluster writes a two-column CSV (``p_cat``, ``total_pur_pred``):

    * ``outputs/split_cluster_<n>_pred_pur.csv`` when ``split_models`` is True
    * ``outputs/cluster_<n>_pred_pur.csv`` otherwise

    Args:
        split_models: choose the per-cluster models instead of the combined one.

    Returns:
        int: number of distinct clusters written, or 0 if the predictions or
        the assignment CSV could not be loaded.
    """
    if split_models:
        assignments_csv = "outputs/user_multi_cluster_assignments.csv"
        failure_msg = "Failed to find models and data zip file."
        output_pattern = "outputs/split_cluster_%d_pred_pur.csv"
    else:
        assignments_csv = "outputs/user_cluster_assignments.csv"
        failure_msg = "Failed to find model and data zip file."
        output_pattern = "outputs/cluster_%d_pred_pur.csv"

    try:
        predictions = get_split_predictions() if split_models else get_predictions()
        user_clusterdf = pd.read_csv(assignments_csv)
    except Exception as exc:
        # Still best-effort (returns 0), but no longer traps KeyboardInterrupt /
        # SystemExit, and reports the real cause next to the generic message.
        print(failure_msg)
        print("Reason: %s: %s" % (type(exc).__name__, exc))
        return 0

    user_pred_clusterdf = user_clusterdf.join(pd.DataFrame(data = predictions, columns = ['pur_pred']))

    unique_clusters = user_pred_clusterdf['cluster_num'].unique()
    unique_clusters.sort()

    for cluster_num in unique_clusters:
        cluster = user_pred_clusterdf[user_pred_clusterdf['cluster_num'] == cluster_num]
        pur_sums = _category_purchase_totals(cluster)

        cluster_pur = pd.DataFrame(list(pur_sums.items()), columns = ['p_cat', 'total_pur_pred'])
        cluster_pur.to_csv(output_pattern % cluster_num, encoding='utf-8', index = False)
        print("Wrote Cluster", cluster_num, "Predicted Purchase Information")
    return len(unique_clusters)

In [50]:
# Reading top k product categories for each cluster
def top_k_cats(top_k = 18, split_models = False, n_clusters = None):
    """Print, per cluster, the product categories with the largest predicted purchase totals.

    Args:
        top_k: how many categories to list per cluster; clamped to 0..18.
        split_models: when True read ``outputs/split_cluster_<n>_pred_pur.csv``
            (the files ``create_cluster_pred_pur(split_models=True)`` writes)
            instead of ``outputs/cluster_<n>_pred_pur.csv``.
        n_clusters: number of clusters to report (files 0..n_clusters-1).
            Defaults to the notebook-level ``num_clusters`` variable.

    Returns:
        None. One line per cluster is printed.
    """
    top_k = min(max(top_k, 0), 18)
    if n_clusters is None:
        # Fall back to the notebook global so existing calls keep working.
        n_clusters = num_clusters
    if split_models:
        csv_pattern = "outputs/split_cluster_%d_pred_pur.csv"
    else:
        csv_pattern = "outputs/cluster_%d_pred_pur.csv"

    for x in range(n_clusters):
        cluster_pur = pd.read_csv(csv_pattern % x)
        best_cats = cluster_pur.nlargest(top_k, 'total_pur_pred')['p_cat'].values
        print("Top %d product categories for cluster %d: %s" % (top_k, x, " ".join(str(cat) for cat in best_cats)))

## Recommendations from Combined Model

In [59]:
# Combined model: writes one outputs/cluster_<n>_pred_pur.csv per cluster.
# The return value is the number of clusters written (0 if loading failed);
# top_k_cats reads this notebook-level variable.
num_clusters = create_cluster_pred_pur()

Found buffer zip file
[07:28:30] 537577x69 matrix with 37092813 entries loaded from outputs/xgb_data.buffer
Wrote Cluster 0 Predicted Purchase Information
Wrote Cluster 1 Predicted Purchase Information
Wrote Cluster 2 Predicted Purchase Information


In [60]:
# Number of top-ranked product categories to print per cluster
# (top_k_cats clamps the value to 0..18).
k = 10
top_k_cats(k)

Top 10 product categories for cluster 0: 1 8 5 2 16 15 14 6 17 4
Top 10 product categories for cluster 1: 1 8 5 16 2 15 14 6 17 4
Top 10 product categories for cluster 2: 1 8 5 16 2 15 14 6 4 17


## Recommendations from Split Model

In [61]:
# Per-cluster models: writes one outputs/split_cluster_<n>_pred_pur.csv per cluster
# and rebinds num_clusters to the number of clusters written (0 if loading failed).
num_clusters = create_cluster_pred_pur(split_models = True)

Found buffer zip file
Number of files 6
[07:28:36] 24790x80 matrix with 1983200 entries loaded from outputs/xgb_data0.buffer
[07:28:36] 215832x80 matrix with 17266560 entries loaded from outputs/xgb_data1.buffer
[07:28:37] 296955x80 matrix with 23756400 entries loaded from outputs/xgb_data2.buffer
Wrote Cluster 0 Predicted Purchase Information
Wrote Cluster 1 Predicted Purchase Information
Wrote Cluster 2 Predicted Purchase Information


In [62]:
# NOTE(review): as called here, top_k_cats reads outputs/cluster_<n>_pred_pur.csv,
# whereas the split-model run above wrote outputs/split_cluster_<n>_pred_pur.csv.
# The ranking printed below therefore comes from the combined-model files, which
# is why it matches the combined-model section exactly.
k = 10
top_k_cats(k)

Top 10 product categories for cluster 0: 1 8 5 2 16 15 14 6 17 4
Top 10 product categories for cluster 1: 1 8 5 16 2 15 14 6 17 4
Top 10 product categories for cluster 2: 1 8 5 16 2 15 14 6 4 17
