# Customer Segmentation by Dongmei Yin

In [18]:
import pandas as pd
import numpy as np
import pickle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def _build_user_aisle_matrix(filepath):
    """Build the user x grouped-aisle feature matrix from the CSVs in *filepath*.

    Reads ``sample_order_products.csv``, ``sample_order.csv``,
    ``products.csv`` and ``final_aisle_match.csv`` from the folder and returns
    a DataFrame indexed by ``user_id`` with one column per ``new_aisle_ID``.
    Each cell is the user's product count in that grouped aisle divided by the
    (summed) order count; combinations that never occur are filled with 0.
    """
    # Part 1: read in raw data from the provided folder.
    order_products = pd.read_csv(filepath + '/sample_order_products.csv')
    orders = pd.read_csv(filepath + '/sample_order.csv')
    products = pd.read_csv(filepath + '/products.csv')
    aisle_match = pd.read_csv(filepath + '/final_aisle_match.csv')

    # Part 2: data preprocessing.

    # Only the id columns are needed from the two order tables.
    order_products_id = order_products[['order_id', 'product_id']]
    order_user_id = orders[['order_id', 'user_id']]

    # Total order count for each user.
    user_order_count = (
        order_user_id.groupby('user_id')['order_id']
        .count()
        .reset_index(name='order_count')
    )

    # Join ordered products with the products table to learn the aisle of
    # every product in every order.
    order_product_aisle = order_products_id.merge(products, on='product_id')

    # Number of products bought from each aisle within each order.
    # ``size()`` counts rows per group directly, so there is no need to add a
    # duplicate column to a slice (which raised SettingWithCopyWarning).
    order_aisle_count = (
        order_product_aisle.groupby(['order_id', 'aisle_id'])
        .size()
        .reset_index(name='product_count')
    )

    # Attach the owning user to each (order, aisle) count, then total the
    # product counts per (user, aisle).
    order_user_aisle = order_aisle_count.merge(order_user_id, on='order_id')
    user_aisle_count = (
        order_user_aisle.groupby(['user_id', 'aisle_id'])['product_count']
        .sum()
        .reset_index()
    )

    # Right join keeps every user that has an order count; users without any
    # aisle rows are dropped again by the inner join on aisle_id below.
    user_aisle_order_count = user_aisle_count.merge(
        user_order_count, how='right', on='user_id'
    )

    # Map raw aisles onto the final aisle grouping; the inner join also
    # filters out aisles that are absent from the mapping table.
    grouped = user_aisle_order_count.merge(aisle_match, how='inner', on='aisle_id')
    # NOTE(review): order_count is per user, so summing it here multiplies it
    # by the number of raw aisles folded into each new_aisle_ID. Kept as-is
    # because the saved model was presumably trained on features built this
    # way -- confirm before changing.
    grouped = (
        grouped.groupby(['user_id', 'new_aisle_ID'])[['product_count', 'order_count']]
        .sum()
        .reset_index()
    )

    # Products per order for each (user, grouped aisle).
    grouped['avg'] = grouped['product_count'] / grouped['order_count']

    # Wide format: one row per user, one column per grouped aisle.
    # The string alias 'sum' replaces np.sum, which newer pandas deprecates
    # as an aggfunc; rows are already unique per cell so the value is the same.
    return pd.pivot_table(
        grouped,
        values='avg',
        index=['user_id'],
        columns=['new_aisle_ID'],
        aggfunc='sum',
        fill_value=0,
    )


def customerSegmentation(filepath, model_path='customer_segmentation_model.sav'):
    """Assign a cluster label to every user found in the sample data folder.

    Parameters
    ----------
    filepath : str
        Folder containing ``sample_order_products.csv``, ``sample_order.csv``,
        ``products.csv`` and ``final_aisle_match.csv``.
    model_path : str, optional
        Path of the pickled clustering model. Defaults to
        ``'customer_segmentation_model.sav'``, resolved against the current
        working directory (not against *filepath*).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``user_id`` and ``label``, one row per user.

    Raises
    ------
    FileNotFoundError
        If one of the CSV files or the model file cannot be opened.
    """
    data = _build_user_aisle_matrix(filepath)

    # Model prediction.
    # Reduce the feature matrix to 34 principal components (the original
    # author notes this takes the data from 69 dimensions down to 34).
    # NOTE(review): the PCA is fitted on the incoming data on every call
    # rather than reusing a projection fitted at training time, so the
    # components may not line up with what the saved model expects -- confirm.
    pca = PCA(n_components=34)
    reduced = pd.DataFrame(pca.fit_transform(data))

    # Load the model from disk; the context manager closes the file handle.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    labels = loaded_model.predict(reduced)

    # Rows of `reduced` are in the same order as `data`, so the user ids in
    # the index pair up positionally with the predicted labels.
    return pd.DataFrame({'user_id': data.index.to_numpy(), 'label': labels})



# Code to run the whole system

In [19]:
# Folder that holds the sample data CSV files.
fp = 'C:/Users/lyn02/Desktop/CMPE255 Project/Sample Data'
# Run the segmentation; the result has two columns (user_id, label).
customercluster = customerSegmentation(fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_aisle_id['aisle_id2'] = order_product_aisle['aisle_id']


In [20]:
# Display the resulting table of cluster labels per user.
customercluster

Unnamed: 0,user_id,label
0,21,1
1,28,1
2,36,1
3,54,1
4,63,1
...,...,...
245,203750,1
246,204090,1
247,204236,1
248,205273,1
