In [1]:
# -------------------------------------------------------------------------------
# Read POS data
# Read Configuration data
# -------------------------------------------------------------------------------
# Compute Frequency and Monetary values for each Customer
# Compute Normalised Frequency and Monetary values for each Customer
# Read Customer Segments and their corresponding Frequency, Monetary ranges from configuration file
# Map the Customer Segments to the Customers based on these F and M ranges
# For ease of use, all values of F and M ranges can be specified as a Normalised value between 0 and 100
# Auto Segmentation of Customers
# Auto segmentation applied on Normalised F and M values of Customers to group them into Customer segments
# -------------------------------------------------------------------------------
# Import libraries
import csv
import pandas as pd
import random as r
import sklearn
from sklearn.cluster import KMeans


In [2]:
#---------------
# Function to Read Customer segments data from Configuration file
#  CustomerSegment|SegmentAutoRank|RFrom|RTo|FFrom|FTo|MFrom|MTo
#  Platinum|1|40|50|40|50|40|50
#  Loyal Gold|2|20|50|30|50|30|50

def read_custsegments_config(in_file_obj):
    """Load the pipe-delimited customer-segment configuration.

    Expected header:
    ``CustomerSegment|SegmentAutoRank|RFrom|RTo|FFrom|FTo|MFrom|MTo``

    Parameters
    ----------
    in_file_obj : str or file-like
        Path or open handle that ``pandas.read_csv`` can read.

    Returns
    -------
    pandas.DataFrame
        All columns are read as text (``dtype=str``); rows are ordered by the
        numeric value of ``SegmentAutoRank``.
    """
    df = pd.read_csv(in_file_obj, header='infer', delimiter='|', skip_blank_lines=True, parse_dates=True, dtype=str)
    # The ranks are text, so sorting the column directly would be
    # lexicographic ("10" before "2").  Order by the numeric value instead;
    # ranks that cannot be parsed become NaN and therefore sort last.
    rank_values = pd.to_numeric(df['SegmentAutoRank'], errors='coerce')
    df = df.loc[rank_values.sort_values(kind='mergesort').index]
    return df

#---------------
# Compute Customer statistics like Frequency, Monetary value or Total spend by each Customer
#    for further analysis
def customer_stats(df_pos):
    """Aggregate POS line items into one row of statistics per customer.

    Parameters
    ----------
    df_pos : pandas.DataFrame
        Line-item data with at least the columns ``CustomerID``,
        ``InvoiceNo``, ``InvoiceDate``, ``Quantity`` and ``UnitPrice``.

    Returns
    -------
    pandas.DataFrame
        One row per ``CustomerID`` with flattened column names:
        ``LineValuemin``, ``LineValuemax``, ``LineValuesum`` (monetary value),
        ``InvoiceNocount`` (number of retained line items, used as the
        frequency), ``InvoiceDatemin``, ``InvoiceDatemax``,
        ``InvoiceDatefirst`` and ``InvoiceDatenunique``.
    """
    # Keep only lines with a positive unit price.
    priced = df_pos[df_pos["UnitPrice"] > 0]
    # Value of each line = quantity * unit price.
    line_value = priced["Quantity"].astype(float) * priced["UnitPrice"].astype(float)
    priced = priced.assign(LineValue=line_value)
    # Drop lines whose value is not positive (e.g. negative quantities).
    priced = priced[priced["LineValue"] > 0]
    # Aggregations are named by string: passing the builtins min/max/sum is
    # deprecated in recent pandas, while the string names yield the same
    # result and the same column labels.
    grouped_df = priced.groupby('CustomerID', as_index=False).agg({
        'LineValue': ['min', 'max', 'sum'],
        'InvoiceNo': ['count'],
        'InvoiceDate': ['min', 'max', 'first', 'nunique'],
    })
    # Flatten the two-level column labels, e.g. ('LineValue', 'sum') ->
    # 'LineValuesum' and ('CustomerID', '') -> 'CustomerID'.
    grouped_df.columns = ["".join(label) for label in grouped_df.columns]
    return grouped_df

#---------------
# Normalize functions
# Used for normalising the absolute F and M values into a range of 0 to 100
# (the smallest value maps to 0 and the largest to 100).
# This common 0 to 100 scale helps users read the F and M values intuitively.

def fxnorm(x, mina, maxa):
    """Rescale ``x`` linearly so that ``mina`` maps to 0 and ``maxa`` to 100.

    Parameters
    ----------
    x : number
        Value to normalise.
    mina, maxa : number or numeric string
        Lower and upper reference values; both are converted with ``float``.

    Returns
    -------
    float
        ``(x - mina) / (maxa - mina) * 100``.  When ``mina`` equals ``maxa``
        the range has no width and ``0.0`` is returned instead of dividing by
        zero.
    """
    lower = float(mina)
    span = float(maxa) - lower
    if span == 0:
        # Every value is identical (e.g. a single customer): there is no
        # spread to scale against, so place the value at the bottom.
        return 0.0
    return ((x - lower) / span) * 100

def flnorm(l, minx, maxx):
    """Normalise every element of ``l`` with ``fxnorm``.

    Parameters
    ----------
    l : iterable of numbers
        Values to normalise.
    minx, maxx : number
        Reference bounds passed unchanged to ``fxnorm`` for every element.

    Returns
    -------
    list
        Normalised values, in the same order as ``l``.
    """
    scaled = []
    for value in l:
        scaled.append(fxnorm(value, minx, maxx))
    return scaled

# flnorm(x, min(x), max(x))
#---------------
# Compute Customer Normals
def customer_normals(df_custstats):
    """Build the normalised Monetary / Frequency table used for clustering.

    Parameters
    ----------
    df_custstats : pandas.DataFrame
        Per-customer statistics with the columns ``LineValuesum`` and
        ``InvoiceNocount``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``LineValuesumNorm`` and ``InvoiceNocountNorm``, each
        rescaled with ``flnorm`` against that column's own minimum and
        maximum.  Rows follow the order of ``df_custstats`` and carry a fresh
        default index.
    """
    # Monetary value: total spend per customer.
    monetary = df_custstats.LineValuesum
    monetary_norm = flnorm(monetary, min(monetary), max(monetary))
    # Frequency: number of line items per customer.
    frequency = df_custstats.InvoiceNocount
    frequency_norm = flnorm(frequency, min(frequency), max(frequency))

    normalised = df_custstats.assign(
        LineValuesumNorm=monetary_norm,
        InvoiceNocountNorm=frequency_norm,
    )
    return pd.DataFrame({
        'LineValuesumNorm': normalised['LineValuesumNorm'].values,
        'InvoiceNocountNorm': normalised['InvoiceNocountNorm'].values,
    })


#---------------
# To conduct experiments with Automatic Segmentation of Customers based on their 
#  Frequency and Monetary values
# Automatic Customer segmentation and Ranking using k-means algorithm
#  Segment count is picked up from the CustomerSegment configuration file
#  But the Normalised F and M value ranges defined in the configuration file are ignored in Auto Ranking
    
def auto_cust_ranks(df_custanalyze, df_custstats, in_clustcount=2, random_state=None):
    """Cluster customers with k-means on their normalised F and M values.

    Parameters
    ----------
    df_custanalyze : pandas.DataFrame
        Normalised values with the columns ``InvoiceNocountNorm`` and
        ``LineValuesumNorm``; every column is used as a clustering feature.
    df_custstats : pandas.DataFrame
        Per-customer statistics with ``CustomerID``, ``InvoiceNocount`` and
        ``LineValuesum``; rows must be in the same order as
        ``df_custanalyze``.
    in_clustcount : int, default 2
        Number of clusters to form.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``.  Pass an integer to get the same
        cluster assignment on every run; ``None`` keeps the unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerGroup`` (cluster label), ``CustomerID``, ``F``,
        ``M``, ``FNorm`` and ``MNorm``, sorted by ``CustomerGroup``,
        ``MNorm`` and ``FNorm``.
    """
    from sklearn.cluster import KMeans

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values gives
    # the same 2-D array on old and new versions alike.
    features = df_custanalyze.values
    km = KMeans(n_clusters=in_clustcount, random_state=random_state)
    km.fit(features)
    # One cluster label per input row, in input order.
    labels = km.labels_
    print(df_custanalyze.head())
    # Attach the cluster label to the raw and normalised F/M values.
    df_CustGroups = pd.DataFrame({'CustomerGroup': labels,
                                  'CustomerID': df_custstats['CustomerID'].values,
                                  'F': df_custstats['InvoiceNocount'].values,
                                  'M': df_custstats['LineValuesum'].values,
                                  'FNorm': df_custanalyze['InvoiceNocountNorm'].values,
                                  'MNorm': df_custanalyze['LineValuesumNorm'].values})
    df_CustGroups = df_CustGroups.sort_values(by=['CustomerGroup', 'MNorm', 'FNorm'])
    return df_CustGroups

# Compute Customer Segment statistics
def segment_stats(df_custRanked):
    """Compute the mean F and M values of every customer group.

    Parameters
    ----------
    df_custRanked : pandas.DataFrame
        Customers with the columns ``CustomerGroup``, ``F``, ``FNorm``, ``M``
        and ``MNorm``.

    Returns
    -------
    pandas.DataFrame
        One row per ``CustomerGroup`` with the columns ``Fmean``,
        ``FNormmean``, ``Mmean`` and ``MNormmean``, ordered from the highest
        to the lowest ``MNormmean`` (ties broken by ``FNormmean``).

    The per-group table is printed before and after sorting.
    """
    # Mean of each raw and normalised metric per customer group.
    grouped_df = df_custRanked.groupby('CustomerGroup', as_index=False).agg({
        'F': ['mean'], 'FNorm': ['mean'],
        'M': ['mean'], 'MNorm': ['mean']})
    # Flatten the two-level labels, e.g. ('M', 'mean') -> 'Mmean'.  Iterating
    # the columns directly avoids the deprecated Index.ravel().
    grouped_df.columns = ["".join(label) for label in grouped_df.columns]
    print(grouped_df.head())
    # Best groups first: highest monetary mean, then highest frequency mean.
    grouped_df = grouped_df.sort_values(by=['MNormmean', 'FNormmean'], ascending=False)
    print(grouped_df)
    return grouped_df

# Assign Auto segment names
# df_segmentstats
# df_CustGroups = pd.DataFrame({'CustomerGroup':df_custRanked['CustomerGroup'].values, 'CustomerID':df_custstats['CustomerID'].values})
# df_CustGroups.head()
def auto_cust_segments(df_custRanked, df_segmentstats):
    """Label each customer with the segment name of its customer group.

    Parameters
    ----------
    df_custRanked : pandas.DataFrame
        Customers with a ``CustomerGroup`` column.
    df_segmentstats : pandas.DataFrame
        One row per group with the columns ``CustomerGroup`` and
        ``CustomerSegment``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_custRanked`` with a ``CustomerSegment`` column;
        customers whose group has no row in ``df_segmentstats`` keep the
        value ``"Unassigned"``.

    Each (segment, group) pair is printed as it is applied.
    """
    labelled = df_custRanked.assign(CustomerSegment="Unassigned")
    for _, segment in df_segmentstats.iterrows():
        print (segment.CustomerSegment, segment.CustomerGroup)
        in_group = labelled["CustomerGroup"] == segment.CustomerGroup
        labelled.loc[in_group, 'CustomerSegment'] = segment.CustomerSegment
    return labelled

# Manual mapping of Customers to Customer segments
#   Normalised F and M values defined in CustomerSegment configuration file used
#   to map Customers based on their Normalised F and M scores
def manual_cust_segments(df_custRanked, df_custseg):
    """Assign segments from the configured normalised F and M ranges.

    Parameters
    ----------
    df_custRanked : pandas.DataFrame
        Customers with the columns ``FNorm`` and ``MNorm``.  An existing
        ``CustomerSegment`` column is ignored.  The frame passed in is not
        modified.
    df_custseg : pandas.DataFrame
        Segment configuration with the columns ``CustomerSegment``,
        ``FFrom``, ``FTo``, ``MFrom`` and ``MTo``; bounds may be numbers or
        numeric strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_custRanked`` with a ``CustomerSegment`` column placed
        last.  A customer matches a segment when
        ``FFrom <= FNorm < FTo`` and ``MFrom <= MNorm < MTo``; upper bounds
        are exclusive.  Segments are applied in row order, so a later
        matching row overrides an earlier one.  Customers matching no row
        are labelled ``"Unassigned"``.
    """
    # Work on a copy: drop() returns a new frame, so the caller's DataFrame
    # keeps any CustomerSegment column it already had.
    labelled = df_custRanked.drop(columns=['CustomerSegment'], errors='ignore')
    labelled = labelled.assign(CustomerSegment="Unassigned")
    for _, segment in df_custseg.iterrows():
        f_in_range = ((labelled["FNorm"] >= float(segment.FFrom)) &
                      (labelled["FNorm"] < float(segment.FTo)))
        m_in_range = ((labelled["MNorm"] >= float(segment.MFrom)) &
                      (labelled["MNorm"] < float(segment.MTo)))
        labelled.loc[f_in_range & m_in_range, 'CustomerSegment'] = segment.CustomerSegment
    return labelled

# -------------------------------------------------------------------


In [3]:
# Wrapper function to run Customer Segmentation
def segment_customers(df_pos, df_custsegconfig, in_segment_mode='AUTO'):
    """Run the whole customer-segmentation flow on POS data.

    Parameters
    ----------
    df_pos : pandas.DataFrame
        POS line items, passed to ``customer_stats``.
    df_custsegconfig : pandas.DataFrame
        Segment configuration.  Its row count sets the number of clusters,
        and in AUTO mode its ``CustomerSegment`` values are assigned, in row
        order, to the groups ranked by ``segment_stats``.
    in_segment_mode : str, default 'AUTO'
        ``'AUTO'`` names the k-means groups from the configuration; any
        other value maps customers with the configured F and M ranges
        instead.

    Returns
    -------
    pandas.DataFrame
        Customers with their F/M values and a ``CustomerSegment`` column.
    """
    # Per-customer metrics and their normalised counterparts.
    stats = customer_stats(df_pos)
    normals = customer_normals(stats)

    # Clustering runs in both modes; it also builds the F/M table that the
    # manual mapping consumes.
    cluster_count = len(df_custsegconfig)
    print("Number of Clusters for Auto Segmentation : ", cluster_count)
    ranked = auto_cust_ranks(normals, stats, cluster_count)

    if in_segment_mode != 'AUTO':
        # MANUAL: map customers through the configured ranges.
        return manual_cust_segments(ranked, df_custsegconfig)

    # AUTO: pair the ranked groups with the configured segment names.
    group_stats = segment_stats(ranked)
    group_stats = group_stats.assign(CustomerSegment=df_custsegconfig['CustomerSegment'].values)
    return auto_cust_segments(ranked, group_stats)



### Functions to read and write files in Watson Data Platform

In [4]:
import ibm_boto3
from botocore.client import Config


In [5]:
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [6]:
# @hidden_cell
# Connection settings for IBM Cloud Object Storage.
# The API key and service ID are read from the environment so that they are
# never stored in the notebook; a missing variable raises KeyError here.
# NOTE(review): the key that used to be embedded in this cell was shared with
# the notebook and should be revoked and replaced.
import os

credentials_2 = {
    'IBM_API_KEY_ID': os.environ['IBM_API_KEY_ID'],
    'IAM_SERVICE_ID': os.environ['IAM_SERVICE_ID'],
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.ng.bluemix.net/oidc/token',
    'BUCKET': 'campaignmanagementded83c5c44d440ebba56858d64bcdce0',
    'FILE': 'Segmented Customers Out.csv'
}


In [7]:
# Create the Cloud Object Storage client shared by get_file / put_file.
# Every connection setting comes from the `credentials_2` dictionary.
cos = ibm_boto3.client(
    's3',
    endpoint_url=credentials_2['ENDPOINT'],
    ibm_auth_endpoint=credentials_2['IBM_AUTH_ENDPOINT'],
    ibm_api_key_id=credentials_2['IBM_API_KEY_ID'],
    ibm_service_instance_id=credentials_2['IAM_SERVICE_ID'],
    config=Config(signature_version='oauth'),
)

def get_file(filename):
    '''Fetch an object from the configured bucket.

    `filename` is used as the object key.  Returns the 'Body' entry of the
    `get_object` response, which callers read with `.read()`.
    '''
    response = cos.get_object(Bucket=credentials_2['BUCKET'], Key=filename)
    return response['Body']

def load_string(fileobject):
    '''Read and return the entire contents of `fileobject`.

    The value is whatever `fileobject.read()` yields; callers in this
    notebook decode it as UTF-8 afterwards.
    '''
    return fileobject.read()

def put_file(filename, filecontents):
    '''Upload `filecontents` to the configured bucket under the key `filename`.

    Returns the response of the `put_object` call.
    '''
    return cos.put_object(
        Bucket=credentials_2['BUCKET'],
        Key=filename,
        Body=filecontents,
    )


## Read POS data

In [8]:
### Read POS data
# Object key of the POS extract in the Cloud Object Storage bucket.
pos_filename = 'Online Retail Sample.csv'

# Download the object and decode its contents as UTF-8 text.
pos_data = load_string(get_file(pos_filename))
inputfo = pos_data.decode('utf-8')

# Wrap the text in a file-like object so pandas can parse it as
# comma-separated data.
testdata = StringIO(inputfo)
df_pos = pd.read_csv(testdata, sep=",")

# Preview the first rows.
df_pos.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/2010 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01/12/2010 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/2010 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/2010 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/2010 08:26:00,3.39,17850.0,United Kingdom


## Read Config data

In [9]:
### Read Config file
# Object key of the customer-segment configuration in the bucket.
config_filename = 'PromoConfig.txt'

# Download the object and decode its contents as UTF-8 text.
config_data = load_string(get_file(config_filename))
inputfo = config_data.decode('utf-8')

# The configuration is pipe-delimited.
testdata = StringIO(inputfo)
df_custsegconfig = pd.read_csv(testdata, sep="|")

# Not the last expression of the cell, so this preview is not displayed.
df_custsegconfig.head()

# Order the segments by rank before they are passed to segment_customers.
df_custsegconfig = df_custsegconfig.sort_values(by=['SegmentAutoRank'])
df_custsegconfig.head()

Unnamed: 0,CustomerSegment,SegmentAutoRank,RFrom,RTo,FFrom,FTo,MFrom,MTo
0,Platinum,1,40,50,40,50,40,50
1,Loyal Gold,2,20,50,30,50,30,50
2,Up Sell,3,40,50,40,50,10,30
3,Cross Sell,4,30,40,0,10,0,10
4,Attend Now,5,20,30,20,30,20,30


In [10]:
# AUTO segmentation of Customers: k-means groups are given the segment names
# from the configuration.
df_custsegments_auto = segment_customers(df_pos, df_custsegconfig, in_segment_mode='AUTO')
# Preview the first segmented customers.
df_custsegments_auto.head()

Number of Clusters for Auto Segmentation :  7
   InvoiceNocountNorm  LineValuesumNorm
0           15.476190          5.369130
1           85.714286         29.090105
2           22.619048         12.931350
3           16.666667          3.898508
4            0.000000          0.000000
   CustomerGroup        Mmean      Fmean  FNormmean   MNormmean
0              0   387.246667  60.333333  70.634921    5.809794
1              1   211.468056   4.777778   4.497354    3.138472
2              2  6585.160000  33.000000  38.095238  100.000000
3              3   326.967500  32.875000  37.946429    4.893727
4              4  2667.533333  15.333333  17.063492   40.463501
   CustomerGroup        Mmean      Fmean  FNormmean   MNormmean
2              2  6585.160000  33.000000  38.095238  100.000000
4              4  2667.533333  15.333333  17.063492   40.463501
5              5  1052.560000  80.750000  94.940476   15.920617
6              6   414.502727  15.939394  17.784993    6.224007
0         

Unnamed: 0,CustomerGroup,CustomerID,F,FNorm,M,MNorm,CustomerSegment
39,0,15012.0,54,63.095238,191.06,2.828329,Attend Now
85,0,17908.0,58,67.857143,243.28,3.621921,Attend Now
36,0,14729.0,71,83.333333,313.49,4.688908,Attend Now
48,0,15525.0,57,66.666667,313.93,4.695595,Attend Now
50,0,15862.0,64,75.0,354.23,5.308037,Attend Now


In [11]:
# MANUAL segmentation of Customers: customers are mapped to segments through
# the normalised F and M ranges in the Customer Segment configuration.
df_custsegments_manual = segment_customers(df_pos, df_custsegconfig, in_segment_mode='MANUAL')
df_custsegments_manual.head()

Number of Clusters for Auto Segmentation :  7
   InvoiceNocountNorm  LineValuesumNorm
0           15.476190          5.369130
1           85.714286         29.090105
2           22.619048         12.931350
3           16.666667          3.898508
4            0.000000          0.000000


Unnamed: 0,CustomerGroup,CustomerID,F,FNorm,M,MNorm,CustomerSegment
70,0,17346.0,35,40.47619,146.61,2.152819,Unassigned
83,0,17897.0,35,40.47619,165.89,2.445819,Unassigned
33,0,14606.0,40,46.428571,198.32,2.93866,Unassigned
32,0,14594.0,31,35.714286,255.0,3.800031,Unassigned
52,0,15983.0,32,36.904762,440.89,6.625017,Unassigned


### Write to csv file in Watson data storage

In [12]:
import ibm_boto3
from botocore.client import Config

In [13]:
# @hidden_cell
# Connection settings for IBM Cloud Object Storage.
# NOTE(review): this cell repeats the `credentials_2` definition of cell
# In [6]; consider keeping a single copy.
# The API key and service ID are read from the environment so that they are
# never stored in the notebook; a missing variable raises KeyError here.
# The key that used to be embedded in this cell should be revoked and replaced.
import os

credentials_2 = {
    'IBM_API_KEY_ID': os.environ['IBM_API_KEY_ID'],
    'IAM_SERVICE_ID': os.environ['IAM_SERVICE_ID'],
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.ng.bluemix.net/oidc/token',
    'BUCKET': 'campaignmanagementded83c5c44d440ebba56858d64bcdce0',
    'FILE': 'Segmented Customers Out.csv'
}


In [14]:
# Create the Cloud Object Storage client shared by get_file / put_file.
# Every connection setting comes from the `credentials_2` dictionary.
# NOTE(review): repeats the client creation of cell In [7]; consider keeping
# a single copy.
cos = ibm_boto3.client(
    's3',
    endpoint_url=credentials_2['ENDPOINT'],
    ibm_auth_endpoint=credentials_2['IBM_AUTH_ENDPOINT'],
    ibm_api_key_id=credentials_2['IBM_API_KEY_ID'],
    ibm_service_instance_id=credentials_2['IAM_SERVICE_ID'],
    config=Config(signature_version='oauth'),
)

def get_file(filename):
    '''Fetch an object from the configured bucket.

    `filename` is used as the object key.  Returns the 'Body' entry of the
    `get_object` response, which callers read with `.read()`.

    NOTE(review): repeats the `get_file` defined in cell In [7].
    '''
    response = cos.get_object(Bucket=credentials_2['BUCKET'], Key=filename)
    return response['Body']

def load_string(fileobject):
    '''Read and return the entire contents of `fileobject`.

    The value is whatever `fileobject.read()` yields.

    NOTE(review): repeats the `load_string` defined in cell In [7].
    '''
    return fileobject.read()

def put_file(filename, filecontents):
    '''Upload `filecontents` to the configured bucket under the key `filename`.

    Returns the response of the `put_object` call.

    NOTE(review): repeats the `put_file` defined in cell In [7].
    '''
    return cos.put_object(
        Bucket=credentials_2['BUCKET'],
        Key=filename,
        Body=filecontents,
    )

In [15]:
# Write the AUTO-segmented customers to Cloud Object Storage.
file_auto_segment = "Segmented Customers Auto.csv"
# to_csv() without a path returns the CSV text (index column included).
txt = df_custsegments_auto.to_csv()
# The upload response is the last expression, so the notebook displays it.
put_file(file_auto_segment, txt)

{'ETag': '"e385e7966c65fa9242fb3a558fcad47c"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Tue, 17 Jul 2018 08:16:10 GMT',
   'etag': '"e385e7966c65fa9242fb3a558fcad47c"',
   'server': '3.13.3.57',
   'x-amz-request-id': 'c99ebbb4-96d7-49f6-a2e6-e451286bb594',
   'x-clv-request-id': 'c99ebbb4-96d7-49f6-a2e6-e451286bb594',
   'x-clv-s3-version': '2.5'},
  'HTTPStatusCode': 200,
  'HostId': '',
  'RequestId': 'c99ebbb4-96d7-49f6-a2e6-e451286bb594',
  'RetryAttempts': 0}}

In [16]:
# Write the MANUAL-segmented customers to Cloud Object Storage.
file_manual_segment = "Segmented Customers Manual.csv"
# to_csv() without a path returns the CSV text (index column included).
txt = df_custsegments_manual.to_csv()
# The upload response is the last expression, so the notebook displays it.
put_file(file_manual_segment, txt)

{'ETag': '"07b1627c62caf509f93b98bcd7564ce2"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Tue, 17 Jul 2018 08:16:11 GMT',
   'etag': '"07b1627c62caf509f93b98bcd7564ce2"',
   'server': '3.13.3.57',
   'x-amz-request-id': 'e04be833-bfec-485d-bee5-8714539ca728',
   'x-clv-request-id': 'e04be833-bfec-485d-bee5-8714539ca728',
   'x-clv-s3-version': '2.5'},
  'HTTPStatusCode': 200,
  'HostId': '',
  'RequestId': 'e04be833-bfec-485d-bee5-8714539ca728',
  'RetryAttempts': 0}}