# Customer Management Example

## Narrative


Your company has grouped customers into three classes, active customers, semi-active customers and inactive customers. In each time period the company earns something from each active customer, less from semi-active customers. In each time period, the company can do some customer marketing that can move some customers from one status to another, but this has a cost. What is the best action?

- Email Marketing: to all customers, bad and good effects
- Coupon: to semi-active and non-active, some improvements but some active becomes non-active
- Resurrection: to non-active, many non-active becomes active or semi-active, but with little drawbaks
- Special: only little improvements

## Libraries

In [1]:
import numpy as np
import random


In [12]:
environ_params = {
    'reward_per_active' : 15,
    'reward_per_semi_active' : 10,
    
    'action_email_cost' : 3,
    'action_coupon_cost' : 20,
    'action_resurrection_cost' : 15,
    'action_special_cost' : 16,

    'epoch_new_active' : 10,
    'epoch_new_semi_active' : 5,
    'epoch_active_to_semi_active': (.02, .001),
    'epoch_semi_active_to_non_active': (.05, .002),
    
    'email_semi_active_to_active' : (.1, .005),
    'email_non_active_to_active' : (.15, .02),
    'email_non_active_to_semi_active' : (.1, .02),
    'email_2_active_to_non_active': (.05, .005),
    'email_2_active_to_semi_active' : (.05, .005),
    'email_2_semi_active_to_non_active' : (.1 , .02),
    
    'coupon_semi_active_to_active' : (.5, .02),
    'coupon_non_active_to_active' : (.2, .005),
    'coupon_active_to_non_active' : (.02, .001),
    
    'resurrection_non_active_to_active' : (.4, .02),
    'resurrection_non_active_to_semi_active' : (.1, .015),
    'resurrection_active_to_semi_active' : (.05, .005),
    'resurrection_semi_active_to_non_active' : (.05, .005),

    'special_semi_active_to_active' : (.1, .005),
    'special_non_active_to_semi_active' : (.05, .005),
}





In [13]:

def customer_transition(source_number, params):
    loc, scale = params
    # Sample from Normal random variable
    transation = np.random.normal(loc, scale, 1)[0]
    # Make sample a proportion
    transation = max( min(1,transation), 0)
    # Find the number of transiting customers
    transation *= source_number
    # Cast into int
    transation = int(transation)

    return transation




In [14]:

class Environment():

    def __init__(self, active_customers, semi_active_customers, non_active_customers):
        self._active_customers = active_customers
        self._semi_active_customers = semi_active_customers
        self._non_active_customers = non_active_customers

        self._active_customers_final = 0
        self._semi_active_customers_final = 0
        self._non_active_customers_final = 0

        #self._active_customers_next_transitions = 0
        #self._semi_active_customers_next_transitions = 0
        #self._non_active_customers_next_transitions = 0


    def __str__(self):
        return f"Active: \t {self._active_customers}\nSemi act: \t {self._semi_active_customers}\nNon act \t {self._non_active_customers}\n"


    # Good transitions
        
    def _semi_active_to_active(self, param):
        number = customer_transition(self._semi_active_customers, param)
        self._semi_active_customers_final -= number
        self._active_customers_final += number

    def _non_active_to_active(self, param):
        number = customer_transition(self._non_active_customers, param)
        self._non_active_customers_final -= number
        self._active_customers_final += number

    def _non_active_to_semi_active(self, param):
        number = customer_transition(self._non_active_customers, param)
        self._non_active_customers_final -= number
        self._semi_active_customers_final += number


    # Bad transitions
        
    def _active_to_semi_active(self, param):
        number = customer_transition(self._active_customers, param)
        self._active_customers_final -= number
        self._semi_active_customers_final += number
        
    def _active_to_non_active(self, param):
        number = customer_transition(self._active_customers, param)
        self._active_customers_final -= number
        self._non_active_customers_final += number

    def _semi_active_to_non_active(self, param):
        number = customer_transition(self._semi_active_customers, param)
        self._semi_active_customers_final -= number
        self._non_active_customers_final += number


    # Actions    
        
    def _step_organic(self):
        self._active_to_semi_active(environ_params.get('epoch_active_to_semi_active'))
        self._semi_active_to_non_active(environ_params.get('epoch_semi_active_to_non_active'))

    def _action_email(self):
        self._semi_active_to_active(environ_params.get('email_semi_active_to_active'))
        self._non_active_to_active(environ_params.get('email_non_active_to_active'))
        self._non_active_to_semi_active(environ_params.get('email_non_active_to_semi_active'))

        # Should be 2 step ahead
        self._active_to_non_active(environ_params.get('email_2_active_to_non_active'))
        self._active_to_semi_active(environ_params.get('email_2_active_to_semi_active'))
        self._semi_active_to_non_active(environ_params.get('email_2_semi_active_to_non_active'))


    def _action_coupon(self):
        self._semi_active_to_active(environ_params.get('coupon_semi_active_to_active'))
        self._non_active_to_active(environ_params.get('coupon_non_active_to_active'))
        self._active_to_non_active(environ_params.get('coupon_active_to_non_active'))

    def _action_resurrection(self):
        self._non_active_to_active(environ_params.get('resurrection_non_active_to_active'))
        self._non_active_to_semi_active(environ_params.get('resurrection_non_active_to_semi_active'))
        self._active_to_semi_active(environ_params.get('resurrection_active_to_semi_active'))
        self._semi_active_to_non_active(environ_params.get('resurrection_semi_active_to_non_active'))

    def _action_special(self):
        self._semi_active_to_active(environ_params.get('special_semi_active_to_active'))
        self._non_active_to_semi_active(environ_params.get('special_non_active_to_semi_active'))


    # State transition
        
    def step(self, action : str = None):

        # New organic customers
        self._active_customers += environ_params.get('epoch_new_active')
        self._semi_active_customers += environ_params.get('epoch_new_semi_active')
        
    
        # Setup the customers classes after the action
        self._active_customers_final = self._active_customers
        self._semi_active_customers_final = self._semi_active_customers
        self._non_active_customers_final = self._non_active_customers

        # Do step organic transitions
        self._step_organic()

        # Calculate reward
        reward = self._active_customers * environ_params.get('reward_per_active') + \
                 self._semi_active_customers * environ_params.get('reward_per_semi_active')

        # Do action and update the costs
        if action == 'email':
            self._action_email()
            reward -= (self._active_customers + self._semi_active_customers + self._non_active_customers) * environ_params.get('action_email_cost')
        elif action == 'coupon':
            self._action_coupon()
            reward -= (self._semi_active_customers + self._non_active_customers) * environ_params.get('action_coupon_cost')
        elif action == 'resurrection':
            self._action_resurrection()
            reward -= (self._non_active_customers) * environ_params.get('action_resurrection_cost')
        elif action == 'special':
            self._action_special()
            reward -= (self._active_customers + self._semi_active_customers + self._non_active_customers) * environ_params.get('action_special_cost')

        # Update customer classes
        self._active_customers = max(0, self._active_customers_final)
        self._semi_active_customers = max(0, self._semi_active_customers_final)
        self._non_active_customers = max(0, self._non_active_customers_final)

        return reward

    def make_step(self, action : str = None):
        print(f"Reward: \t {self.step(action)}\n - \n")


    # For VFA

    def export_status(self):
        return (self._active_customers, self._semi_active_customers, self._non_active_customers)
        





In [15]:
my_environment = Environment(active_customers=1500, semi_active_customers=1200, non_active_customers=2300)

In [16]:
print(my_environment)

Active: 	 1500
Semi act: 	 1200
Non act 	 2300



In [17]:
my_environment.make_step()
print(my_environment)

Reward: 	 34700
 - 

Active: 	 1478
Semi act: 	 1184
Non act 	 2353



In [18]:
my_environment.make_step()
print(my_environment)

Reward: 	 34210
 - 

Active: 	 1461
Semi act: 	 1157
Non act 	 2412



In [19]:
my_environment.make_step('email')
print(my_environment)

Reward: 	 18550
 - 

Active: 	 1786
Semi act: 	 1259
Non act 	 2000



In [20]:
my_environment.make_step()
print(my_environment)

Reward: 	 39580
 - 

Active: 	 1761
Semi act: 	 1236
Non act 	 2063



In [21]:
my_environment.make_step('special')
print(my_environment)

Reward: 	 -42225
 - 

Active: 	 1857
Semi act: 	 1208
Non act 	 2010



In [22]:
my_environment.make_step()
print(my_environment)

Reward: 	 40135
 - 

Active: 	 1829
Semi act: 	 1188
Non act 	 2073



## Value Function Approximation


In [23]:
class VFA_Agent():

    def __init__(self, actions, gamma, alpha, epsilon):

        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon # Exploration-exploitation balance parameter

        self._actions = actions
        self._actions_dim = len(actions)

        self._state = (0, 0, 0)

        self._tile_coding_coef = 100

        self._state_active_dim = 100
        self._state_semi_active_dim = 100
        self._state_non_active_dim = 100

        self._q_function = np.zeros([self._state_active_dim, self._state_semi_active_dim, self._state_non_active_dim, self._actions_dim])


    def _tile_coding(self, export):
        (num_active, num_semi_active, num_non_active) = export
        return (int(num_active/self._tile_coding_coef), 
                       int(num_semi_active/self._tile_coding_coef), 
                       int(num_non_active/self._tile_coding_coef))


    def _set_state(self, export):
        self._state = self._tile_coding(export)

        
    def _select_action(self):
        """
        Select best action considering exploration-exploitation balance
        """
        if np.random.rand() < self.epsilon:
            return random.randint(0, self._actions_dim - 1)
        else:
            q_values = [self._q_function[self._state + (a,)] for a in range(self._actions_dim)]
            # Break the ties!
            best_actions = np.where(q_values == np.max(q_values))[0]
            return np.random.choice(best_actions)
        
    def _update_q_function(self, action, reward, next_state):
        """
        Q-Learning core: this fuction calculates the update of the q function
        """
        
        # get the current value of the q function for the given state and action 
        current_q = self._q_function[self._state + (action,)]
        
        # get the q function value for the prescribed action for the next state
        max_next_q = np.max(self._q_function[next_state])

        # calculate the temporal difference  (Q - learning)
        td_error = reward + self.gamma * max_next_q - current_q

        # update the q function (Q - learning)
        self._q_function[self._state + (action,)] += self.alpha * td_error

    def train(self, environment: Environment, num_episodes):
        """
        Run simulation to learn the q function
        """
        
        # Training loop
        for episode in range(num_episodes):
            
            # Set the status from the environment status
            self._set_state(environment.export_status())

            # Choose an action
            action = self._select_action()

            # Step the environment, save the reward
            reward = environment.step(self._actions[action])

            # Save the new state
            new_state = self._tile_coding(environment.export_status())

            # Update the q function 
            self._update_q_function(action, reward, new_state)


    def get_value_function(self, export):
        state = self._tile_coding(export)
        q_values = self._q_function[state, :]
        return np.mean(q_values)


    

In [41]:
actions = [None, 'email', 'coupon', 'resurrection', 'special']
agent = VFA_Agent(actions, gamma=0.9, alpha=0.1, epsilon=0.1)

In [42]:
for _ in range(100):
    my_environment = Environment(active_customers=1500, semi_active_customers=1200, non_active_customers=2300)
    agent.train(my_environment, 100)

In [43]:
print(my_environment)

Active: 	 3699
Semi act: 	 1717
Non act 	 1084



In [44]:
agent.get_value_function(my_environment.export_status())

15.393623140987652

In [45]:
agent._q_function

array([[[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        ...,

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         

## Direct LookAhead


In [29]:
class DLA_Agent():

    def __init__(self, actions):

        #self.gamma = gamma
        #self.alpha = alpha
        #self.epsilon = epsilon # Exploration-exploitation balance parameter

        self._actions = actions
        self._actions_dim = len(actions)

        self._state = (0, 0, 0)

        self._tile_coding_coef = 10

        self._state_active_dim = 1000
        self._state_semi_active_dim = 1000
        self._state_non_active_dim = 1000

        self._q_function = np.zeros([self._actions_dim])


    def _tile_coding(self, export):
        (num_active, num_semi_active, num_non_active) = export
        return (int(num_active/self._tile_coding_coef), 
                       int(num_semi_active/self._tile_coding_coef), 
                       int(num_non_active/self._tile_coding_coef))


    def _set_state(self, export):
        self._state = self._tile_coding(export)

        
    def _select_action(self):
        """
        Select best action using euristic policy
        """
        return random.randint(0, self._actions_dim - 1)


    def train(self, initial_environment, num_episodes, len_episodes):
        """
        Run simulation to learn the q function
        """

        # Loop on the action to be trained
        for action_train in range(self._actions_dim): 

            rewards = []
        
            # Training loop
            for _ in range(num_episodes):
                
                environment = Environment(active_customers=initial_environment.get('active_customers'), 
                                          semi_active_customers=initial_environment.get('semi_active_customers'), 
                                          non_active_customers=initial_environment.get('non_active_customers'))

                for step in range(len_episodes):
                
                    # Set the status from the environment status
                    self._set_state(environment.export_status())

                    # Choose an action, if this is the first step, the use the action in training
                    if step == 0:
                        action = action_train
                    else:
                        action = self._select_action()

                    # Step the environment, save the reward
                    rewards.append( environment.step(self._actions[action]) )


            self._q_function[action_train] = np.mean(rewards)


    def get_value_function(self):
        return self._q_function
        


    

In [36]:
actions = [None, 'email', 'coupon', 'resurrection', 'special']

agent = DLA_Agent(actions)

initial_environment = {'active_customers': 1500,
                       'semi_active_customers': 1200,
                       'non_active_customers': 2300,}



agent.train(initial_environment=initial_environment, num_episodes=50, len_episodes=10)

In [37]:
print(agent.get_value_function())


[19897.63 19579.11 19439.16 24670.83 10318.26]
