In [1]:
%load_ext autoreload
%autoreload 2

# Markov chain text generation example

In [2]:
from package.mock_data import df
import pandas as pd

df.shape

(27, 7)

In [3]:
df.head(5)

Unnamed: 0,session_id,turn,user_message,assistant_message,state,outcome,timestamp
0,s1,1,"Hi, I need help choosing a laptop",I can help you find the perfect laptop. What w...,inquiry,convert,2024-06-01 10:00:00
1,s1,2,Gaming and video editing,"For gaming and video editing, I recommend our ...",specification,convert,2024-06-01 10:15:00
2,s1,3,What about the price range?,Our gaming laptops range from $1200-$3000. Wha...,pricing,convert,2024-06-01 10:30:00
3,s1,4,Around $2000 sounds good,Perfect! I have a great model at $1899 that fi...,purchase_intent,convert,2024-06-01 10:45:00
4,s1,5,"Yes, please proceed",Great! I have added the laptop to your cart. P...,purchase,convert,2024-06-01 11:00:00


# Data Preprocessing

In [4]:
sessions = df.groupby('session_id').agg(
    states=pd.NamedAgg(column='state', aggfunc=list),
    outcome=pd.NamedAgg(column='outcome', aggfunc='first')
).reset_index()
sessions['length'] = sessions['states'].apply(len)
print(sessions.to_string())

  session_id                                                            states  outcome  length
0         s1      [inquiry, specification, pricing, purchase_intent, purchase]  convert       5
1         s2                                                   [inquiry, exit]   bounce       2
2         s3                        [support, upsell, specification, purchase]  convert       4
3         s4                           [inquiry, specification, pricing, exit]   bounce       4
4         s5  [information, inquiry, specification, purchase_intent, purchase]  convert       5
5         s6                                         [pricing, trade_in, exit]   bounce       3
6         s7                                 [inquiry, inquiry, inquiry, exit]   bounce       4


# Transition Analysis

## Extract transitions

- we can apply bi-gram or tri-gram concept, but now bi-gram for the test

In [None]:
def extract_transitions(states):
    return list(zip(states[:-1], states[1:]))

sessions['transitions'] = sessions['states'].apply(extract_transitions)
sessions


Unnamed: 0,session_id,states,outcome,length,transitions
0,s1,"[inquiry, specification, pricing, purchase_int...",convert,5,"[(inquiry, specification), (specification, pri..."
1,s2,"[inquiry, exit]",bounce,2,"[(inquiry, exit)]"
2,s3,"[support, upsell, specification, purchase]",convert,4,"[(support, upsell), (upsell, specification), (..."
3,s4,"[inquiry, specification, pricing, exit]",bounce,4,"[(inquiry, specification), (specification, pri..."
4,s5,"[information, inquiry, specification, purchase...",convert,5,"[(information, inquiry), (inquiry, specificati..."
5,s6,"[pricing, trade_in, exit]",bounce,3,"[(pricing, trade_in), (trade_in, exit)]"
6,s7,"[inquiry, inquiry, inquiry, exit]",bounce,4,"[(inquiry, inquiry), (inquiry, inquiry), (inqu..."


In [6]:
from collections import Counter

transition_counts = dict(
    convert=Counter(),
    bounce=Counter()
)

for _, row in sessions.iterrows():
    transition_counts[row["outcome"]].update(row["transitions"])

transition_counts

{'convert': Counter({('inquiry', 'specification'): 2,
          ('purchase_intent', 'purchase'): 2,
          ('specification', 'pricing'): 1,
          ('pricing', 'purchase_intent'): 1,
          ('support', 'upsell'): 1,
          ('upsell', 'specification'): 1,
          ('specification', 'purchase'): 1,
          ('information', 'inquiry'): 1,
          ('specification', 'purchase_intent'): 1}),
 'bounce': Counter({('inquiry', 'exit'): 2,
          ('inquiry', 'inquiry'): 2,
          ('inquiry', 'specification'): 1,
          ('specification', 'pricing'): 1,
          ('pricing', 'exit'): 1,
          ('pricing', 'trade_in'): 1,
          ('trade_in', 'exit'): 1})}

In [7]:
transition_counts['convert']

Counter({('inquiry', 'specification'): 2,
         ('purchase_intent', 'purchase'): 2,
         ('specification', 'pricing'): 1,
         ('pricing', 'purchase_intent'): 1,
         ('support', 'upsell'): 1,
         ('upsell', 'specification'): 1,
         ('specification', 'purchase'): 1,
         ('information', 'inquiry'): 1,
         ('specification', 'purchase_intent'): 1})

In [None]:
def normalize_counter(counter):
    """We normalize a Counter to get probabilities of each outcome."""
    total = sum(counter.values())
    return {k: v / total for k, v in counter.items()}

convert_rates = normalize_counter(transition_counts['convert'])
bounce_rates = normalize_counter(transition_counts['bounce'])

convert_rates, bounce_rates

({('inquiry', 'specification'): 0.18181818181818182,
  ('specification', 'pricing'): 0.09090909090909091,
  ('pricing', 'purchase_intent'): 0.09090909090909091,
  ('purchase_intent', 'purchase'): 0.18181818181818182,
  ('support', 'upsell'): 0.09090909090909091,
  ('upsell', 'specification'): 0.09090909090909091,
  ('specification', 'purchase'): 0.09090909090909091,
  ('information', 'inquiry'): 0.09090909090909091,
  ('specification', 'purchase_intent'): 0.09090909090909091},
 {('inquiry', 'exit'): 0.2222222222222222,
  ('inquiry', 'specification'): 0.1111111111111111,
  ('specification', 'pricing'): 0.1111111111111111,
  ('pricing', 'exit'): 0.1111111111111111,
  ('pricing', 'trade_in'): 0.1111111111111111,
  ('trade_in', 'exit'): 0.1111111111111111,
  ('inquiry', 'inquiry'): 0.2222222222222222})

## Calculating transition scores for each transition using convert_rates
- support may be used in the future, maybe borrow the concept from other analytic techniques

In [9]:
all_transitions = set(transition_counts['convert']) | set(transition_counts['bounce'])
rows = []
for t in all_transitions:
    convert = transition_counts['convert'].get(t, 0)
    bounce = transition_counts['bounce'].get(t, 0)
    total = convert + bounce
    row = dict(
        transition=f"{t[0]} -> {t[1]}",
        convert=convert,
        bounce=bounce,
        convert_rates=convert / total if total > 0 else 0.0,
        support=total,
        convert_norm = convert_rates.get(t, 0),
        bounce_norm = bounce_rates.get(t, 0),
    )
    rows.append(row)

transition_summary = pd.DataFrame(rows).sort_values(['convert_rates', 'support'], ascending=False).reset_index(drop=True)
transition_summary

Unnamed: 0,transition,convert,bounce,convert_rates,support,convert_norm,bounce_norm
0,purchase_intent -> purchase,2,0,1.0,2,0.181818,0.0
1,information -> inquiry,1,0,1.0,1,0.090909,0.0
2,pricing -> purchase_intent,1,0,1.0,1,0.090909,0.0
3,upsell -> specification,1,0,1.0,1,0.090909,0.0
4,specification -> purchase,1,0,1.0,1,0.090909,0.0
5,support -> upsell,1,0,1.0,1,0.090909,0.0
6,specification -> purchase_intent,1,0,1.0,1,0.090909,0.0
7,inquiry -> specification,2,1,0.666667,3,0.181818,0.111111
8,specification -> pricing,1,1,0.5,2,0.090909,0.111111
9,inquiry -> exit,0,2,0.0,2,0.0,0.222222


In [10]:
transition_score_map = {
    row['transition']: row['convert_rates']
    for _, row in transition_summary.iterrows()
}
transition_score_map

{'purchase_intent -> purchase': 1.0,
 'information -> inquiry': 1.0,
 'pricing -> purchase_intent': 1.0,
 'upsell -> specification': 1.0,
 'specification -> purchase': 1.0,
 'support -> upsell': 1.0,
 'specification -> purchase_intent': 1.0,
 'inquiry -> specification': 0.6666666666666666,
 'specification -> pricing': 0.5,
 'inquiry -> exit': 0.0,
 'inquiry -> inquiry': 0.0,
 'trade_in -> exit': 0.0,
 'pricing -> trade_in': 0.0,
 'pricing -> exit': 0.0}

## Full Path analysis

- find out if we have the series of transition as full path what score should it be

In [11]:
def path_score(states, transition_score_map):
    transitions = list(zip(states[:-1], states[1:]))
    scores = []
    for t in transitions:
        key = f"{t[0]} -> {t[1]}"
        score = transition_score_map.get(key, 0.5) # Default score for unseen transitions, can be changed
        scores.append(score)
    return sum(scores) / len(scores) if scores else 0.5

sessions['path_score'] = sessions['states'].apply(lambda states: path_score(states, transition_score_map))
sessions.sort_values('path_score', ascending=False)

Unnamed: 0,session_id,states,outcome,length,transitions,path_score
2,s3,"[support, upsell, specification, purchase]",convert,4,"[(support, upsell), (upsell, specification), (...",1.0
4,s5,"[information, inquiry, specification, purchase...",convert,5,"[(information, inquiry), (inquiry, specificati...",0.916667
0,s1,"[inquiry, specification, pricing, purchase_int...",convert,5,"[(inquiry, specification), (specification, pri...",0.791667
3,s4,"[inquiry, specification, pricing, exit]",bounce,4,"[(inquiry, specification), (specification, pri...",0.388889
1,s2,"[inquiry, exit]",bounce,2,"[(inquiry, exit)]",0.0
5,s6,"[pricing, trade_in, exit]",bounce,3,"[(pricing, trade_in), (trade_in, exit)]",0.0
6,s7,"[inquiry, inquiry, inquiry, exit]",bounce,4,"[(inquiry, inquiry), (inquiry, inquiry), (inqu...",0.0


## Find critical or danger transtions
- convert_rates below some threshold, explore/experiment with different thresholds

In [16]:
transition_summary.query("convert_rates < 0.6")

Unnamed: 0,transition,convert,bounce,convert_rates,support,convert_norm,bounce_norm
8,specification -> pricing,1,1,0.5,2,0.090909,0.111111
9,inquiry -> exit,0,2,0.0,2,0.0,0.222222
10,inquiry -> inquiry,0,2,0.0,2,0.0,0.222222
11,trade_in -> exit,0,1,0.0,1,0.0,0.111111
12,pricing -> trade_in,0,1,0.0,1,0.0,0.111111
13,pricing -> exit,0,1,0.0,1,0.0,0.111111


## Check occurrence of full path and outcomes

In [12]:
sessions.assign(path=sessions['states'].apply(lambda states: ' -> '.join(states))).groupby('path').agg(
    convert=pd.NamedAgg(column='outcome', aggfunc=lambda outcomes: sum(1 for o in outcomes if o == 'convert')),
    bounce=pd.NamedAgg(column='outcome', aggfunc=lambda outcomes: sum(1 for o in outcomes if o == 'bounce')),
).sort_values('convert', ascending=False)

Unnamed: 0_level_0,convert,bounce
path,Unnamed: 1_level_1,Unnamed: 2_level_1
information -> inquiry -> specification -> purchase_intent -> purchase,1,0
support -> upsell -> specification -> purchase,1,0
inquiry -> specification -> pricing -> purchase_intent -> purchase,1,0
inquiry -> inquiry -> inquiry -> exit,0,1
inquiry -> exit,0,1
inquiry -> specification -> pricing -> exit,0,1
pricing -> trade_in -> exit,0,1


## Loop Detection and Analysis

- study how loops in state transitions affect conversion rates

In [13]:
def has_loop(states):
    return any(states[i] == states[i+1] for i in range(len(states)-1))

sessions['has_loop'] = sessions['states'].apply(has_loop)
sessions.groupby(['has_loop', 'outcome']).size().unstack(fill_value=0)

outcome,bounce,convert
has_loop,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3,3
True,1,0


In [14]:
sessions.query("has_loop==True")

Unnamed: 0,session_id,states,outcome,length,transitions,path_score,has_loop
6,s7,"[inquiry, inquiry, inquiry, exit]",bounce,4,"[(inquiry, inquiry), (inquiry, inquiry), (inqu...",0.0,True


# Use algorithms

## pm4py

- we need to read the docs to understand how it works

In [18]:
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

In [20]:
df.head()

Unnamed: 0,session_id,turn,user_message,assistant_message,state,outcome,timestamp
0,s1,1,"Hi, I need help choosing a laptop",I can help you find the perfect laptop. What w...,inquiry,convert,2024-06-01 10:00:00
1,s1,2,Gaming and video editing,"For gaming and video editing, I recommend our ...",specification,convert,2024-06-01 10:15:00
2,s1,3,What about the price range?,Our gaming laptops range from $1200-$3000. Wha...,pricing,convert,2024-06-01 10:30:00
3,s1,4,Around $2000 sounds good,Perfect! I have a great model at $1899 that fi...,purchase_intent,convert,2024-06-01 10:45:00
4,s1,5,"Yes, please proceed",Great! I have added the laptop to your cart. P...,purchase,convert,2024-06-01 11:00:00


In [26]:
event_log = df.rename(columns={
    "session_id": "case:concept:name",
    "state": "concept:name",
    "timestamp": "time:timestamp",
})[["case:concept:name", "concept:name", "time:timestamp", "outcome"]]
event_log

Unnamed: 0,case:concept:name,concept:name,time:timestamp,outcome
0,s1,inquiry,2024-06-01 10:00:00,convert
1,s1,specification,2024-06-01 10:15:00,convert
2,s1,pricing,2024-06-01 10:30:00,convert
3,s1,purchase_intent,2024-06-01 10:45:00,convert
4,s1,purchase,2024-06-01 11:00:00,convert
5,s2,inquiry,2024-06-02 09:00:00,bounce
6,s2,exit,2024-06-02 09:02:00,bounce
7,s3,support,2024-06-03 14:00:00,convert
8,s3,upsell,2024-06-03 14:05:00,convert
9,s3,specification,2024-06-03 14:10:00,convert


In [27]:
event_log.loc[event_log['outcome'] == 'convert']

Unnamed: 0,case:concept:name,concept:name,time:timestamp,outcome
0,s1,inquiry,2024-06-01 10:00:00,convert
1,s1,specification,2024-06-01 10:15:00,convert
2,s1,pricing,2024-06-01 10:30:00,convert
3,s1,purchase_intent,2024-06-01 10:45:00,convert
4,s1,purchase,2024-06-01 11:00:00,convert
7,s3,support,2024-06-03 14:00:00,convert
8,s3,upsell,2024-06-03 14:05:00,convert
9,s3,specification,2024-06-03 14:10:00,convert
10,s3,purchase,2024-06-03 14:15:00,convert
15,s5,information,2024-06-05 12:00:00,convert


In [35]:
convert_log = event_log[event_log['outcome'] == 'convert']
bounce_log = event_log[event_log['outcome'] == 'bounce']

convert_pm = log_converter.apply(convert_log)
bounce_pm = log_converter.apply(bounce_log)

convert_net, _, _ = heuristics_miner.apply(convert_pm)
bounce_net, _, _ = heuristics_miner.apply(bounce_pm)

In [36]:
convert_net

places: [ intplace_specification, pre_inquiry, pre_purchase, pre_purchase_intent, pre_specification, pre_upsell, sink0, source0 ]
transitions: [ (hid_10, None), (hid_2, None), (information, 'information'), (inquiry, 'inquiry'), (pricing, 'pricing'), (purchase, 'purchase'), (purchase_intent, 'purchase_intent'), (specification, 'specification'), (support, 'support'), (upsell, 'upsell') ]
arcs: [ (hid_10, None)->pre_purchase, (hid_10, None)->pre_purchase_intent, (hid_2, None)->pre_inquiry, (information, 'information')->pre_inquiry, (inquiry, 'inquiry')->pre_specification, (pricing, 'pricing')->pre_purchase_intent, (purchase, 'purchase')->sink0, (purchase_intent, 'purchase_intent')->pre_purchase, (specification, 'specification')->intplace_specification, (support, 'support')->pre_upsell, (upsell, 'upsell')->pre_specification, intplace_specification->(hid_10, None), intplace_specification->(pricing, 'pricing'), pre_inquiry->(inquiry, 'inquiry'), pre_purchase->(purchase, 'purchase'), pre_purc

In [38]:
bounce_net

places: [ intplace_inquiry, intplace_pricing, pre_exit, pre_inquiry, pre_pricing, sink0, source0 ]
transitions: [ (exit, 'exit'), (hid_11, None), (hid_2, None), (hid_7, None), (hid_8, None), (hid_9, None), (inquiry, 'inquiry'), (pricing, 'pricing'), (specification, 'specification'), (trade_in, 'trade_in') ]
arcs: [ (exit, 'exit')->sink0, (hid_11, None)->pre_exit, (hid_2, None)->pre_inquiry, (hid_7, None)->pre_pricing, (hid_8, None)->pre_exit, (hid_9, None)->pre_inquiry, (inquiry, 'inquiry')->intplace_inquiry, (pricing, 'pricing')->intplace_pricing, (specification, 'specification')->pre_pricing, (trade_in, 'trade_in')->pre_exit, intplace_inquiry->(hid_8, None), intplace_inquiry->(hid_9, None), intplace_inquiry->(specification, 'specification'), intplace_pricing->(hid_11, None), intplace_pricing->(trade_in, 'trade_in'), pre_exit->(exit, 'exit'), pre_inquiry->(inquiry, 'inquiry'), pre_pricing->(pricing, 'pricing'), source0->(hid_2, None), source0->(hid_7, None) ]

## hmmlearn

More research = better  

Flows:
- encoding
- preprocess data
- train HMM model
- evaluate model: run many n_components
- select how many hidden states: think it as hyper tuning but the n_components
- match observed states with hidden states
- interpret

In [39]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df["state_id"] = encoder.fit_transform(df["state"])

In [40]:
df.head()

Unnamed: 0,session_id,turn,user_message,assistant_message,state,outcome,timestamp,state_id
0,s1,1,"Hi, I need help choosing a laptop",I can help you find the perfect laptop. What w...,inquiry,convert,2024-06-01 10:00:00,2
1,s1,2,Gaming and video editing,"For gaming and video editing, I recommend our ...",specification,convert,2024-06-01 10:15:00,6
2,s1,3,What about the price range?,Our gaming laptops range from $1200-$3000. Wha...,pricing,convert,2024-06-01 10:30:00,3
3,s1,4,Around $2000 sounds good,Perfect! I have a great model at $1899 that fi...,purchase_intent,convert,2024-06-01 10:45:00,5
4,s1,5,"Yes, please proceed",Great! I have added the laptop to your cart. P...,purchase,convert,2024-06-01 11:00:00,4


In [41]:
def build_sequences(dataframe):
    sequences, lengths = [], []
    for _, g in dataframe.groupby("session_id"):
        seq = g["state_id"].tolist()
        sequences.extend(seq)
        lengths.append(len(seq))
    X = [[x] for x in sequences]
    return X, lengths

X_convert, len_convert = build_sequences(df[df["outcome"] == "convert"])
X_bounce,  len_bounce  = build_sequences(df[df["outcome"] == "bounce"])

In [45]:
X_convert

[[2], [6], [3], [5], [4], [7], [9], [6], [4], [1], [2], [6], [5], [4]]

In [44]:
len_convert

[5, 4, 5]

In [46]:
X_bounce

[[2], [0], [2], [6], [3], [0], [3], [8], [0], [2], [2], [2], [0]]

In [47]:
len_bounce

[2, 4, 3, 4]

In [65]:
df.head()

Unnamed: 0,session_id,turn,user_message,assistant_message,state,outcome,timestamp,state_id
0,s1,1,"Hi, I need help choosing a laptop",I can help you find the perfect laptop. What w...,inquiry,convert,2024-06-01 10:00:00,2
1,s1,2,Gaming and video editing,"For gaming and video editing, I recommend our ...",specification,convert,2024-06-01 10:15:00,6
2,s1,3,What about the price range?,Our gaming laptops range from $1200-$3000. Wha...,pricing,convert,2024-06-01 10:30:00,3
3,s1,4,Around $2000 sounds good,Perfect! I have a great model at $1899 that fi...,purchase_intent,convert,2024-06-01 10:45:00,5
4,s1,5,"Yes, please proceed",Great! I have added the laptop to your cart. P...,purchase,convert,2024-06-01 11:00:00,4


In [48]:
from hmmlearn.hmm import MultinomialHMM

hmm_convert = MultinomialHMM(n_components=4, n_iter=100, random_state=0)
hmm_bounce  = MultinomialHMM(n_components=4, n_iter=100, random_state=0)

hmm_convert.fit(X_convert, len_convert)
hmm_bounce.fit(X_bounce,  len_bounce)

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
Fitting a model with 15 free scalar parameters with only 14 data points will result in a degenerate solution.
Fitting a model with 15 free scalar parameters with only

0,1,2
,n_components,4
,n_trials,"array([2, 0, ..., 2, 2, 2, 0])"
,startprob_prior,1.0
,transmat_prior,1.0
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1B4E4FFD540
,n_iter,100
,tol,0.01
,verbose,False
,params,'ste'


In [51]:
hmm_convert.transmat_

array([[8.20307797e-01, 1.42628223e-01, 3.70549180e-02, 9.06215842e-06],
       [4.30221158e-08, 9.52355734e-02, 8.93087849e-01, 1.16765349e-02],
       [2.45598630e-03, 5.30021116e-03, 9.30764284e-01, 6.14795189e-02],
       [2.44108257e-02, 7.02811984e-08, 7.91457988e-02, 8.96443305e-01]])

In [52]:
hmm_bounce.transmat_

array([[8.20307797e-01, 1.42628223e-01, 3.70549180e-02, 9.06215842e-06],
       [4.30221158e-08, 9.52355734e-02, 8.93087849e-01, 1.16765349e-02],
       [2.45598630e-03, 5.30021116e-03, 9.30764284e-01, 6.14795189e-02],
       [2.44108257e-02, 7.02811984e-08, 7.91457988e-02, 8.96443305e-01]])

In [53]:
hmm_convert.emissionprob_

array([[1.],
       [1.],
       [1.],
       [1.]])

In [54]:
hmm_bounce.emissionprob_

array([[1.],
       [1.],
       [1.],
       [1.]])

In [55]:
import numpy as np

def prefix_scores(session_df):
    obs = [[x] for x in session_df["state_id"].tolist()]
    scores = []

    for i in range(1, len(obs) + 1):
        s = obs[:i]
        scores.append({
            "turn": i,
            "convert_ll": hmm_convert.score(s),
            "bounce_ll": hmm_bounce.score(s)
        })
    return pd.DataFrame(scores)

In [64]:
session = df[df["session_id"] == "s7"]
prefix_scores(session).round(4)

Unnamed: 0,turn,convert_ll,bounce_ll
0,1,-0.0,-0.0
1,2,-0.0,-0.0
2,3,-0.0,-0.0
3,4,0.0,-0.0


In [66]:
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder

# Encode states to integers
le = LabelEncoder()
df['state_id'] = le.fit_transform(df['state'])

# Prepare sequences for each session
sequences = []
lengths = []

for session_id in df['session_id'].unique():
    session_data = df[df['session_id'] == session_id].sort_values('turn')
    sequence = session_data['state_id'].values
    sequences.extend(sequence)
    lengths.append(len(sequence))

# Convert to required format
X = np.array(sequences).reshape(-1, 1)
lengths = np.array(lengths)

# Train separate models for convert vs bounce
convert_sessions = df[df['outcome'] == 'convert']['session_id'].unique()
bounce_sessions = df[df['outcome'] == 'bounce']['session_id'].unique()

# Convert sequences
convert_X, convert_lengths = [], []
for sid in convert_sessions:
    seq = df[df['session_id'] == sid].sort_values('turn')['state_id'].values
    convert_X.extend(seq)
    convert_lengths.append(len(seq))

convert_X = np.array(convert_X).reshape(-1, 1)
convert_lengths = np.array(convert_lengths)

# Train model
n_states = 3  # Hidden states (e.g., exploring, deciding, committed)
model = hmm.MultinomialHMM(n_components=n_states, n_iter=100)
model.fit(convert_X, convert_lengths)

# Predict hidden states
hidden_states = model.predict(convert_X, convert_lengths)


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


In [73]:
round(model.score(convert_X, convert_lengths),4)

0.0

In [77]:
convert_X.shape

(14, 1)

In [76]:
hidden_states.shape

(14,)

In [78]:
from hmmlearn import hmm
import numpy as np

# Split data by outcome
convert_sessions = df[df['outcome'] == 'convert']['session_id'].unique()
bounce_sessions = df[df['outcome'] == 'bounce']['session_id'].unique()

# Prepare convert data
convert_X, convert_lengths = [], []
for sid in convert_sessions:
    seq = df[df['session_id'] == sid].sort_values('turn')['state_id'].values
    convert_X.extend(seq)
    convert_lengths.append(len(seq))

convert_X = np.array(convert_X).reshape(-1, 1)
convert_lengths = np.array(convert_lengths)

# Same for bounce data
bounce_X, bounce_lengths = [], []
for sid in bounce_sessions:
    seq = df[df['session_id'] == sid].sort_values('turn')['state_id'].values
    bounce_X.extend(seq)
    bounce_lengths.append(len(seq))

bounce_X = np.array(bounce_X).reshape(-1, 1)
bounce_lengths = np.array(bounce_lengths)

# Train separate models
convert_model = hmm.MultinomialHMM(n_components=3)
convert_model.fit(convert_X, convert_lengths)

bounce_model = hmm.MultinomialHMM(n_components=2)  # Fewer states for bounce
bounce_model.fit(bounce_X, bounce_lengths)


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


0,1,2
,n_components,2
,n_trials,"array([2, 0, ..., 2, 2, 2, 0])"
,startprob_prior,1.0
,transmat_prior,1.0
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1B3BE74E040
,n_iter,10
,tol,0.01
,verbose,False
,params,'ste'


In [79]:
hidden_convert = convert_model.predict(convert_X, convert_lengths)
hidden_bounce = bounce_model.predict(bounce_X, bounce_lengths)

In [80]:
hidden_convert

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [81]:
hidden_bounce

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])