## 3 Markov Chain Model

In this section we'll explore a simple statistical language model to estimate the likelihood of a particular injury occurring based on a player's prior injured reserve history (or lack thereof).  Our data and subsequent model are based solely on words describing injuries over their careers and no other statistics associated with the player's career. Ideally, we'll be able to build more elaborate models in later stages using more advanced features with more predictive power than words and can be predictive to the player vs predictive of the population, but we'll start with this simple approach.

In [59]:
from nltk import ngrams
import pandas as pd
import json
import requests
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends

In [3]:
career = pd.read_csv('data/all_pos_ir_career.csv')
df_long = pd.read_csv('data/df_long.csv',sep='|')

In [60]:
career.head()

Unnamed: 0,player,position,1,2,3,4,5,6,7,8,9
0,A'Shawn Robinson,DT,Healthy,Healthy,Knee,Healthy,Healthy,Healthy,,,
1,A.J. Brown,WR,Healthy,Healthy,Chest,,,,,,
2,A.J. Cann,OG,Healthy,Healthy,Healthy,Healthy,Healthy,Healthy,Knee,,
3,A.J. Derby,TE,Undisclosed,Healthy,Shoulder,Foot,Healthy,Healthy,Healthy,,
4,A.J. Epenesa,DE,Healthy,Healthy,,,,,,,


In [4]:
df_long

Unnamed: 0,player,position,variable,value
0,A.J. Brown,WR,1,Healthy
1,A.J. Derby,TE,1,Undisclosed
2,A.J. McCarron,QB,1,Healthy
3,AJ Dillon,RB,1,Healthy
4,Aaron Burbridge,WR,1,Healthy
...,...,...,...,...
6529,Zach Gentry,TE,9,
6530,Zach Mettenberger,QB,9,
6531,Zach Wilson,QB,9,
6532,Zack Moss,RB,9,


Let's build a vocabulary based on the unique list of injuries

In [5]:
from nltk.lm.preprocessing import flatten

vocab = list(flatten(pad_both_ends(injury, n=2) for injury in career.iloc[:,2:].values))
vocab = [word for word in vocab if pd.isnull(word) == False]

Here, our "vocabulary" of unique IR-related injuries is 39 words/phrases.  If we attempt to build a language model with an N of 4 (since our player's expected career length is 4 as we saw in the eda section), that means that we could have 2,313,441 possible combinations of healthy/injury sequences across a 4 year career. Obviously, with our limited data set, we don't have all combinations of injuries but we should be able to predict more frequent IR sequences.

In [6]:

def getFrequentItemsets(df, threshold=.01, k=2):
    
    ''' Takes a binarized dataset and provides an analysis of support based on a specificed threshold 
    Parameters:
    -----------
        df - a binarized dataframe
        threshold - float (default = .01) The minimum threshold for support. itemset threshold 
        underneath this value will be filtered out
        k - 'int' (default=2) the size of the itemset to filter on
        
    Returns:
    --------
        a dataframe containing a itemsets and their relative support from highest to lowest
    
    '''
    
    itmst = apriori(df, min_support=threshold, use_colnames=True)
    itmst.sort_values("support", ascending=False, inplace=True)
    itmst = itmst[itmst['itemsets'].map(len)==k]
    return itmst



def getTransitionProbabilities(df,init,init_num,trans_num,threshold=.0001):
    mlb = MultiLabelBinarizer()
    init_df = pd.DataFrame(mlb.fit_transform(df[df[str(init_num)]==init][[str(trans_num)]].fillna('').values),
                             columns = mlb.classes_)
    itmst = apriori(init_df, min_support=threshold, use_colnames=True).sort_values('support',ascending=False)
    return itmst
    
    
    # init_df = df[df[str(init_num)]==init]
    # total = len(init_df)
    # return len(init_df[init_df[str(trans_num)]==trans])/total


In [7]:
mlb = MultiLabelBinarizer()
df = pd.DataFrame(mlb.fit_transform(career[['1']].fillna('').values),
                             columns = mlb.classes_)

itms = getFrequentItemsets(df,threshold=.001, k=1)



In [8]:
print("initial probability")
itms

initial probability distributions


Unnamed: 0,support,itemsets
11,0.713725,(Healthy)
0,0.109804,()
13,0.050109,(Knee)
2,0.020915,(Ankle)
18,0.015251,(Shoulder)
7,0.012636,(Foot)
9,0.0122,(Hamstring)
21,0.009586,(Undisclosed)
6,0.005664,(Designated for Return)
5,0.005229,(Concussion)


In [9]:
#Now we assign transition probabilities to each state

itmst = getTransitionProbabilities(career,'Healthy',2,3)
itmst



Unnamed: 0,support,itemsets
20,0.708307,(Healthy)
0,0.116802,()
25,0.035603,(Knee)
3,0.018738,(Ankle)
36,0.018114,(Undisclosed)
17,0.01624,(Hamstring)
31,0.012492,(Shoulder)
14,0.011868,(Foot)
2,0.006246,(Achilles)
11,0.004372,(Elbow)


In [10]:
career[career['3'].isna()==True]

Unnamed: 0,player,position,1,2,3,4,5,6,7,8,9
4,A.J. Epenesa,DE,Healthy,Healthy,,,,,,,
7,A.J. Terrell,DB,Healthy,Healthy,,,,,,,
8,AJ Dillon,RB,Healthy,Healthy,,,,,,,
9,Aaron Banks,OG,Healthy,,,,,,,,
19,Aaron Robinson,DB,Core Muscle,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2283,Zach Wilson,QB,Healthy,,,,,,,,
2284,Zack Baun,LB,Healthy,Healthy,,,,,,,
2286,Zack Moss,RB,Ankle,,,,,,,,
2290,Zaven Collins,LB,Healthy,,,,,,,,


In [11]:
mlb = MultiLabelBinarizer()
init_df = pd.DataFrame(mlb.fit_transform(career[(career['1']=='Healthy') &
                                               (career['2']=='Healthy')][["3"]].fillna('').values),
                         columns = mlb.classes_)
itmst = apriori(init_df, min_support=.0001, use_colnames=True).sort_values('support',ascending=False)
itmst



Unnamed: 0,support,itemsets
17,0.713796,(Healthy)
0,0.125964,()
20,0.032562,(Knee)
3,0.023993,(Ankle)
27,0.016281,(Undisclosed)
14,0.015424,(Hamstring)
11,0.01371,(Foot)
25,0.009426,(Shoulder)
2,0.005141,(Achilles)
10,0.005141,(Elbow)


In [12]:
# ['Healthy', 'Healthy', 'Foot', 'Healthy']

anks = career[(career['1']=='Healthy') & 
       (career['2']=='Ankle') &
       (career['3']=='Healthy') &
       (career['4']=='Healthy') &
       (career['5']=='Healthy')]

        


In [13]:
## Using NLTK's language models

In [14]:
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

Using nltk's library, we can incorporate beginning and ending tags in a player's career (pad_both_ends).  We're mostly concerned about the start of the player's career for this model as we care to predict injuries during the player's career (not so much predicting the end of the player's career.

In [54]:
my_lst = [[x for x in row if pd.isnull(x)==False] for row in career.iloc[:,2:].values]
my_lst
my_new_lst = [list(pad_both_ends(seq,2)) for seq in my_lst]

In [55]:
# Visual inspection of the first 5 career sequences after pre-processing
my_new_lst[:5]

[['<s>',
  'Healthy',
  'Healthy',
  'Knee',
  'Healthy',
  'Healthy',
  'Healthy',
  '</s>'],
 ['<s>', 'Healthy', 'Healthy', 'Chest', '</s>'],
 ['<s>',
  'Healthy',
  'Healthy',
  'Healthy',
  'Healthy',
  'Healthy',
  'Healthy',
  'Knee',
  '</s>'],
 ['<s>',
  'Undisclosed',
  'Healthy',
  'Shoulder',
  'Foot',
  'Healthy',
  'Healthy',
  'Healthy',
  '</s>'],
 ['<s>', 'Healthy', 'Healthy', '</s>']]

In [18]:
career[career['player'].str.contains('Christian McCaffrey')]


Unnamed: 0,player,position,1,2,3,4,5,6,7,8,9
381,Christian McCaffrey,RB,Healthy,Healthy,Healthy,Ankle,Hamstring,,,,


In [19]:
# Use everygrams to get larger n-grams
my_quad_lst = [list(everygrams(seq,max_len=3)) for seq in my_new_lst]

In [20]:
import numpy as np

# my_every_array  = np.array(my_every_lst, dtype='object')
# train, dev, test = np.split(my_tri_lst,[int(.8*len(my_tri_lst)), int(.9*len(my_tri_lst))])

For our simple model, we'll use ntlk's Maximum Likelihood Estimator which has a variety of features to help estimate both raw probabilities and words (in this case injuries). We can also use the 39 injury types to build a vocabulary which helps train the statistical model

In [21]:
from nltk.lm import MLE
from nltk.lm import Vocabulary

In [22]:
inj_vocab = Vocabulary(vocab, unk_cutoff=2)

In [23]:
lm = MLE(4)

In [24]:
lm.fit(my_quad_lst, inj_vocab)

In [25]:
lm.vocab.lookup(["Ankle", "Swim", "Mars"])

('Ankle', '<UNK>', '<UNK>')

In [26]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 53436 ngrams>


We can use the counts property to count the number of n-grams for a single injury, or we can pass a sequence of injuries (representing a career of x length) followed by an interested injury given the sequence to see the number of n-grams with that sequence.  We could convert it to a probability by dividing it by the total number of players over the passed sequence.

In [27]:
lm.counts['Ankle']

203

In [28]:
lm.score('Ankle',['<s>','Healthy','Knee'])

0

In [29]:

print(lm.counts[['<s>','Healthy']]['Healthy']  /  lm.counts[['<s>']]['Healthy'])

0.7263329706202394


In [30]:
lm.counts[['<s>']]['Healthy']

1838

In [34]:
lm.score('Knee', ['<s>','Healthy','Healthy','Ankle','Hamstring'])

0

In [35]:
lm.score('Healthy', ['<s>','Healthy'])

0.7263329706202394

In [36]:
import ast

In [37]:
pickle.dump(lm, open('model/ir_model', 'wb'))

In [38]:
lm.counts[['Healthy', 'Healthy', 'Healthy']]

FreqDist({})

In [56]:
def IRPredictions(injury_history=None, future_injury=None, player=None, num_seasons_back=None):
    
    
    
    ''' Takes a sequence of  IR injuries or a player's name and finds the Maximum Likelihood of
         a user-provided future injury OR if one is not provided, the top-5 likeliest injuries for that sequence.
        
    Parameters:
    -----------
        injury_history : (list) <no default> an ordered sequence representing a player's career where each item 
        indicates either "Healthy" (ie. player wasn't on IR) or the name of an Injury causing the player to land 
        on Injured Reserve. The ordering of the injuries must correspond to seasons the player played.
        
        Example:  Player "A" was healthy two seasons before having an ankle injury: ['Heathy','Healthy','Ankle']
        Example:  Player "B" was injured with a knee injury to start their career, had a Healthy season, and was
        injured again with an ankle injury:  ['Knee','Healthy','Ankle']
        
        future_injury :  str default=None A string representing the name of an injury to be predicted on. If none 
        is provided, a dataframe of the top 5 likeliest injuries is provided
        
        player : str  default=None NFL Player's name. This will determine the top-5 likeliest injuries (and their associated
        probabilities they will incur in the next season
        
        num_seasons_back : int <default None> the season # to go back assuming the current season is yet to be played). 
        For Player "A" with a sequence of ['Healthy, 'Healthy', 'Ankle', 'Healthy', 'Knee'] specifying "3" 
        would start the model with ["Ankle","Healthy","Knee"] ignoring the first two Healthy seasons.
        
        
    Returns:
    --------
        a float representing the probability the specified future injury will occur provided 
        either an injury sequence or player was given OR a dataframe containing the top-5 likeliest injuries
        and their assocated probabilities of occurring
    '''
    
    if  player is None and  (injury_history is None or len(injury_history) == 0):
        return """Please provide a valid injury history as a python list. Example: ['Healthy','Healthy','Ankle']"""
    
    
    if player is not None:
        try:
            idx = career[career['player'].str.contains(player)].index[0]
            srch_reslts = my_new_lst[idx]
            # srch_reslts = career[career['player'].str.contains(player)].iloc[0,2:]
        except:
            print("The name provided returned an error. Please re-enter the name")
            return _
    
        if len(srch_reslts) > 0:
            injury_history = [injury for injury in srch_reslts if pd.isnull(injury)==False]
            print('injury history:', injury_history)
            total_seasons = len(injury_history)
            injury_history = list(pad_both_ends(injury_history,2))
            if num_seasons_back is None:
                if total_seasons >= 5:
                    num_seasons_back = 5
                elif total_seasons == 1:
                    num_seasons_back = 2
                else:
                    num_seasons_back = total_seasons - 1
            
            begin_season  = (total_seasons - num_seasons_back) + 1
            print('begin season',begin_season, num_seasons_back,total_seasons)
            tmp_history = injury_history[begin_season - 1:]
            
            if "</s>" in tmp_history:
                idx = injury_history.index('</s>')
                begin_season = (total_seasons - num_seasons_back) + 1
                injury_history = injury_history[begin_season:idx]
            else:
                injury_history = tmp_history
            print(f"player's injury history: {injury_history}")
            freq = lm.counts[injury_history]
            scores = [(injury[0], lm.score(injury[0], injury_history), injury[1]) for injury in freq.most_common(10) if injury[0] != '</s>']
            df = pd.DataFrame(scores, columns = ['Injury','Likelihood','Freq'])
            num_total = sum(freq.values())
            return df, num_total, injury_history
    
    elif len(injury_history) > 0:
        injury_history = ast.literal_eval(injury_history)
        if future_injury is not None:
            print('identified future injury')
            score = lm.score(future_injury, injury_history)
            return score, 8, injury_history
        else:
            print(f'processing injury history as {injury_history}')
            freq = lm.counts[injury_history]
            # grab top 6 as one of the items will be the ending sequence tag '</s>'
            scores = [(injury[0], lm.score(injury[0], injury_history), injury[1]) for injury in freq.most_common(10) if injury[0] != '</s>']
            df = pd.DataFrame(scores, columns = ['Injury','Likelihood','Freq'])
            num_total = sum(freq.values())
            return df, num_total, injury_history

    
injury_history = []
traffic_light = True   
# while traffic_light:
user_input = input("Please provide a user's name, or a sequence of injuries or type 'quit' to quit:")
if '[' in user_input:
    num_seasons_back = 2
    injury_history = user_input
    output,num_total,injury_history = IRPredictions(injury_history,None, None, num_seasons_back)
elif len(user_input) > 4:
    player = user_input
    output,num_total,injury_history = IRPredictions(None,None, player, 3)
    print(output)
elif user_input == 'quit':
    traffic_light = False

            
    
    print(injury_history, player)
    # output,num_total,injury_history = IRPredictions(injury_history, player, num_seasons_back=3)
    print(f"Total Number of Players Matching the injury sequence of {injury_history} :  {num_total}")
    injury_history = []
    player = ""
print("This concludes our session") 

Please provide a user's name, or a sequence of injuries or type 'quit' to quit: Christian McCaffrey


injury history: ['<s>', 'Healthy', 'Healthy', 'Healthy', 'Ankle', 'Hamstring', '</s>']
begin season 5 3 7
player's injury history: ['Ankle', 'Hamstring']
     Injury  Likelihood  Freq
0   Healthy         0.4     2
1  Shoulder         0.2     1
This concludes our session


(2295, 11)

In [None]:
# Leonard Fournette
three_health = career[(career['1']=='Healthy') &
       (career['2']=='Foot') &
       (career['3']=='Healthy') 
       # (career['4']!='Healthy')
        ]


three_health.shape

# kam = [injury for injury in kam if pd.isnull(injury) == False ]

In [None]:
 IRPredictions(None,None, 'Saquon Barkley',3)

In [None]:
career[career['player'].str.contains('Ekeler')]

In [None]:
lst = ['<s>', 'Healthy', 'Knee', 'Healthy', 'Healthy', 'Healthy', '</s>']

In [None]:
lst.index('</s>')

In [None]:
career[career['player'].str.contains('Marshall')]

In [None]:
from nltk.tag import hmm

In [None]:
test = range(5)

In [None]:
def CreateSequence(career_seq):

    if career_seq == "Healthy":
        s="No_IR"
    elif career_seq == '</s>':
        s='Retired'
    else:
        s = 'IR'
    return career_seq, s

In [None]:
s = """"Your humble writer knows a little bit about a lot of things, but despite writing a fair amount about text processing (a book, for example), linguistic processing is a relatively novel area for me. Forgive me if I stumble through my explanations of the quite remarkable Natural Language Toolkit (NLTK), a wonderful tool for teaching, and working in, computational linguistics using Python. Computational linguistics, moreover, is closely related to the fields of artificial intelligence, language/speech recognition, translation, and grammar checking.\nWhat NLTK includes\nIt is natural to think of NLTK as a stacked series of layers that build on each other. Readers familiar with lexing and parsing of artificial languages (like, say, Python) will not have too much of a leap to understand the similar -- but deeper -- layers involved in natural language modeling.\nGlossary of terms\nCorpora: Collections of related texts. For example, the works of Shakespeare might, collectively, by called a corpus; the works of several authors, corpora.\nHistogram: The statistic distribution of the frequency of different words, letters, or other items within a data set.\nSyntagmatic: The study of syntagma; namely, the statistical relations in the contiguous occurrence of letters, words, or phrases in corpora.\nContext-free grammar: Type-2 in Noam Chomsky's hierarchy of the four types of formal grammars. See Resources for a thorough description.\nWhile NLTK comes with a number of corpora that have been pre-processed (often manually) to various degrees, conceptually each layer relies on the processing in the adjacent lower layer. Tokenization comes first; then words are tagged; then groups of words are parsed into grammatical elements, like noun phrases or sentences (according to one of several techniques, each with advantages and drawbacks); and finally sentences or other grammatical units can be classified. Along the way, NLTK gives you the ability to generate statistics about occurrences of various elements, and draw graphs that represent either the processing itself, or statistical aggregates in results.\nIn this article, you'll see some relatively fleshed-out examples from the lower-level capabilities, but most of the higher-level capabilities will be simply described abstractly. Let's now take the first steps past text processing, narrowly construed. """
sentences = s.split('.')[:-1]
seq = [map(lambda x:(x,''), ss.split(' ')) for ss in sentences]

In [None]:
symbols = list(set(vocab))
states=['No_IR', 'IR', 'Retired']
trainer = hmm.HiddenMarkovModelTrainer(states=states,symbols=symbols)


In [None]:
import hmmlearn

In [None]:
lst_obj = []
for word in symbols:
    score = m.probability([('Healthy',''),('Healthy',''),('Knee',''),('Healthy',''),(word,'')])
    obj = {}
    obj['injury'] = word
    obj['score'] = score   
    lst_obj.append(obj)
pd.DataFrame(lst_obj).sort_values('score',ascending=False)


In [None]:
lst_obj = []
for word in symbols:
    score = m.log_probability([])
    obj = {}
    obj['injury'] = word
    obj['score'] = score
    lst_obj.append(obj)
    
df = pd.DataFrame(lst_obj)
df.sort_values('score', ascending=False)  


In [None]:
df.iloc[0]['score']

In [None]:
import math
seq = []
rand_samp = career.sample(frac=1,random_state=0)
train,test = np.split(rand_samp, [int(.8 * len(rand_samp))])
train.shape, test.shape

my_train = train[[str(x) for x in range(1,9)]]
for row in my_train.iterrows():
    lst_row = list([x for x in list(row[1].values) if pd.isnull(x)==False])
    seq.append(map(CreateSequence, lst_row))

my_seq = []
for x in seq:
    my_seq.append(list(x))

seq_test = []
seq = []
my_test = test[[str(x) for x in range(1,9)]]
for row in my_test.iterrows():
    lst_row = list([x for x in list(row[1].values) if pd.isnull(x)==False])
    seq.append(map(CreateSequence, lst_row))
for x in seq:
    my_test.append(list(x))


In [None]:
num_seasons_back
# my_seq[:30]

In [None]:
test = map(CreateSequence,['Healthy','Healthy','Ankle'])

In [None]:
for i in test:
    print(tuple(i))

In [None]:
url = 'http://nddleague.com:999/Knee'
data = json.dumps({"ir_history": ['Healthy','Healthy']})



resp = requests.get(url,data=data)

In [None]:
requests.

In [None]:
career[(career['1'] != 'Healthy') & 
       (career['2'] != 'Healthy')][['1','2']]