# Fantasy Football Topic Modeling using LDA
By Daniel Jimenez

In [23]:
import pandas as pd
import pprint

In [24]:
def first_pass(text):
    text = str(text)
    #remove newline character code
    text = text.replace('\n',' ')
    #remove the ampersand character code
    text = text.replace('&amp',' ')
    #character code for a Zero-width space
    text = text.replace('&#x200B;',' ')
    #non-breaking space
    text = text.replace('nbsp',' ')

    return text

In [25]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stopwords_english = stopwords.words('english')

import string

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def second_pass(text):
    #make string lowercase
    text = str(text)
    text = text.lower()

    #remove https links
    text = re.sub(r"[(+*)]\S*https?:\S*[(+*)]", "", text)
    #remove https links with no brackets
    text = re.sub(r"http\S+", "", text)
    #remove link markups []
    #note that this will also remove comment fields with ["Delete"] 
    text = re.sub(r"[\(\[].*?[\)\]]", " ", text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove puncuation
    text = re.sub(r'[^\w\s]', '', text)

    #tokenize
    #create bag of words
    tokens = nltk.word_tokenize(text)

    clean_text = []

    #remove stopwords, puncuation, and words of length 3 or smaller
    for token in tokens:
        if (token not in stopwords_english and token not in string.punctuation and len(token) > 3):
                clean_text.append(token)

    return clean_text

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dannyjimenez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dannyjimenez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dannyjimenez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

import os
import re, nltk, spacy

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        exc_list = ['week', 'season', 'year', 'game', 'fantasy', 'league', 'weekend', 'sunday'] # words to exclude
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] and token.lemma_ not in exc_list 
                                   else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [27]:
# create document-term matrix
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum occurences of a word 
                             ngram_range=(1,2)                 # extract unigrams, bigrams, and trigrams
                            )

In [28]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=10,           # Number of topics
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      n_jobs = -1,               # Use all available CPUs
                                     )

In [29]:
def gridsearch(data_vectorized):
    # Define Search Param
    search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
    
    return best_lda_model

In [30]:
import pyLDAvis
import pyLDAvis.lda_model

def get_lda_vis(best_lda_model):
    pyLDAvis.enable_notebook()
    panel = pyLDAvis.lda_model.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
    return panel

## <font color='green'>Overperformers</font>

### <font color='green'>Quarterbacks</font>

In [31]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='QB Overperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -60829.57739711219
Model Perplexity:  355.52489785482993


### <font color='green'>Running Backs</font>

In [32]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='RB Overperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -48576.06819107059
Model Perplexity:  359.95339747679344


### <font color='green'>Wide Receivers</font>

In [33]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='WR Overperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -67357.13338450507
Model Perplexity:  447.2494215878746


### <font color='green'>Tight Ends</font>

In [34]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='TE Overperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -43674.50216580134
Model Perplexity:  337.0819168226845


## <font color='red'>Underperformers</font>

### <font color='red'>Quarterbacks</font>

In [35]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='QB Underperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -80684.61812367346
Model Perplexity:  374.12769567065044


### <font color='red'>Running Backs</font>

In [36]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='RB Underperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -54468.317458885234
Model Perplexity:  384.7920583006863


### <font color='red'>Wide Receivers</font>

In [37]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='WR Underperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -35661.13346031407
Model Perplexity:  166.54170432231493


### <font color='red'>Tight Ends</font>

In [38]:
df = pd.read_excel('player_reddit_raw.xlsx', sheet_name='TE Underperformers')
df['clean_text'] = df['body'].apply(first_pass)
df['clean_text'] = df['clean_text'].apply(second_pass)

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(df.clean_text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_vectorized = vectorizer.fit_transform(data_lemmatized)
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

# Grid Search
best_lda_model = gridsearch(data_vectorized)

# Topic Model Visualization
get_lda_vis(best_lda_model)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)
Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -30915.57040678447
Model Perplexity:  257.28316142639414


## Code Sources

The code in this notebook contains snippets from the following sources:
- [Tawfiq Ammari](https://github.com/tawfiqam/MI564/blob/main/LDA_Intro.ipynb)
- [Selva Prabhakaran](https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/)