NOTE: Notebook can be ran chronologically

In [1]:
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from nltk.stem import WordNetLemmatizer


In [2]:
from nltk.corpus import stopwords
import pandas as pd

In [3]:
df = pd.read_csv('all_draft_classes.csv')

In [4]:
import re
df['Strengths'] = \
df['Strengths'].map(lambda x: re.sub('[,\.!?]', '', x))

df['Weaknesses'] = \
df['Weaknesses'].map(lambda x: re.sub('[,\.!?]', '', x))


df['Overview'] = \
df['Overview'].map(lambda x: re.sub('[,\.!?]', '', x))

df['Strengths'] = 'STRENGTHS: ' + df['Strengths'].astype(str)
df['Weaknesses'] = 'WEAKNESSES: ' + df['Weaknesses'].astype(str)
df.head()

Unnamed: 0,Year,Name,Overview,Strengths,Weaknesses,Label
0,2025,Cam Ward,Gunslinger with good size a big arm and the mo...,STRENGTHS: Recognizes pre-snap pressure and ca...,WEAKNESSES: Too willing to work out of structu...,
1,2025,Shedeur Sanders,Any perceptions that Sanders is a product of H...,STRENGTHS: Plays with confidence and composure...,WEAKNESSES: Spacing and clearly defined route ...,
2,2025,Jaxson Dart,Three-year SEC starter who saw improvement in ...,STRENGTHS: Gets across the full field of progr...,WEAKNESSES: Deep zone coverages slowed his mom...,
3,2025,Jalen Milroe,Milroe is an explosive athlete who is very cap...,STRENGTHS: Unflinching when he delivers throws...,WEAKNESSES: Threw five touchdowns and 10 inter...,
4,2025,Will Howard,Howard brings outstanding size and toughness t...,STRENGTHS: Outstanding size and toughness insi...,WEAKNESSES: Very gradual in his setup and rele...,


In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()    
df['composite'] = df['Overview'] + df['Strengths'] + df['Weaknesses']
# Tokenize the sentence
overviews = []
for doc in df['composite']:
    # tokens = word_tokens = word_tokenize(doc)
    filtered_sentence = [w for w in doc.split() if not w in stop_words]
    filtered_sentence = [lemmatizer.lemmatize(word) for word in filtered_sentence]
    overviews.append(" ".join(filtered_sentence).lower())


In [6]:
print(overviews)

['gunslinger good size big arm mobility help offensive line ward read full field operates average decision-making processing quickness like shortstop rip sidearm rocket fit tight window three level delivery mechanic cause inconsistency placement accuracy he fairly consistent regardless coverage scheme see figuring disguised coverage pro level take time given develop skill he look strike rich aggressive vertical throws; better efficiency need learn mine gold combo read rhythm throw while ability move stick leg he’s pocket passer dual-threat quarterback pocket mobility help extend make play structure longer he’s off-schedule spottier decision-making get with patient plan nurturing offensive coordinator accentuate physical tool regulating feast-or-famine element play ward could become good nfl starter inside first contractstrengths: recognizes pre-snap pressure protect displays recognition attacking hole coverage sudden release help thrive quick game seam throw can whip throw tight window

In [7]:
# Preprocess the data
import math
texts = [[word for word in document.split()] for document in overviews]
# Create a dictionary and corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
optimal_num_topics = 0
best_coherence = math.inf
# Train the LDA model
for i in range(1, 11):
    lda_model = gensim.models.LdaModel(corpus, num_topics=i, id2word=dictionary, passes=15, iterations=200)

    # Compute the coherence score
    cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
    coherence_score = cm.get_coherence()  # get coherence value

    print(f'Coherence Score: {coherence_score:.4f}')
    if abs(coherence_score) < abs(best_coherence):
        optimal_num_topics = i
        best_coherence = coherence_score

Coherence Score: -0.4479
Coherence Score: -0.5529
Coherence Score: -2.1527
Coherence Score: -1.0792
Coherence Score: -0.4925
Coherence Score: -0.5880
Coherence Score: -1.3210
Coherence Score: -1.2525
Coherence Score: -1.8403
Coherence Score: -1.4430


In [8]:
lda_model = gensim.models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15, iterations=200, random_state=42)
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
coherence_score = cm.get_coherence()  # get coherence value
print(f'Coherence Score: {coherence_score:.4f}')

Coherence Score: -0.6719


In [9]:
lda_model.top_topics(corpus=corpus, texts=texts, dictionary=dictionary)


[([(0.022677714, 'throw'),
   (0.01483573, 'pocket'),
   (0.012148997, 'ball'),
   (0.010723803, 'arm'),
   (0.0086666895, 'play'),
   (0.008118441, 'make'),
   (0.007905059, 'accuracy'),
   (0.007428679, 'good'),
   (0.0070468537, 'time'),
   (0.007037044, 'he'),
   (0.006701414, 'deep'),
   (0.005743849, 'nfl'),
   (0.0057012173, 'has'),
   (0.00567671, 'quarterback'),
   (0.0053974213, 'field'),
   (0.0051668924, 'ability'),
   (0.0050873957, 'release'),
   (0.0050509633, 'talent'),
   (0.004959794, 'game'),
   (0.004806378, 'pressure')],
  -0.46356675619565807),
 ([(0.015505491, 'throw'),
   (0.012182401, 'ball'),
   (0.010556213, 'pocket'),
   (0.0079196645, 'arm'),
   (0.00680062, 'accuracy'),
   (0.0063299285, 'good'),
   (0.006089696, 'foot'),
   (0.005830859, 'make'),
   (0.0056900172, 'he'),
   (0.005612876, 'get'),
   (0.0055645998, 'field'),
   (0.0055338736, 'yard'),
   (0.005474808, 'play'),
   (0.0052334433, 'game'),
   (0.004980964, 'has'),
   (0.0049059354, 'time'),
  

In [10]:
import numpy as np
classes_dict = {}
for i, topic in enumerate(lda_model.get_document_topics(corpus)):
    print(df.iloc[i]["Name"], topic)
    max_prob = 0
    topic_num = 0
    for t in topic:
        if t[1] > max_prob:
            max_prob = t[1]
            topic_num = t[0]
    print(f"Topic {topic_num} with probability {max_prob:.4f}")
    classes_dict[df.iloc[i]["Name"]] = topic_num

Cam Ward [(2, 0.9970429)]
Topic 2 with probability 0.9970
Shedeur Sanders [(2, 0.99676186)]
Topic 2 with probability 0.9968
Jaxson Dart [(0, 0.058041524), (2, 0.9389261)]
Topic 2 with probability 0.9389
Jalen Milroe [(2, 0.99629)]
Topic 2 with probability 0.9963
Will Howard [(2, 0.99558944)]
Topic 2 with probability 0.9956
Kyle McCord [(2, 0.99575865)]
Topic 2 with probability 0.9958
Tyler Shough [(2, 0.995324)]
Topic 2 with probability 0.9953
Quinn Ewers [(2, 0.9962005)]
Topic 2 with probability 0.9962
Dillon Gabriel [(1, 0.10963606), (2, 0.88752866)]
Topic 2 with probability 0.8875
Riley Leonard [(2, 0.99558324)]
Topic 2 with probability 0.9956
Caleb Williams [(2, 0.99638)]
Topic 2 with probability 0.9964
Jayden Daniels [(2, 0.995925)]
Topic 2 with probability 0.9959
Drake Maye [(0, 0.61177903), (2, 0.3855232)]
Topic 0 with probability 0.6118
Michael Penix Jr. [(0, 0.6405664), (2, 0.35676563)]
Topic 0 with probability 0.6406
JJ McCarthy [(0, 0.05891147), (2, 0.9383318)]
Topic 2 with 

In [11]:
#inspect each topic
classes_df = pd.DataFrame.from_dict(classes_dict, orient='index', columns=['Topic'])

classes_df[classes_df['Topic'] == 0]

Unnamed: 0,Topic
Drake Maye,0
Michael Penix Jr.,0
Jordan Travis,0
CJ Stroud,0
Kyle Trask,0
Nate Stanley,0
Dwayne Haskins,0
Will Grier,0
Ryan Finley,0
Josh Rosen,0


In [12]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os
# Visualize the topics
pyLDAvis.enable_notebook()
num_topics = 4
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
LDAvis_prepared = gensimvis.prepare(lda_model, corpus, dictionary)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')

LDAvis_prepared