NOTE FOR HOW TO RUN NOTEBOOK: This notebook can be ran chronologically

In [1]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
docs = pd.read_csv('all_draft_classes.csv')
# stop_words = set(stopwords.words('english'))
docs['Strengths'] = 'STRENGTHS: ' + docs['Strengths'].astype(str)
docs['Weaknesses'] = 'WEAKNESSES: ' + docs['Weaknesses'].astype(str)

docs['composite'] = docs['Overview'] + docs['Strengths'] + docs['Weaknesses']
# docs['s_and_w'] = \
# docs['s_and_w'].map(lambda x: re.sub('[,\.!?]', '', x))


# Tokenize the sentence
#NOTE: No need to remove stopwords or preprocess the text as BERTopic is meant to handle that
overviews = []
for doc in docs['composite']:
    # tokens = word_tokens = word_tokenize(doc)
    # filtered_sentence = [w for w in doc.split()]
    overviews.append(" ".join(doc.split()))


In [4]:
# ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# vectorizer_model = CountVectorizer(stop_words="english")
representation_model = KeyBERTInspired()

topic_model = BERTopic(representation_model=representation_model, embedding_model="all-MiniLM-L6-v2")
topics, probs = topic_model.fit_transform(overviews)
#exploratory analysis: create doc2vec embedding of each overview and plot them

In [5]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,142,-1_quarterback_throw_receivers_throws,"[quarterback, throw, receivers, throws, throwi...",[Trubisky is a high-end quarterback prospect w...
1,0,22,0_quarterback_ability_strength_strengths,"[quarterback, ability, strength, strengths, th...","[Big, athletic, ascending quarterback with eno..."
2,1,15,1_quarterback_throw_throws_throwing,"[quarterback, throw, throws, throwing, defense...",[With five seasons of starting experience unde...


In [6]:
topic_model.get_topics()

{-1: [('quarterback', 0.54905397),
  ('throw', 0.42328927),
  ('receivers', 0.41598716),
  ('throws', 0.41570163),
  ('throwing', 0.40599707),
  ('yards', 0.39559528),
  ('passer', 0.3695256),
  ('offense', 0.3573051),
  ('ability', 0.3344883),
  ('strength', 0.3343683)],
 0: [('quarterback', 0.5289717),
  ('ability', 0.39342833),
  ('strength', 0.39264655),
  ('strengths', 0.38081974),
  ('throw', 0.3615546),
  ('throws', 0.3610834),
  ('plays', 0.352975),
  ('offense', 0.35003906),
  ('nfl', 0.34846914),
  ('passer', 0.33122322)],
 1: [('quarterback', 0.5392395),
  ('throw', 0.45356447),
  ('throws', 0.43516627),
  ('throwing', 0.4238921),
  ('defenses', 0.41600788),
  ('draft', 0.39189515),
  ('passer', 0.3641559),
  ('nfl', 0.3584649),
  ('talent', 0.33834875),
  ('ability', 0.32237396)]}

In [7]:
#representative documents for each topic
topic_model.get_representative_docs()

{-1: ['Trubisky is a high-end quarterback prospect who possesses NFL size, a big arm and the ability to throw with accuracy from the pocket or on the move. Despite playing in a spread-based offense, he\'s a full-field reader who does a very good job of getting an early read on the safeties before crafting his course of action. Trubisky will have to become much more pocket aware and do a better job of recognizing and attacking blitzes to back NFL defensive coordinators off. He hasn\'t put all the pieces together yet, but the puzzle is all right in front. Trubisky projects as a good starting quarterback with a high floor and the potential to be great.STRENGTHS: Adequate height with good muscular definition. Built to withstand physical challenges of the position. Calm field general. Very good pocket mobility. Can slide around circumference of the pocket without having to drop his eyes from their task. When he leaves the pocket, looks to challenge with his arm before defaulting to his feet

In [8]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
import nltk
top_n_words = 10
topic_keywords = []
for topic in topic_model.get_topics().values():
    words = [word for word, _ in topic[:top_n_words]]
    topic_keywords.append(words)

# Preprocess documents, this is simply tokenizing the documents since we do not want to preprocess for BERTopic
def preprocess(doc):
    return [word for word in nltk.word_tokenize(doc) ]

texts = [preprocess(doc) for doc in overviews]

# Create dictionary and coherence model
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
coherence_model = CoherenceModel(
    topics=topic_keywords,
    corpus=corpus,
    dictionary=dictionary,
    coherence='u_mass' 
)

In [9]:
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score:.4f}")

Coherence Score: -0.7521


In [10]:
docs['Label'] = topic_model.get_document_info(docs['composite'])['Topic']
docs.to_csv('draft_reports_labelled.csv')

In [11]:
classes = {}

for index, row in docs.iterrows():
    if row['Label'] not in classes:
        classes[row['Label']] = []
    classes[row['Label']].append(row['Name'])

In [12]:
df = pd.DataFrame({"topic": topics, "names": docs['Name'], "overview": docs['composite']})

In [13]:
#Inspecting topics, can change number to -1, 0, or 1
df[df['topic'] == 0]

Unnamed: 0,topic,names,overview
0,0,Cam Ward,"Gunslinger with good size, a big arm and the m..."
23,0,Hendon Hooker,Hooker’s age and ACL tear will be starting poi...
37,0,Trevor Lawrence,"Refined and polished for his age, Lawrence is ..."
58,0,Tommy Stevens,Stevens has outstanding size and speed for the...
96,0,Cody Kessler,Can orchestrate an offense with confidence and...
104,0,Brandon Doughty,There are several games where Doughty plays wi...
109,0,Bryce Petty,NFL evaluators felt like Petty's senior season...
111,0,Blake Bortles,"Big, athletic, ascending quarterback with enou..."
112,0,Johnny Manziel,"A very unique, run-around, ad-lib, sandlot-sty..."
113,0,Teddy Bridgewater,"A calculated, football-smart, precision-matchu..."


In [14]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import corpora
#NOTE used response from stack overflow article to learn how to write this function, this is noted in the works cited of my final report.
def calculate_coherence_score(topic_model, docs):

    # Extract vectorizer and tokenizer from BERTopic
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names_out()

    tokens = [tokenizer(doc) for doc in docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    # Create topic words
    topic_words = [[dictionary.token2id[w] for w in words if w in dictionary.token2id]
    for _ in range(-1, 2)]

    coherence_model = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

coherence_score = calculate_coherence_score(topic_model, docs['composite'])
print(f"Coherence Score: {coherence_score}")


Coherence Score: 0.6142926967732716


In [15]:

topic_model.visualize_topics(topics)

In [16]:
from sentence_transformers import SentenceTransformer
from umap import UMAP


In [17]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs['composite'])

# Run the visualization with the original embeddings
topic_model.visualize_documents(docs['Name'], embeddings=embeddings)


In [18]:
import numpy as np
#Find similarity between each document in the dataset
similarities = sentence_model.similarity(embeddings, embeddings)
indices = []
#For each document, it should have a similarity score of 1.0 with itself. Because of this
#we will want to choose the second most similar document to each document in the dataset.
for s in similarities:
    max_val = 0
    max_idx = 0
    second_max = 0
    second_idx = 0
    for i, doc in enumerate(s):
        if doc > max_val:
            second_max = max_val
            second_idx = max_idx
            max_val = doc
            max_idx = i
        elif doc > second_max:
            second_max = doc
            second_idx = i
    indices.append(second_idx)

print(indices)



[117, 57, 36, 50, 117, 116, 87, 54, 101, 17, 61, 48, 50, 127, 86, 50, 48, 48, 117, 87, 50, 48, 50, 117, 59, 56, 117, 168, 117, 86, 48, 50, 131, 6, 56, 27, 101, 25, 50, 48, 101, 99, 57, 50, 50, 125, 57, 87, 43, 50, 43, 159, 50, 57, 122, 120, 117, 122, 122, 57, 50, 99, 50, 50, 50, 26, 50, 117, 57, 56, 56, 46, 101, 102, 57, 84, 57, 63, 56, 80, 117, 50, 42, 152, 75, 152, 50, 117, 56, 101, 113, 50, 93, 92, 21, 105, 117, 26, 58, 41, 81, 86, 73, 157, 117, 95, 112, 120, 70, 117, 36, 122, 121, 90, 119, 119, 117, 122, 117, 117, 131, 122, 121, 122, 117, 50, 120, 57, 50, 50, 140, 117, 117, 177, 116, 133, 48, 57, 11, 46, 130, 98, 57, 21, 107, 51, 101, 63, 92, 118, 57, 57, 117, 103, 61, 119, 57, 131, 119, 51, 26, 56, 90, 117, 117, 152, 120, 122, 27, 24, 177, 117, 57, 127, 14, 50, 178, 117, 87]


In [19]:
print(len(indices))
for i, idx in enumerate(indices):
    print(docs.iloc[i]['Name'], 'is most similar to', docs.iloc[idx]['Name'])

179
Cam Ward is most similar to Tom Savage
Shedeur Sanders is most similar to Ben Dinucci
Jaxson Dart is most similar to Brock Purdy
Jalen Milroe is most similar to Jordan Love
Will Howard is most similar to Tom Savage
Kyle McCord is most similar to Logan Thomas
Tyler Shough is most similar to CJ Beathard
Quinn Ewers is most similar to Jacob Fromm
Dillon Gabriel is most similar to Nate Sudfeld
Riley Leonard is most similar to Jordan Travis
Caleb Williams is most similar to Daniel Jones
Jayden Daniels is most similar to Tua Tagovailoa
Drake Maye is most similar to Jordan Love
Michael Penix Jr. is most similar to Mike Glennon
JJ McCarthy is most similar to Davis Webb
Bo Nix is most similar to Jordan Love
Spencer Rattler is most similar to Tua Tagovailoa
Jordan Travis is most similar to Tua Tagovailoa
Joe Milton is most similar to Tom Savage
Bryce Young is most similar to CJ Beathard
CJ Stroud is most similar to Jordan Love
Anthony Richardson is most similar to Tua Tagovailoa
Will Levis i

In [20]:
topic_model.visualize_heatmap()