# Hands-on Lab Instructions

## Setting Up Python Environment

In [None]:
import nltk
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK data packages used in this lab: the punkt tokenizer data,
# the stopword corpus, and WordNet.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Load the spaCy pipeline once; later cells call it through the name `nlp`.
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Additional tokenizer data package (the cell output shows it unpacking to
# tokenizers/punkt_tab).
# NOTE(review): presumably needed by word_tokenize / sent_tokenize on newer
# NLTK releases -- confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Basic Text Processing of Sports Commentary

In [None]:
# Sample sports commentary text, reused by the processing cells below.
# The triple-quoted literal starts and ends with a line break, so the string
# itself begins and ends with a newline character.
commentary = """
Messi receives the ball at the halfway line. He dribbles past two defenders with incredible skill.
He shoots from outside the box... GOAL! What an amazing strike from the Argentine superstar!
That's his 20th goal of the season, putting Barcelona ahead 1-0 in this crucial match.
"""

In [None]:
# Basic analysis: show the raw commentary under a heading line
print("Original text:", commentary, sep="\n")

Original text:

Messi receives the ball at the halfway line. He dribbles past two defenders with incredible skill.
He shoots from outside the box... GOAL! What an amazing strike from the Argentine superstar!
That's his 20th goal of the season, putting Barcelona ahead 1-0 in this crucial match.



In [None]:
# Tokenization
# Sentence-level split of the raw commentary
sentences = sent_tokenize(commentary)
# Word-level split; punctuation marks come back as tokens of their own
# (see the '.' entries in the printed token list)
words = word_tokenize(commentary)

In [None]:
# Report the token and sentence counts plus a peek at the first tokens.
# Note: len(words) includes punctuation tokens such as '.', not only words.
print(
    f"\nNumber of words: {len(words)}",
    f"Number of sentences: {len(sentences)}",
    f"First 10 tokens: {words[:10]}",
    sep="\n",
)


Number of words: 54
Number of sentences: 5
First 10 tokens: ['Messi', 'receives', 'the', 'ball', 'at', 'the', 'halfway', 'line', '.', 'He']


In [None]:
# Stemming: map every token to its Porter stem
# (the recorded output shows stems lowercased, e.g. 'Messi' -> 'messi')
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))
print(f"\nStemmed words (first 10): {stemmed_words[:10]}")


Stemmed words (first 10): ['messi', 'receiv', 'the', 'ball', 'at', 'the', 'halfway', 'line', '.', 'he']


In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
# lemmatize() is called with the token only (no part-of-speech argument).
# In the recorded output, verbs such as 'receives' come back unchanged.
# NOTE(review): presumably because the lemmatizer treats every token as a
# noun by default -- pass a POS tag if verb lemmas are wanted; confirm in the
# NLTK documentation.
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print(f"\nLemmatized words (first 10): {lemmatized_words[:10]}")


Lemmatized words (first 10): ['Messi', 'receives', 'the', 'ball', 'at', 'the', 'halfway', 'line', '.', 'He']


In [None]:
# Remove stopwords
# Build the lookup set once; membership is tested on the lowercased token,
# while the token itself is kept in its original casing.
stop_words = set(stopwords.words('english'))
filtered_words = [
    token
    for token in words
    if token.lower() not in stop_words
]
# Punctuation tokens are not stopwords, so they survive this filter
print(f"\nWords after stopword removal (first 10): {filtered_words[:10]}")


Words after stopword removal (first 10): ['Messi', 'receives', 'ball', 'halfway', 'line', '.', 'dribbles', 'past', 'two', 'defenders']


In [None]:
# spaCy analysis: run the pipeline loaded as `nlp` over the raw commentary
doc = nlp(commentary)
print("\nNamed Entities:")
# One output line per entity span: the span text followed by its label string
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")


Named Entities:
Messi - PERSON
two - CARDINAL
Argentine - NORP
20th - ORDINAL
the season - DATE
Barcelona - ORG
1 - CARDINAL


## Exploratory Analysis of Sports Language Patterns

In [None]:
# Analyze word frequencies
from collections import Counter
import re

# Clean and normalize text
def clean_text(text):
    """Return ``text`` lowercased with all punctuation deleted.

    Only word characters (letters, digits, underscore) and whitespace are
    kept. Removed characters are dropped, not replaced by a space, so
    apostrophes and hyphens vanish too: "That's" becomes "thats" and
    "1-0" becomes "10".
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Normalise the commentary, split it on whitespace, and tally every word.
# Stopwords are NOT removed in this cell, which is why 'the' tops the list.
cleaned_text = clean_text(commentary)
cleaned_words = cleaned_text.split()
word_freq = Counter(cleaned_words)

# Display most common words
# (Counter.most_common lists words with equal counts in first-seen order)
print("\nMost common words:")
for word, count in word_freq.most_common(10):
    print(f"{word}: {count}")

# Compare with another sport's commentary
basketball_commentary = """
Curry with the ball at the top of the key. He crosses over his defender, steps back behind the arc.
Launches a deep three... BANG! Nothing but net! That's his 5th three-pointer tonight,
extending the Warriors' lead to 10 points with just 2 minutes remaining in the 4th quarter.
"""

# TODO (exercise): process basketball_commentary the same way as the soccer
# text above (clean_text -> split -> Counter), then compare the most common
# words of the two sports.


Most common words:
the: 5
he: 2
from: 2
goal: 2
messi: 1
receives: 1
ball: 1
at: 1
halfway: 1
line: 1


### Removing stopwords

In [None]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords (run once). The setup cell already requests this
# package, so here NLTK only reports that it is up to date.
nltk.download('stopwords')

# Define English stopwords as a set for fast membership tests
# (the same set is built in the earlier "Remove stopwords" cell)
stop_words = set(stopwords.words('english'))

# Clean and normalize text
def clean_text(text):
    """Return ``text`` lowercased with all punctuation deleted.

    Only word characters (letters, digits, underscore) and whitespace are
    kept. Removed characters are dropped, not replaced by a space, so
    "That's" becomes "thats" and "three-pointer" becomes "threepointer".
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Function to process text and return word frequencies (excluding stopwords)
def get_word_frequencies(text, stopword_set=None):
    """Count the non-stopword words in ``text``.

    Args:
        text: Raw text to analyse.
        stopword_set: Optional collection of lowercase stopwords. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        A ``collections.Counter`` mapping each remaining cleaned word (see
        ``clean_text``) to its number of occurrences.
    """
    if stopword_set is None:
        stopword_set = stop_words

    kept_words = []
    for raw_token in text.lower().split():
        # clean_text deletes apostrophes, so a contraction such as "don't"
        # would reach the filter as "dont" and never match the stopword list.
        # Test the token while it still has its apostrophe: trim only the
        # punctuation around it and fold a typographic apostrophe to ASCII.
        match_key = re.sub(r"^\W+|\W+$", "", raw_token).replace("\u2019", "'")
        if match_key in stopword_set:
            continue
        word = clean_text(raw_token)
        # Tokens made only of punctuation clean down to an empty string.
        if word and word not in stopword_set:
            kept_words.append(word)
    return Counter(kept_words)

# Compare with another sport's commentary
# (same text as the basketball_commentary assigned in the previous section)
basketball_commentary = """
Curry with the ball at the top of the key. He crosses over his defender, steps back behind the arc.
Launches a deep three... BANG! Nothing but net! That's his 5th three-pointer tonight,
extending the Warriors' lead to 10 points with just 2 minutes remaining in the 4th quarter.
"""

# Process basketball commentary
basketball_freq = get_word_frequencies(basketball_commentary)
print("\nMost common words (Basketball):")
# Every word in the recorded top ten has a count of 1, so the order printed
# is simply the order in which the words first appear in the text.
for word, count in basketball_freq.most_common(10):
    print(f"{word}: {count}")


Most common words (Basketball):
curry: 1
ball: 1
top: 1
key: 1
crosses: 1
defender: 1
steps: 1
back: 1
behind: 1
arc: 1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Mini-Project (Starter Code)

In [None]:
# Provide a function for calculating text statistics
def analyze_text_statistics(text, sport_name, nlp_pipeline=None):
    """Compute basic text statistics for one piece of commentary.

    Args:
        text: Raw commentary text.
        sport_name: Label stored under the "sport" key of the result.
        nlp_pipeline: Optional callable that turns ``text`` into a parsed
            document exposing tokens, ``sents`` and ``ents``. Defaults to the
            module-level spaCy pipeline ``nlp``.

    Returns:
        dict with the keys "sport", "word_count", "sentence_count",
        "avg_sentence_length", "lexical_diversity" (unique lowercased words
        divided by total words), "entity_count" and "entities" (a list of
        ``(text, label)`` tuples). Both ratios are 0 when their denominator
        is 0.
    """
    if nlp_pipeline is None:
        nlp_pipeline = nlp
    doc = nlp_pipeline(text)

    # Basic statistics
    # Count real words only: besides punctuation, skip whitespace tokens.
    # Line breaks in the input (the sample commentaries are multi-line
    # strings) are emitted as tokens of their own and would otherwise
    # inflate the word count and the vocabulary.
    word_tokens = [
        token for token in doc
        if not (token.is_punct or token.is_space)
    ]
    word_count = len(word_tokens)
    sentence_count = sum(1 for _ in doc.sents)
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    # Vocabulary richness (unique words / total words), case-insensitive
    unique_words = {token.text.lower() for token in word_tokens}
    lexical_diversity = len(unique_words) / word_count if word_count > 0 else 0

    # Entity analysis
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    entity_count = len(entities)

    return {
        "sport": sport_name,
        "word_count": word_count,
        "sentence_count": sentence_count,
        "avg_sentence_length": avg_sentence_length,
        "lexical_diversity": lexical_diversity,
        "entity_count": entity_count,
        "entities": entities
    }

# Students will analyze different sports and compare results