In [10]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the required NLTK resources (uncomment and run once if they are not installed yet)
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')


In [2]:
# Load the review text from disk.
# The encoding is pinned so the read does not depend on the platform's default
# codec; the recorded output contains a typographic apostrophe (U+2019), so the
# file holds non-ASCII text. Assumes the file is UTF-8 encoded -- confirm if a
# UnicodeDecodeError is raised here.
with open('movie_review.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# NLTK processing
# Tokenization: split the raw text into word and punctuation tokens.
tokens_nltk = word_tokenize(text)

# Stemming: reduce every token to its Porter stem (one stem per token, same order).
stemmer = PorterStemmer()
stems_nltk = [stemmer.stem(token) for token in tokens_nltk]

# Lemmatization: one lemma per token, same order as tokens_nltk.
# NOTE: lemmatize() is called without a part-of-speech argument, so per the NLTK
# documentation every token is looked up as a noun; verb forms such as 'is' or
# 'were' therefore come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmas_nltk = [lemmatizer.lemmatize(token) for token in tokens_nltk]

In [3]:
#!python -m spacy download en_core_web_sm

In [4]:
# SpaCy processing
import spacy

# Load the small English pipeline and run it over the review text once;
# both lists below are derived from the resulting `doc`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Tokenization: the surface text of every token in the parsed document.
tokens_spacy = [
    spacy_token.text
    for spacy_token in doc
]

# Stemming is not directly supported in SpaCy, so there is no stem list here.

# Lemmatization: the lemma of every token, in the same order as tokens_spacy.
lemmas_spacy = [
    spacy_token.lemma_
    for spacy_token in doc
]

In [5]:
# Show the first 50 tokens produced by each library for comparison.
# `end` carries the line break plus the two blank lines that separate the samples.
print("Sample tokens (NLTK):", tokens_nltk[:50], end="\n\n\n")
print("Sample tokens (SpaCy):", tokens_spacy[:50])

Sample tokens (NLTK): ['Review', ':', '``', 'The', 'Dark', 'Knight', "''", ';', 'OR', 'Can', 'Superhero', 'Movie', 'Fatigue', 'Be', 'Defeated', '?', 'Glenn', 'Kenny', 'This', 'may', 'seem', 'like', 'faint', 'praise', ',', 'but', 'about', 'the', 'highest', 'compliment', 'I', 'can', 'give', 'Christopher', 'Nolan', '’', 's', 'The', 'Dark', 'Knight', 'right', 'now', 'is', 'to', 'say', 'that', 'there', 'were', 'many', 'long']


Sample tokens (SpaCy): ['Review', ':', '"', 'The', 'Dark', 'Knight', '"', ';', 'OR', 'Can', 'Superhero', 'Movie', 'Fatigue', 'Be', 'Defeated', '?', '\n\n', 'Glenn', 'Kenny', '\n\n', 'This', 'may', 'seem', 'like', 'faint', 'praise', ',', 'but', 'about', 'the', 'highest', 'compliment', 'I', 'can', 'give', 'Christopher', 'Nolan', '’s', 'The', 'Dark', 'Knight', 'right', 'now', 'is', 'to', 'say', 'that', 'there', 'were', 'many']


In [6]:
# Stemming results: first 50 NLTK stems (stemming is not supported in SpaCy,
# so there is nothing to compare against).
stems_preview = stems_nltk[:50]
print("Sample stems (NLTK):", stems_preview)

Sample stems (NLTK): ['review', ':', '``', 'the', 'dark', 'knight', "''", ';', 'or', 'can', 'superhero', 'movi', 'fatigu', 'be', 'defeat', '?', 'glenn', 'kenni', 'thi', 'may', 'seem', 'like', 'faint', 'prais', ',', 'but', 'about', 'the', 'highest', 'compliment', 'i', 'can', 'give', 'christoph', 'nolan', '’', 's', 'the', 'dark', 'knight', 'right', 'now', 'is', 'to', 'say', 'that', 'there', 'were', 'mani', 'long']


In [7]:
# Lemmatization results: first 50 lemmas from each library.
# `end` carries the line break plus the two blank lines that separate the samples.
print("Sample lemmas (NLTK):", lemmas_nltk[:50], end="\n\n\n")
print("Sample lemmas (SpaCy):", lemmas_spacy[:50])

Sample lemmas (NLTK): ['Review', ':', '``', 'The', 'Dark', 'Knight', "''", ';', 'OR', 'Can', 'Superhero', 'Movie', 'Fatigue', 'Be', 'Defeated', '?', 'Glenn', 'Kenny', 'This', 'may', 'seem', 'like', 'faint', 'praise', ',', 'but', 'about', 'the', 'highest', 'compliment', 'I', 'can', 'give', 'Christopher', 'Nolan', '’', 's', 'The', 'Dark', 'Knight', 'right', 'now', 'is', 'to', 'say', 'that', 'there', 'were', 'many', 'long']


Sample lemmas (SpaCy): ['review', ':', '"', 'the', 'Dark', 'Knight', '"', ';', 'or', 'can', 'Superhero', 'Movie', 'Fatigue', 'be', 'defeat', '?', '\n\n', 'Glenn', 'Kenny', '\n\n', 'this', 'may', 'seem', 'like', 'faint', 'praise', ',', 'but', 'about', 'the', 'high', 'compliment', 'I', 'can', 'give', 'Christopher', 'Nolan', '’s', 'the', 'Dark', 'Knight', 'right', 'now', 'be', 'to', 'say', 'that', 'there', 'be', 'many']


In [8]:
# P2

In [17]:
from collections import Counter

# Read the large reference corpus (a very old .txt file found online).
# It is decoded as Latin-1.
with open('long_article.txt', mode='r', encoding='latin1') as corpus_file:
    large_text = corpus_file.read()

# A much smaller sample containing new words, written by hand from the
# movie review article.
small_text = """
The superhero genre has evolved tremendously, and many new directors are attempting fresh takes.
Despite superhero fatigue, the audience's love for action-packed movies seems unrelenting.
"""

# Lemmatizer instance used by tokenize_and_lemmatize() below.
lemmatizer = WordNetLemmatizer()

# Tokenize and lemmatize
def tokenize_and_lemmatize(text):
    """Return the lemmas of all tokens in `text`, in token order.

    The text is split with `word_tokenize`; each token is lower-cased and then
    passed to the module-level `lemmatizer`. Punctuation tokens are kept.

    NOTE(review): lemmatize() is called without a part-of-speech argument; the
    recorded output shows 'has' turning into 'ha' as a result.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(raw_token.lower()))
    return lemmas

# Lemmatize both texts with the shared helper.
large_text_lemmas = tokenize_and_lemmatize(large_text)
small_text_lemmas = tokenize_and_lemmatize(small_text)

# Peek at the first 20 lemmas of each text.
print("\nSample lemmas large text:", large_text_lemmas[:20])
print("\nSample lemmas small text:", small_text_lemmas[:20])

# Count lemma occurrences: the large text serves as the known vocabulary,
# the small text as a 1-gram bag-of-words model.
vocab_large_text = Counter(large_text_lemmas)
bow_small_text = Counter(small_text_lemmas)

# Display 1-grams BoW model for small text
print("\nBoW model for smaller text:", bow_small_text)

# A word is "new" when it occurs in the small text but not in the large text's
# vocabulary. Counts are collected first (in first-occurrence order); the plain
# word list is then read off the keys of that mapping.
new_words_count = {
    word: count
    for word, count in bow_small_text.items()
    if word not in vocab_large_text
}
new_words = list(new_words_count)

# Display the new words and their counts
print("\nNew words in the smaller text:", new_words_count)
print("\nNumber of new words in the smaller text:", len(new_words))



Sample lemmas large text: ['[', 'pp', '38-40', ':', 'article', 'from', 'die', 'zeit', ',', '30', 'november', '1984', ',', 'by', 'thomas', 'von', 'randow', ']', 'bildschirmtext', ':']

Sample lemmas small text: ['the', 'superhero', 'genre', 'ha', 'evolved', 'tremendously', ',', 'and', 'many', 'new', 'director', 'are', 'attempting', 'fresh', 'take', '.', 'despite', 'superhero', 'fatigue', ',']

BoW model for smaller text: Counter({'the': 2, 'superhero': 2, ',': 2, '.': 2, 'genre': 1, 'ha': 1, 'evolved': 1, 'tremendously': 1, 'and': 1, 'many': 1, 'new': 1, 'director': 1, 'are': 1, 'attempting': 1, 'fresh': 1, 'take': 1, 'despite': 1, 'fatigue': 1, 'audience': 1, "'s": 1, 'love': 1, 'for': 1, 'action-packed': 1, 'movie': 1, 'seems': 1, 'unrelenting': 1})

New words in the smaller text: {'superhero': 2, 'genre': 1, 'evolved': 1, 'tremendously': 1, 'director': 1, 'attempting': 1, 'fresh': 1, 'fatigue': 1, 'action-packed': 1, 'movie': 1, 'seems': 1, 'unrelenting': 1}

Number of new words in 