<a href="https://colab.research.google.com/github/chapmjs/oaks_talks_analysis/blob/main/oaks_talks_analysis_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# President Dallin H. Oaks - Complete Talk Collection Analysis
This notebook analyzes ALL talks by President Dallin H. Oaks using the comprehensive collection from bencrowder.net.

**Instructions**: Run each cell in order by clicking the play button or pressing Shift+Enter.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chapmjs/oaks_talks_analysis/blob/main/oaks_talks_analysis_colab.ipynb)

## Step 1: Install Required Libraries

In [None]:
# Install required packages
!pip install beautifulsoup4 wordcloud nltk pandas matplotlib seaborn scikit-learn lxml -q

# Download NLTK data
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)

print("‚úì Libraries installed successfully!")

## Step 2: Create Project Structure and Helper Functions

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
import re
import string
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from IPython.display import Image, display
import warnings
warnings.filterwarnings('ignore')

# Create directories (exist_ok=True makes this cell safe to re-run)
for directory in ('data/talks', 'output/wordclouds', 'output/analysis'):
    os.makedirs(directory, exist_ok=True)

print("✓ Project structure created!")
print("\nDirectories:")
print("  - data/talks/ (for downloaded talks)")
print("  - output/wordclouds/ (for word cloud images)")
print("  - output/analysis/ (for analysis reports)")

## Step 3: Define Stopwords and Text Processing

In [None]:
# Custom stopwords for religious/conference texts.
# All entries are lowercase; the three sets are merged into ALL_STOPWORDS below.

# General English function words (pronouns, auxiliaries, prepositions, ...).
ENGLISH_STOPWORDS = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
    "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he',
    'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's",
    'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will',
    'just', 'should', 'now'
}

# Titles, time words, modals, ordinals and generic verbs that are frequent in
# the talks but carry little topical meaning.
CONFERENCE_COMMON = {
    'elder', 'president', 'brother', 'sister', 'saint', 'saints',
    'conference', 'general', 'talk', 'spoke', 'speaking', 'said', 'says',
    'today', 'time', 'year', 'years', 'day', 'days',
    'may', 'might', 'must', 'shall', 'would', 'could', 'should',
    'also', 'even', 'well', 'much', 'many', 'every',
    'first', 'second', 'third', 'last', 'next',
    'one', 'two', 'three', 'four', 'five',
    'know', 'known', 'knew', 'think', 'thought',
    'come', 'came', 'go', 'went', 'going',
    'make', 'made', 'making', 'give', 'gave', 'given',
    'see', 'saw', 'seen', 'say', 'saying',
    'us', 'let', 'way', 'like', 'want', 'need'
}

# Additional stopwords specific to talks.
# Several entries repeat words from the sets above; the union de-duplicates them.
ADDITIONAL_STOPS = {
    'lord', 'god', 'jesus', 'christ', 'church', 'lds',
    'latter', 'day', 'will', 'can', 'also', 'one', 'two',
    'brethren', 'sisters', 'oaks', 'dallin', 'president',
    'thank', 'name', 'amen'
}

ALL_STOPWORDS = ENGLISH_STOPWORDS | CONFERENCE_COMMON | ADDITIONAL_STOPS

print(f"✓ Loaded {len(ALL_STOPWORDS)} stopwords")

# Text processing class
class TextProcessor:
    """Clean and tokenize the text of saved talk files."""

    def __init__(self):
        # The lemmatizer is created here but is not used by the methods below.
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = ALL_STOPWORDS

    def clean_text(self, text):
        """Strip URLs, e-mail addresses and standalone numbers, then collapse whitespace.

        Args:
            text: Raw text.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
        """
        # The dot after "www" is escaped so that only real "www." prefixes are
        # removed; an unescaped dot also deleted ordinary words starting with "www".
        text = re.sub(r'http\S+|www\.\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'\b\d+\b', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_talk_content(self, filepath):
        """Read a talk file and return its body without the metadata header.

        Everything up to and including the first ``---`` separator line is
        dropped; a file without that separator is returned unchanged.

        Args:
            filepath: Path to a UTF-8 encoded talk file.

        Returns:
            The talk text as a string.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        if '\n---\n' in content:
            content = content.split('\n---\n', 1)[1]
        return content

    def process_text(self, text, remove_stops=True):
        """Clean, lowercase and tokenize ``text``.

        Args:
            text: Raw text.
            remove_stops: When True, drop tokens found in ``self.stopwords``.

        Returns:
            A list of lowercase tokens that contain at least one letter.
        """
        text = self.clean_text(text)
        tokens = word_tokenize(text.lower())
        # Requiring a letter already excludes pure-punctuation and empty tokens.
        tokens = [t for t in tokens if any(c.isalpha() for c in t)]
        if remove_stops:
            # Tokens are already lowercase, so they can be compared directly.
            tokens = [t for t in tokens if t not in self.stopwords]
        return tokens

# Shared TextProcessor instance used by the analysis cells that follow.
processor = TextProcessor()
print("✓ Text processor initialized!")

## Step 4: Fetch Talks from bencrowder.net

**Note**: Downloading the complete collection may take 30-60 minutes on first run because it fetches hundreds of talks. The cell below is limited to 50 talks by default.

In [None]:
class TalkFetcher:
    """Collect talk links from the bencrowder.net speaker page and cache each talk as a text file.

    Each talk is written to ``data_dir`` as ``{year}_{type}_{title}.txt`` with a
    small metadata header (Title/Type/Date/URL), a ``---`` separator line and
    then the talk text.
    """

    # Seconds to wait for an HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Pause between downloads (rate limiting).
    REQUEST_DELAY = 1.5
    # Only links whose URL contains one of these hosts are treated as talks.
    TALK_DOMAINS = ('churchofjesuschrist.org', 'speeches.byu.edu', 'lds.org')

    def __init__(self):
        self.base_url = "https://bencrowder.net"
        self.speaker_url = f"{self.base_url}/collected-talks/dallin-h-oaks/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.data_dir = "data/talks"

    def get_talk_links(self):
        """Scrape the speaker page for links to individual talks.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``date`` and
            ``type``, one per unique URL in page order. An empty list is
            returned (and the error printed) if anything goes wrong.
        """
        print("Fetching talk list from bencrowder.net...")
        try:
            # A timeout keeps the notebook from hanging forever on a stalled connection.
            response = requests.get(
                self.speaker_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            talks = []
            seen_urls = set()  # constant-time duplicate check; first occurrence wins
            talk_list = soup.find('div', class_='entry-content') or soup.find('article') or soup.find('main')

            if talk_list:
                for link in talk_list.find_all('a'):
                    href = link.get('href', '')
                    if not href or href in seen_urls:
                        continue
                    if not any(domain in href for domain in self.TALK_DOMAINS):
                        continue
                    seen_urls.add(href)

                    text = link.get_text(strip=True)
                    # The date is looked up in the text of the link's parent element.
                    parent_text = link.parent.get_text() if link.parent else text
                    date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{4}|\d{4})', parent_text)
                    date = date_match.group(0) if date_match else 'Unknown'

                    if 'general-conference' in href:
                        talk_type = 'General Conference'
                    elif 'byu' in href:
                        talk_type = 'BYU Speech'
                    else:
                        talk_type = 'Other'

                    talks.append({
                        'title': text,
                        'url': href,
                        'date': date,
                        'type': talk_type
                    })

            print(f"Found {len(talks)} unique talks")
            return talks
        except Exception as e:
            print(f"Error: {e}")
            return []

    def fetch_talk_content(self, url):
        """Download one talk page and return its visible text.

        Args:
            url: Address of the talk page.

        Returns:
            The text of the first matching content container with whitespace
            collapsed, or None if no container matched or the download failed.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try different selectors; the first one that matches wins.
            selectors = ['article', 'div.body-block', 'div.article-content', 'div.transcript', 'main']
            article = None
            for selector in selectors:
                article = soup.select_one(selector)
                if article:
                    break

            if article:
                for element in article.find_all(['script', 'style', 'nav', 'header', 'footer']):
                    element.decompose()
                text = article.get_text(separator=' ', strip=True)
                return re.sub(r'\s+', ' ', text)
            return None
        except Exception as e:
            # Best effort: one failed download must not abort the whole run,
            # but the reason is shown on the progress line instead of being lost.
            print(f"(error: {e})", end=' ')
            return None

    def _talk_filepath(self, talk_info):
        """Build the cache path for a talk.

        Shared by save_talk and fetch_all_talks so that the "already
        downloaded" check always looks for the file that save_talk writes.
        """
        safe_title = re.sub(r'[^\w\s-]', '', talk_info['title'][:50])
        safe_title = re.sub(r'[-\s]+', '-', safe_title)
        year_match = re.search(r'\d{4}', talk_info['date'])
        year = year_match.group(0) if year_match else 'unknown'
        talk_type = talk_info['type'].replace(' ', '_')

        filename = f"{year}_{talk_type}_{safe_title}.txt"
        return os.path.join(self.data_dir, filename)

    def save_talk(self, talk_info, content):
        """Write a talk, preceded by its metadata header, to the cache directory.

        Args:
            talk_info: Dict with the keys ``title``, ``type``, ``date`` and ``url``.
            content: Talk text.

        Returns:
            The path of the written file.
        """
        filepath = self._talk_filepath(talk_info)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(f"Title: {talk_info['title']}\n")
            f.write(f"Type: {talk_info['type']}\n")
            f.write(f"Date: {talk_info['date']}\n")
            f.write(f"URL: {talk_info['url']}\n")
            f.write("\n---\n\n")
            f.write(content)

        return filepath

    def fetch_all_talks(self, limit=None):
        """Download every talk found on the speaker page, skipping cached files.

        Args:
            limit: Maximum number of talks to process; a falsy value means no limit.

        Returns:
            The number of talks available locally afterwards (newly downloaded
            plus already cached), or 0 if the talk list could not be fetched.
        """
        talks = self.get_talk_links()
        if not talks:
            return 0

        if limit:
            talks = talks[:limit]
            print(f"Limiting to {limit} talks for demo purposes")

        successful = 0
        failed = []

        print(f"\nDownloading {len(talks)} talks...")
        print("-" * 50)

        for i, talk in enumerate(talks, 1):
            print(f"[{i}/{len(talks)}] {talk['title'][:60]}...", end=' ')

            # Check if already downloaded
            if os.path.exists(self._talk_filepath(talk)):
                print("✓ (already downloaded)")
                successful += 1
                continue

            content = self.fetch_talk_content(talk['url'])
            # Responses of 100 characters or fewer are treated as failed downloads.
            if content and len(content) > 100:
                self.save_talk(talk, content)
                print("✓")
                successful += 1
            else:
                print("✗")
                failed.append(talk['title'])

            time.sleep(self.REQUEST_DELAY)  # Rate limiting

        print("\n" + "="*50)
        print(f"Downloaded {successful} talks successfully")
        if failed:
            print(f"Failed: {len(failed)} talks")
            for title in failed:
                print(f"  - {title}")

        return successful

# Fetch talks (limit to 50 for demo, remove limit for all talks)
fetcher = TalkFetcher()
num_talks = fetcher.fetch_all_talks(limit=50)  # Remove 'limit=50' to get all talks
print(f"\n✓ Ready to analyze {num_talks} talks!")

## Step 5: Generate Word Clouds

In [None]:
import glob

# Get all talk files.
# glob returns paths in filesystem order, so sort them to make every run
# process the talks in the same order.
talk_files = sorted(glob.glob('data/talks/*.txt'))
print(f"Found {len(talk_files)} talk files to analyze\n")

# Fail early with an actionable message rather than letting the word cloud
# step fail on an empty corpus.
if not talk_files:
    raise ValueError("No talk files found in data/talks/ - run Step 4 first to download talks.")

# Combine all texts (a single join avoids repeatedly re-copying a growing string)
all_text = ' '.join(processor.extract_talk_content(filepath) for filepath in talk_files)

# Generate main word cloud
print("Generating main word cloud...")
wordcloud = WordCloud(
    width=1600,
    height=900,
    background_color='white',
    stopwords=ALL_STOPWORDS,
    max_words=150,
    colormap='viridis',
    relative_scaling=0.5,
    min_font_size=10
).generate(all_text)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('President Dallin H. Oaks - Complete Talk Collection', fontsize=24, pad=20)
plt.axis('off')
plt.tight_layout()
# Save before show(): the figure may no longer be available afterwards.
plt.savefig('output/wordclouds/main_wordcloud.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Main word cloud saved to output/wordclouds/main_wordcloud.png")

## Step 6: Word Frequency Analysis

In [None]:
# Analyze word frequencies
print("Analyzing word frequencies...\n")

# Tokenize every talk (stopwords removed) into one flat token list.
all_tokens = []
for filepath in talk_files:
    content = processor.extract_talk_content(filepath)
    all_tokens.extend(processor.process_text(content))

# Get word frequencies
word_freq = Counter(all_tokens)
top_words = word_freq.most_common(30)

# Create DataFrame; Percentage is each word's share of all counted tokens.
df_freq = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
total_words = sum(word_freq.values())
df_freq['Percentage'] = (df_freq['Frequency'] / total_words * 100).round(2)

# Display top words (the heading reflects how many words are actually available)
print(f"Top {len(df_freq)} Most Frequent Words:")
print("="*50)
for rank, row in enumerate(df_freq.itertuples(index=False), 1):
    print(f"{rank:3}. {row.Word:20} {row.Frequency:6,} ({row.Percentage:.2f}%)")

# Save to CSV
df_freq.to_csv('output/analysis/word_frequencies.csv', index=False)
print("\n✓ Word frequencies saved to output/analysis/word_frequencies.csv")

# Create bar chart.
# Plot at most 20 bars: with fewer distinct words a fixed range(20) would not
# match the data length and the chart call would fail.
chart_size = min(20, len(df_freq))
chart_data = df_freq.head(chart_size)

plt.figure(figsize=(12, 8))
plt.bar(range(chart_size), chart_data['Frequency'], color='steelblue')
plt.xticks(range(chart_size), chart_data['Word'], rotation=45, ha='right')
plt.xlabel('Words', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title(f'Top {chart_size} Most Frequent Words - President Dallin H. Oaks', fontsize=14)
plt.tight_layout()
plt.savefig('output/analysis/word_frequency_chart.png', dpi=150)
plt.show()

## Step 7: Analyze by Decade

In [None]:
# Group talks by decade, based on a 4-digit year at the start of each filename.
decades = defaultdict(list)
undated_count = 0  # files whose name does not start with a year

for filepath in talk_files:
    year_match = re.match(r'^(\d{4})', os.path.basename(filepath))
    if not year_match:
        undated_count += 1
        continue

    year = int(year_match.group(1))
    decade = f"{(year // 10) * 10}s"
    decades[decade].append(processor.extract_talk_content(filepath))

print(f"Found talks from {len(decades)} decades:")
for decade in sorted(decades):
    print(f"  {decade}: {len(decades[decade])} talks")
if undated_count:
    # Make the omission visible instead of silently dropping these talks.
    print(f"  (skipped {undated_count} talks without a year in the filename)")

# Generate word cloud for each decade
print("\nGenerating decade word clouds...")
for decade in sorted(decades):
    decade_text = ' '.join(decades[decade])

    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        stopwords=ALL_STOPWORDS,
        max_words=100,
        colormap='coolwarm'
    ).generate(decade_text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'President Oaks - {decade}', fontsize=18)
    plt.axis('off')
    plt.savefig(f'output/wordclouds/decade_{decade}.png', dpi=150, bbox_inches='tight')
    plt.show()
    plt.close()  # release the figure so repeated plotting in the loop does not pile up
    print(f"✓ Generated word cloud for {decade}")

## Step 8: Analyze by Talk Type

In [None]:
def read_talk_type(filepath):
    """Return the talk type stored in a saved talk file, or None if it cannot be determined.

    The type is read from the ``Type: ...`` line of the metadata header that
    precedes the ``---`` separator. The filename cannot simply be split on
    ``_`` because the type part itself contains underscores
    (e.g. ``1985_General_Conference_Title.txt``), so the filename is only used
    as a fallback and is matched against the known type slugs.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '---':  # end of the metadata header
                break
            if line.startswith('Type:'):
                value = line[len('Type:'):].strip()
                if value:
                    return value

    # Fallback: drop the leading "{year}_" and look for a known type slug.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    _, _, remainder = stem.partition('_')
    for slug in ('General_Conference', 'BYU_Speech', 'Other'):
        if remainder == slug or remainder.startswith(slug + '_'):
            return slug.replace('_', ' ')
    return None


# Group talks by type
talk_types = defaultdict(list)

for filepath in talk_files:
    talk_type = read_talk_type(filepath)
    if talk_type:
        talk_types[talk_type].append(processor.extract_talk_content(filepath))

print(f"Found {len(talk_types)} talk types:")
for talk_type, texts in talk_types.items():
    print(f"  {talk_type}: {len(texts)} talks")

# Generate word cloud for each type
print("\nGenerating talk type word clouds...")
for talk_type, texts in talk_types.items():
    if texts:
        type_text = ' '.join(texts)

        wordcloud = WordCloud(
            width=1200,
            height=600,
            background_color='white',
            stopwords=ALL_STOPWORDS,
            max_words=100,
            colormap='plasma'
        ).generate(type_text)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'President Oaks - {talk_type}', fontsize=18)
        plt.axis('off')
        safe_type = talk_type.replace(' ', '_').lower()
        plt.savefig(f'output/wordclouds/type_{safe_type}.png', dpi=150, bbox_inches='tight')
        plt.show()
        print(f"✓ Generated word cloud for {talk_type}")

## Step 9: Theme-Based Analysis

In [None]:
# Define themes: each theme is a list of lowercase keyword stems.
themes = {
    'Faith & Testimony': ['faith', 'testimony', 'believe', 'witness', 'know', 'truth', 'prayer'],
    'Family & Marriage': ['family', 'marriage', 'children', 'parent', 'father', 'mother', 'home'],
    'Service & Love': ['service', 'serve', 'love', 'charity', 'help', 'minister', 'compassion'],
    'Covenant & Temple': ['covenant', 'temple', 'ordinance', 'baptism', 'endowment', 'sealing'],
    'Scripture & Revelation': ['scripture', 'revelation', 'prophet', 'bible', 'book', 'mormon']
}

print("Generating theme-based word clouds...\n")

# Split every talk into sentences once, instead of re-reading and
# re-tokenizing all files for each theme.
all_sentences = []
for filepath in talk_files:
    all_sentences.extend(sent_tokenize(processor.extract_talk_content(filepath)))

for theme_name, keywords in themes.items():
    print(f"Processing theme: {theme_name}")

    # Match keywords at the start of a word only: "serve" still matches
    # "serves"/"served" but no longer matches "deserve" or "observe", which a
    # plain substring test would count as theme hits.
    keyword_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')',
        re.IGNORECASE
    )

    # Extract sentences containing theme keywords
    theme_sentences = [sentence for sentence in all_sentences if keyword_pattern.search(sentence)]

    if not theme_sentences:
        print(f"  No sentences found for {theme_name}; skipping\n")
        continue

    theme_text = ' '.join(theme_sentences)

    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        stopwords=ALL_STOPWORDS,
        max_words=80,
        colormap='viridis'
    ).generate(theme_text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Theme: {theme_name}', fontsize=18)
    plt.axis('off')
    safe_theme = theme_name.replace(' & ', '_').lower()
    plt.savefig(f'output/wordclouds/theme_{safe_theme}.png', dpi=150, bbox_inches='tight')
    plt.show()
    print(f"✓ Generated word cloud for {theme_name} ({len(theme_sentences)} sentences)\n")

## Step 10: Download Results

Run this cell to create a ZIP file with all results that you can download.

In [None]:
import zipfile
import os

# google.colab is only importable inside Colab; fall back gracefully so the
# ZIP file is still created when the notebook runs elsewhere.
try:
    from google.colab import files
except ImportError:
    files = None


def add_outputs_to_zip(zipf, folder, extensions, limit=None):
    """Add files below ``folder`` whose names end with ``extensions`` to an open ZIP archive.

    Args:
        zipf: An open, writable ``zipfile.ZipFile``.
        folder: Directory to walk; a missing directory simply adds nothing.
        extensions: Tuple of accepted filename suffixes, e.g. ``('.png',)``.
        limit: Maximum number of files to add, or None for no limit.

    Returns:
        The number of files added.
    """
    added = 0
    for root, _dirs, names in os.walk(folder):
        # Sort so the same files are chosen on every run, and filter by
        # extension before applying the limit.
        for name in sorted(names):
            if not name.endswith(extensions):
                continue
            if limit is not None and added >= limit:
                return added
            file_path = os.path.join(root, name)
            # Store paths relative to the working directory.
            zipf.write(file_path, os.path.relpath(file_path, '.'))
            added += 1
    return added


# Create a ZIP file with all outputs
print("Creating ZIP file with all results...")

zip_filename = 'oaks_analysis_results.zip'
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add wordclouds
    add_outputs_to_zip(zipf, 'output/wordclouds', ('.png',))

    # Add analysis files
    add_outputs_to_zip(zipf, 'output/analysis', ('.csv', '.json', '.txt'))

    # Add downloaded talks (just the first 10 as a sample)
    add_outputs_to_zip(zipf, 'data/talks', ('.txt',), limit=10)

print(f"✓ Created {zip_filename}")
print("\nContents:")
print("  - Word cloud images")
print("  - Word frequency analysis")
print("  - Sample talk texts")

# Trigger download
if files is not None:
    print("\n📥 Click below to download your results:")
    files.download(zip_filename)
else:
    print(f"\nNot running in Colab - {zip_filename} was saved to the working directory.")

## Summary Statistics

In [None]:
# Generate summary statistics.
# Relies on results of earlier cells: all_tokens (Step 6), talk_files (Step 5),
# decades (Step 7) and talk_types (Step 8).
print("="*60)
print("ANALYSIS SUMMARY")
print("="*60)

# Count statistics (token counts are taken after stopword removal)
all_words = len(all_tokens)
unique_words = len(set(all_tokens))
num_talks = len(talk_files)
# Guard against division by zero when no talks are available.
average_words = all_words // num_talks if num_talks > 0 else 0

print("\n📊 Overall Statistics:")
print(f"  Total talks analyzed: {num_talks}")
print(f"  Total words processed: {all_words:,}")
print(f"  Unique words found: {unique_words:,}")
print(f"  Average words per talk: {average_words:,}")

print("\n📅 Temporal Coverage:")
for decade in sorted(decades.keys()):
    print(f"  {decade}: {len(decades[decade])} talks")

# Talk types are listed from most to fewest talks.
print("\n📝 Talk Types:")
for talk_type, texts in sorted(talk_types.items(), key=lambda x: len(x[1]), reverse=True):
    print(f"  {talk_type}: {len(texts)} talks")

print("\n✅ Analysis Complete!")
print("\nAll visualizations have been saved to the 'output' folder.")
print("Download the ZIP file above to get all your results.")

## Optional: Advanced Topic Modeling with LDA

In [None]:
# TfidfVectorizer is no longer used in this cell; its import is kept so that
# nothing relying on the name breaks.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

print("Performing topic modeling with LDA...\n")

# Prepare documents (one document per talk)
documents = [processor.extract_talk_content(filepath) for filepath in talk_files]

# With min_df=2 and max_df=0.8 the vectorizer needs at least 3 documents:
# with fewer, 80% of the document count is below the min_df of 2 and
# fit_transform raises a ValueError.
MIN_DOCUMENTS = 3
n_topics = 5

if len(documents) < MIN_DOCUMENTS:
    print(f"Need at least {MIN_DOCUMENTS} talks for topic modeling, found {len(documents)}; skipping.")
else:
    # Create the document-term matrix.
    # LDA models word counts, so raw term counts are used here rather than
    # TF-IDF weights.
    vectorizer = CountVectorizer(
        max_features=100,
        min_df=2,
        max_df=0.8,
        stop_words=list(ALL_STOPWORDS),
        ngram_range=(1, 2)
    )

    doc_term_matrix = vectorizer.fit_transform(documents)

    # LDA model (fixed random_state keeps the discovered topics reproducible)
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=10,
        learning_method='online',
        random_state=42
    )

    lda.fit(doc_term_matrix)

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Display topics
    print(f"Top {n_topics} Topics Discovered:")
    print("="*60)

    for topic_idx, topic in enumerate(lda.components_):
        # Indices of the 10 highest-weighted terms, strongest first.
        top_term_indices = topic.argsort()[-10:][::-1]
        # Named topic_terms so the top_words list from Step 6 is not overwritten.
        topic_terms = [feature_names[i] for i in top_term_indices]
        print(f"\nTopic {topic_idx + 1}:")
        print("  ", ", ".join(topic_terms))

    print("\n✓ Topic modeling complete!")

---

## 🎉 Congratulations!

You've successfully analyzed President Dallin H. Oaks' complete talk collection!

### Next Steps:
1. **Download your results** using the ZIP file created above
2. **Explore the word clouds** to see patterns across decades and talk types
3. **Review the frequency analysis** for key themes
4. **Share your findings** or adapt this notebook for other speakers

### To analyze ALL talks (not just the demo set):
- Remove `limit=50` from Step 4 to download the complete collection
- Note: This will take 30-60 minutes but gives you hundreds of talks

### Educational Applications:
This notebook demonstrates:
- Web scraping and data collection
- Natural Language Processing (NLP)
- Data visualization with word clouds
- Statistical text analysis
- Topic modeling with machine learning

Perfect for teaching Python applications in business and operations management!