In [193]:
import re
import requests
import bs4
import os
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib as plt
import scipy
from nltk.corpus import stopwords
import sklearn
import sklearn.mixture

In [194]:
# Create a directory for NLTK data if it doesn't exist
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

# Append this directory to NLTK data path
nltk.data.path.append(nltk_data_dir)

# Download the resources again
nltk.download('punkt')
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to C:\Users\EmmaK/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\EmmaK\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EmmaK\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [195]:
def fetch_text_file(url, title):
    """
    Fetches the content of a text file from a given URL.
    
    Parameters:
    url (str): The URL of the text file.
    
    Returns:
    str: The content of the text file as a string.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        HEADER_TEXT_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK " + title.upper() + " ***"
        FOOTER_TEXT_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK " + title.upper() + " ***"
        start_index = response.text.find(HEADER_TEXT_MARKER) + len(HEADER_TEXT_MARKER)
        end_index = response.text.find(FOOTER_TEXT_MARKER)

        return response.text[start_index:end_index]
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""
    
book_url = 'https://www.gutenberg.org/cache/epub/55/pg55.txt'
title = "The Wonderful Wizard of Oz"
my_text = fetch_text_file(book_url, title)

In [229]:
stop_words = set(stopwords.words('english'))

def preprocess_and_prune(paragraphs):
    """
    Preprocesses the input text by normalizing it for further analysis.

    Parameters:
    text (str): The raw text to be preprocessed.

    Returns:
    str: The preprocessed text, in lowercase and stripped of unwanted characters.
    """
    cleaned_paragraphs = []
    for text in paragraphs:
        text = text.lower()
        text = re.sub(r'\W+', ' ', text)
        words = text.split()

        if len(words) < 4 or "chapter" in words:
            continue

        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and 3 <= len(word) <= 15]
    
        if words:
            cleaned_paragraphs.append(' '.join(words))
    
    return cleaned_paragraphs

mytext_paragraphs = re.split(r'\n\s*\n', my_text.strip())
cleaned_paragraphs = preprocess_and_prune(mytext_paragraphs)[9:]

for i, paragraph in enumerate(cleaned_paragraphs):
    print(f"Paragraph {i + 1}: {paragraph.strip()}")

Paragraph 1: dorothy stood doorway looked around could see nothing great gray prairie every side tree house broke broad sweep flat country reached edge sky direction sun baked plowed land gray mass little crack running even grass green sun burned top long blade gray color seen everywhere house painted sun blistered paint rain washed away house dull gray everything else
Paragraph 2: aunt came live young pretty wife sun wind changed taken sparkle eye left sober gray taken red cheek lip gray also thin gaunt never smiled dorothy orphan first came aunt startled child laughter would scream press hand upon heart whenever dorothy merry voice reached ear still looked little girl wonder could find anything laugh
Paragraph 3: uncle henry never laughed worked hard morning till night know joy gray also long beard rough boot looked stern solemn rarely spoke
Paragraph 4: toto made dorothy laugh saved growing gray surroundings toto gray little black dog long silky hair small black eye twinkled merrily

In [230]:
def prune(vocab):
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    pruned_vocab = []
    for word, freq in vocab:
        # rule 1: check the nltk stop words list
        if(word in nltk_stop_words):
            continue
        # rule 2: check if is in the top 1% of frequent words
        if(freq >= vocab[int(len(vocab)/100)][1]):
            continue
        # rule 3: if the word occurs less than 4 times
        if(freq < 4):
            continue
        # rule 4: word is overly short (less than 3 characters) or long (over than 15 characters)
        if(len(word) < 3 or len(word) > 15):
            continue
        else:
            pruned_vocab.append((word,freq))
    return pruned_vocab

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    return lemmatized_tokens

def create_pruned_vocabulary(paragraphs):
    vocabulary = {}
    for paragraph in paragraphs:
        lemmatized_tokens = tokenize_and_lemmatize(paragraph)
        for token in lemmatized_tokens:
            vocabulary[token] = vocabulary.get(token, 0) + 1
    pruned_vocabulary = prune(sorted(vocabulary.items(), key=lambda x: x[1], reverse=True))
    return pruned_vocabulary

pruned_vocab = create_pruned_vocabulary(cleaned_paragraphs)
print(pruned_vocab)

[('city', 80), ('answered', 78), ('must', 75), ('man', 75), ('wicked', 72), ('emerald', 71), ('heart', 69), ('come', 67), ('country', 66), ('away', 66), ('room', 66), ('well', 66), ('long', 64), ('way', 64), ('like', 64), ('tree', 63), ('time', 63), ('looked', 61), ('never', 59), ('know', 59), ('people', 59), ('saw', 58), ('brain', 58), ('eye', 57), ('thought', 54), ('make', 53), ('replied', 51), ('kansa', 50), ('monkey', 50), ('first', 49), ('day', 48), ('big', 48), ('many', 48), ('found', 47), ('walked', 46), ('friend', 46), ('road', 46), ('ever', 45), ('forest', 45), ('soon', 44), ('much', 44), ('went', 43), ('beast', 43), ('around', 42), ('give', 42), ('house', 41), ('arm', 40), ('thing', 40), ('wizard', 40), ('think', 39), ('beautiful', 38), ('help', 38), ('shoe', 37), ('mouse', 37), ('land', 36), ('air', 36), ('woman', 36), ('old', 36), ('stood', 35), ('quite', 35), ('wish', 35), ('voice', 34), ('put', 34), ('last', 33), ('foot', 33), ('yellow', 33), ('find', 32), ('cap', 32), ('

In [231]:
def tf_idf(paragraphs, pruned_vocab):

    vocab_indices = {word[0]: idx for idx, word in enumerate(pruned_vocab)}

    n_docs = len(paragraphs)
    n_vocab = len(pruned_vocab)

    # matrix of term frequencies
    tfmatrix = scipy.sparse.lil_matrix((n_docs, n_vocab))
    # row vector of document frequencies
    dfvector = scipy.sparse.lil_matrix((1,n_vocab))
    
    # loop over sentences
    for k in range(n_docs):
        temp_dfvector=scipy.sparse.lil_matrix((1,n_vocab))
        words = paragraphs[k].split()
        #print(f"Document {k}: {words}")  # Print words in the document

        # loop over words
        for word in words:
            if word in vocab_indices:
                current_word_index = vocab_indices[word]
                tfmatrix[k, current_word_index] += 1
                temp_dfvector[0, current_word_index] = 1
                #print(f"Incremented tfmatrix[{k}, {current_word}] for word '{word}'")  # Debugging output
        
        dfvector += temp_dfvector
    
    idfvector = np.squeeze(np.array(dfvector.todense()))
    idfvector = 1 + np.log((idfvector + 1) / n_docs)

    # TF-IDF matrix
    tfidfmatrix=scipy.sparse.lil_matrix((n_docs,n_vocab))

    for k in range(n_docs):
        tempindices=np.nonzero(tfmatrix[k,:])[1]
        tfterm=np.squeeze(np.array(tfmatrix[k,tempindices].todense()))
        tfidfmatrix[k,tempindices]=tfterm*idfvector[tempindices]

    #print("Document Frequency (DF) vector (before IDF):\n", dfvector.todense())
    #print("Inverse Document Frequency (IDF) vector:\n", idfvector)
    #print("Term Frequency (TF) matrix:\n", tfmatrix.todense())
    #print("TF-IDF matrix:\n", tfidfmatrix.todense())

    return tfidfmatrix

tfidfmatrix = tf_idf(cleaned_paragraphs, pruned_vocab).todense()
print(tfidfmatrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [232]:
# Reduce the data to 500 highest total TF-IDF features
import scipy.sparse

dimension_totals = np.squeeze(np.array(np.sum(tfidfmatrix, axis=0)))
highest_totals = np.argsort(-dimension_totals)
X_small = tfidfmatrix[:,highest_totals[0:500]]

if scipy.sparse.issparse(X_small):
    X_small = X_small.todense()  # Convert to dense format if it's sparse
X_small = np.array(X_small)  # Ensure it's a NumPy array

# Normalize the documents to unit vector norm
tempnorms = np.squeeze(np.array(np.sum(np.multiply(X_small, X_small), axis=1)))

# If any documents have zero norm, avoid dividing them by zero
tempnorms[tempnorms==0]=1
X_small = scipy.sparse.diags(tempnorms**-0.5).dot(X_small)

mixture_model = sklearn.mixture.GaussianMixture(n_components=10, covariance_type='diag', max_iter=100, init_params='random')

fitted_mixture = mixture_model.fit(X_small)
sklearn_mixturemodel_means = fitted_mixture.means_
sklearn_mixturemodel_weights = fitted_mixture.weights_
sklearn_mixturemodel_covariances = fitted_mixture.covariances_

In [233]:
# Vocabulary words
vocab_words = [word[0] for word in pruned_vocab]

n_components = sklearn_mixturemodel_means.shape[0]

for i in range(n_components):
    component_means = sklearn_mixturemodel_means[i]

    top_word_indices = np.argsort(-np.abs(component_means))[:10]

    print(f"Component {i+1}: ")
    for index in top_word_indices:
        print(f" Word: {vocab_words[index]}, Mean: {component_means[index]}")
    print()

Component 1: 
 Word: wing, Mean: -0.048903705833236166
 Word: loud, Mean: -0.03350077323715148
 Word: exclaimed, Mean: -0.03207048416223438
 Word: rather, Mean: -0.03094788136461853
 Word: many, Mean: -0.030609438605886413
 Word: color, Mean: -0.027842313132288656
 Word: uncle, Mean: -0.026369767492849812
 Word: dark, Mean: -0.02585091423707495
 Word: flower, Mean: -0.025159482147514962
 Word: forward, Mean: -0.024710129310359253

Component 2: 
 Word: fire, Mean: -0.03783561616148362
 Word: angry, Mean: -0.0328538699764899
 Word: carefully, Mean: -0.03263159768465171
 Word: got, Mean: -0.03263159768465171
 Word: lying, Mean: -0.03204093269065866
 Word: set, Mean: -0.03124999899608066
 Word: top, Mean: -0.030287577699659056
 Word: rich, Mean: -0.02885386839878869
 Word: alone, Mean: -0.027534149908337382
 Word: call, Mean: -0.02589979895799871

Component 3: 
 Word: bigger, Mean: -0.05131232429871366
 Word: lost, Mean: -0.03342115282963465
 Word: dressed, Mean: -0.030723108435604423
 Wor

In [234]:
membership_probabilities = mixture_model.predict_proba(X_small)

for i in range(n_components):
    highest_prob_index = np.argmax(membership_probabilities[:,i])

    # Print the document (paragraph) with the highest membership probability
    print(f"Component {i + 1}:")
    print(f"Document (Paragraph): {cleaned_paragraphs[highest_prob_index]}")
    print(f"Highest Membership Probability: {membership_probabilities[highest_prob_index, i]}")
    print()  # For better readability

Component 1:
Document (Paragraph): said dorothy moment thought aunt told witch dead year year ago
Highest Membership Probability: 1.0

Component 2:
Document (Paragraph): little woman evidently expected answer dorothy said hesitation kind must mistake killed anything
Highest Membership Probability: 1.0

Component 3:
Document (Paragraph): aunt came live young pretty wife sun wind changed taken sparkle eye left sober gray taken red cheek lip gray also thin gaunt never smiled dorothy orphan first came aunt startled child laughter would scream press hand upon heart whenever dorothy merry voice reached ear still looked little girl wonder could find anything laugh
Highest Membership Probability: 1.0

Component 4:
Document (Paragraph): hour hour passed away slowly dorothy got fright felt quite lonely wind shrieked loudly nearly became deaf first wondered would dashed piece house fell hour passed nothing terrible happened stopped worrying resolved wait calmly see future would bring last crawled