In [21]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Fetch the NLTK resources this notebook relies on: the English stopword list
# and the Punkt tokenizer models used by nltk.word_tokenize.
# Recent NLTK releases look up 'punkt_tab' instead of the older 'punkt'
# package, so both are requested to keep the notebook working across versions.
# (The Porter stemmer itself needs no downloaded data.)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Initialize the Porter Stemmer and the set of English stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gpaul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gpaul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Location of the corpus on disk (a path relative to the working directory)
corpus_dir = './nasa/'

# Accumulator for the raw text of the documents, one entry per text file
corpus = list()

In [24]:
# Start from an empty list so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every document.
corpus = []

# Read every .txt file in the corpus directory.
# The names are sorted because os.listdir() returns entries in arbitrary
# order, and the order of this list determines the order of the documents
# handed to the vectorizer.
for filename in sorted(os.listdir(corpus_dir)):
    if filename.endswith(".txt"):
        file_path = os.path.join(corpus_dir, filename)
        # NOTE(review): no encoding is given, so the platform default is used;
        # confirm the encoding of the corpus files and pin it here once known.
        with open(file_path, 'r') as txtfile:
            text = txtfile.read()
            corpus.append(text)

In [25]:
# corpus

In [30]:
import re

# A token that consists only of non-word characters (punctuation, quotes, ...)
_PUNCT_ONLY = re.compile(r"\W+")


# Function to preprocess and tokenize text
def preprocess_text(text):
    """Tokenize ``text``, drop stopwords and punctuation, and stem the rest.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens that were kept, joined by single spaces.
    """
    filtered_tokens = []

    # Tokenize the text, then remove stopwords/punctuation and apply stemming
    for word in nltk.word_tokenize(text):
        # Stopword lookup is done on the lower-cased token
        if word.lower() in stop_words:
            continue
        # fullmatch: skip the token only if *every* character is a non-word
        # character, so tokens such as "n't" or "3.5" are still kept
        if _PUNCT_ONLY.fullmatch(word):
            continue
        filtered_tokens.append(stemmer.stem(word))

    # Join the filtered tokens back into a string
    return ' '.join(filtered_tokens)

In [31]:
# Build the boolean term-document matrix for the corpus.
# binary=True records every non-zero count as 1, and preprocess_text is
# plugged in as the vectorizer's preprocessing step.
vectorizer = CountVectorizer(
    preprocessor=preprocess_text,
    binary=True,
)
term_document_matrix = vectorizer.fit_transform(corpus)

# Vocabulary terms, and a dense NumPy copy of the matrix
feature_names = vectorizer.get_feature_names_out()
term_doc_matrix = term_document_matrix.toarray()

In [32]:
# Preview the dense matrix (NumPy abbreviates large arrays in the displayed repr)
term_doc_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [33]:
# Vocabulary terms as returned by vectorizer.get_feature_names_out()
feature_names

array(['000', '001', '0012', ..., 'zero', 'zoom', 'µm'], dtype=object)