# WEEK 9: Removing punctuation and stop words from a corpus


Installing the nltk and python-docx libraries

In [1]:
!pip install nltk
!pip install python-docx



Importing required libraries

In [2]:
import nltk
from zipfile import ZipFile
import glob
import os
import string
import re
from docx import Document
from io import StringIO

Now downloading the stopwords and punkt data packages from nltk

In [3]:
# Fetch the stop-word lists that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devdp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Tokenizer models needed by sent_tokenize / word_tokenize.
# Recent NLTK releases load these from the 'punkt_tab' resource instead of
# 'punkt', so fetch both to keep the notebook working on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devdp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [6]:
def get_filenames(zip_path='week_10_txt_and_docx.zip',
                  extract_dir='week_10_txt_and_docx'):
    """
    Extracts the zip archive containing the corpus and returns the paths of
    the .txt and .docx files located directly inside the extraction directory.

    Args:
        zip_path (str): path of the zip archive to extract.
        extract_dir (str): directory the archive is extracted into and that
            is then searched (non-recursively) for corpus files.

    Returns:
        list[str]: all .txt paths followed by all .docx paths; each group is
            sorted so the result does not depend on directory listing order.
    """
    with ZipFile(zip_path, 'r') as zip_dir:
        zip_dir.extractall(extract_dir)

    # Escape the directory so characters such as "[" in its name are not
    # interpreted as glob wildcards.
    search_root = glob.escape(extract_dir)

    filenames = []
    for extension in ('txt', 'docx'):
        pattern = os.path.join(search_root, '*.' + extension)
        filenames += sorted(glob.glob(pattern))
    return filenames

In [7]:
# Extract the archive, then show every discovered corpus file on its own line.
filenames = get_filenames()
print(*filenames, sep='\n')

week_10_txt_and_docx\52256-0.txt
week_10_txt_and_docx\53031-0.txt
week_10_txt_and_docx\58108-0.txt
week_10_txt_and_docx\blind_text.txt
week_10_txt_and_docx\dr_yawn.txt
week_10_txt_and_docx\how_rubber_goods_are_made.txt
week_10_txt_and_docx\most_boring_ever.txt
week_10_txt_and_docx\most_boring_part2.txt
week_10_txt_and_docx\pg12814.txt
week_10_txt_and_docx\pg14895.txt
week_10_txt_and_docx\pg43994.txt
week_10_txt_and_docx\random_text.txt
week_10_txt_and_docx\smiley_the_bunny.txt
week_10_txt_and_docx\week_10_document1.docx
week_10_txt_and_docx\week_10_document2.docx


In [8]:

def _read_docx_text(filename):
    """Returns the text of every paragraph of a .docx file, joined by newlines."""
    # ref : https://python-docx.readthedocs.io/en/latest/user/documents.html#opening-a-file-like-document
    with open(filename, 'rb') as docx_file:
        document = Document(docx_file)
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)


def _read_plain_text(filename):
    """Returns the decoded content of a text file, or None if it cannot be decoded."""
    # Try UTF-8 first (also strips a leading BOM); fall back to the platform
    # default encoding (encoding=None) for files in a legacy code page.
    for encoding in ('utf-8-sig', None):
        try:
            with open(filename, 'r', encoding=encoding) as text_file:
                return text_file.read()
        except UnicodeDecodeError:
            continue
    return None


def _extract_unique_words(file_text, stop_words, url_pattern, phone_pattern,
                          text_only_pattern):
    """Returns the sorted, lower-cased unique non-stop-words found in file_text."""
    # removes urls and phone numbers before tokenizing
    file_text = url_pattern.sub(" ", file_text)
    file_text = phone_pattern.sub(' ', file_text)

    unique_words = set()
    for sentence in sent_tokenize(file_text):
        # replacing non-alpha-numeric chars (and "_") with space
        cleaned_sentence = text_only_pattern.sub(' ', sentence)
        for word in word_tokenize(cleaned_sentence):
            lowered = word.lower()
            if lowered in stop_words:
                continue
            if word in string.punctuation:
                continue
            # the set takes care of uniqueness, no list scan required
            unique_words.add(lowered)
    return sorted(unique_words)


def remove_punctuation_and_stop_words(filenames: list[str]):
    """
    Reads every file and removes URLs, phone numbers, punctuation and
    English stop words from its text.
    It uses nltk to tokenize the text and to provide the stop-word list.

    Files ending in ".docx" are loaded with python-docx; every other file is
    read as text. A text file that cannot be decoded is reported with a
    printed message and skipped.

    Args:
        filenames (list[str]): list of filenames

    Returns:
        dict: dictionary with the file's base name as key and the sorted list
            of unique lower-cased words as value
    """
    stop_words = {stop_word.lower() for stop_word in stopwords.words('english')}

    # https://www.geeksforgeeks.org/python-check-url-string/
    url_pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

    # https://stackoverflow.com/a/3868861
    phone_pattern = re.compile(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')

    # this will detect non-alphanumeric characters and this will also identify "_".
    text_only_pattern = re.compile(r'[^\w\s]|_+')

    words_data = dict()
    for filename in filenames:
        # Pick the reader from the extension instead of relying on a decoding
        # error to reveal that the file is a (binary) docx document.
        if filename.lower().endswith('.docx'):
            file_text = _read_docx_text(filename)
        else:
            file_text = _read_plain_text(filename)

        if file_text is None:
            print("Error decodingin file : %s" % filename)
            continue

        words_data[os.path.basename(filename)] = _extract_unique_words(
            file_text, stop_words, url_pattern, phone_pattern, text_only_pattern)

    return words_data

In [9]:
# Build the {file name: unique words} mapping for every extracted file.
words_data = remove_punctuation_and_stop_words(filenames)

In [10]:
def write_words_data(words_data, output_path='output.dat'):
    """
    Generates a .dat file from the per-file word lists.

    Every entry is serialised as "<key>":<word>,<word>,... and the entries
    are written one after another.

    Args:
        words_data (dict): mapping of file name to an iterable of words.
        output_path (str): file to write; defaults to 'output.dat'.

    Returns:
        str: confirmation message naming the generated file.
    """
    # NOTE(review): entries are concatenated without any separator or newline
    # between them (unchanged behaviour) - confirm this is the target format.
    data = ''.join(
        '"%s":%s' % (key, ','.join(value))
        for key, value in words_data.items()
    )

    # Explicit UTF-8 so non-ASCII words cannot fail to encode under a
    # platform-specific default code page.
    with open(output_path, 'w', encoding='utf-8') as outputfile:
        outputfile.write(data)
    return 'Generated "%s" file' % output_path

In [11]:
# Write the word lists to disk; the returned message is the cell's output.
write_words_data(words_data)

'Generated "output.dat" file'