In [None]:
#|default_exp utils

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Utils
> Various utils for cleaning, organizing and capturing other information.

## Generic utils

In [None]:
#|export
from __future__ import annotations

import pickle
import re
from pathlib import Path

import numpy as np
from fastcore.foundation import L, patch
from fastcore.meta import delegates
from fastcore.script import call_parse
from fastcore.xtras import globtastic
import nltk
import unidecode
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
#| hide
#| local
%cd ~

/home/deven


In [None]:
#| exporti
def check_files(files):  # files to validate
    """Print a short summary of `files` and report whether any were found.

    Returns `False` (after printing a hint) when `files` is empty, otherwise
    prints the count together with the detected kind and returns `True`.
    The kind is "npy" when the first entry ends with "npy" and "pkl" otherwise;
    only the first entry is inspected.
    """
    flen = len(files)
    if flen <= 0:
        print(f"Found {flen} files")
        print("Check `path` and try again")
        return False

    # `str()` handles plain strings, `Path` objects and any other path-like
    # entry alike, so `_type` is always bound regardless of the entry's type.
    first = str(files[0])
    _type = "npy" if first.endswith("npy") else "pkl"

    print(f"Found {flen} {_type} files")
    print("-" * 45)
    return True


In [None]:
#|hide
#| local
files = globtastic('embeddings/A_Modest_Proposal/pkl', recursive=False)
_ = check_files(files)

Found 0 files
Check `path` and try again


In [None]:
#|hide
#| local
files = globtastic('embeddings/A_Modest_Proposal/', recursive=False)
_ = check_files(files)

Found 0 files
Check `path` and try again


In [None]:
#|export
@delegates(globtastic)
def loader(
    path: str | Path,  # path to a given folder,
    extension: str,  # extension of the file you want
    **kwargs,
) -> L:  # returns `L`
    """Collect the files under `path` whose names match `*{extension}`.

    Any extra keyword arguments are forwarded to `globtastic`; every match
    is converted to a `Path` before being returned."""
    pattern = f"*{extension}"
    matches = globtastic(path, file_glob=pattern, **kwargs)
    return matches.map(Path)


In [None]:
#|export
def get_data(
    fname: str | Path,  # path to the file
) -> str:  # returns content of the file
    "Return the complete text content of the file at `fname`"
    # Text mode with the platform's default encoding, read in one go.
    with open(fname, mode="r") as handle:
        return handle.read()


In [None]:
#|export
def load_pmi(fname: str | Path) -> np.ndarray:  # name of pmi file matrix
    """
    Load a PMI matrix saved as `.npy`

    `fname` is handed to `loader` together with the `.npy` extension and the
    first match is read with `np.load`.
    """
    matches = loader(fname, ".npy")
    first_match = matches[0]
    matrix = np.load(first_match)
    print(f"Loaded {fname}")
    return matrix


In [None]:
x = np.random.randint(0 , 100, (100, 100))
np.save('test.npy', x)
read_file = load_pmi('test.npy')

Loaded test.npy


In [None]:
#|hide
from fastcore.test import test_eq

In [None]:
test_eq(x, read_file)

In [None]:
#|export
def load_dictionary(
    fname: str,  # path to the pkl file
) -> dict:  # returns the contents
    """
    Load and return the object pickled in the file `fname`

    The file is opened in binary mode inside a `with` block, so the handle is
    closed as soon as unpickling finishes (or fails).

    NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
    only call this on files from a trusted source.
    """
    with open(fname, "rb") as f:
        return pickle.load(f)


In [None]:
#|export
def normalize(
    data: np.ndarray,  # input array
) -> np.ndarray:  # normalized array
    """
    Min-max scale `data` so its smallest value maps to 0 and its largest to 1

    When every element is equal the value range is zero; an array of zeros is
    returned in that case instead of dividing by zero and producing NaNs.
    """
    lo = np.min(data)
    hi = np.max(data)
    span = hi - lo
    if span == 0:
        # Constant input: `data - lo` is all zeros, so dividing by 1 yields
        # zeros while keeping the same result dtype as the regular path.
        span = 1
    return (data - lo) / span


In [None]:
test_eq(normalize([1, 2, 3, 4, 5]), [0.  , 0.25, 0.5 , 0.75, 1.  ])

In [None]:
#|export
@call_parse
def chelp():
    "Show help for all console scripts"
    # Imported inside the function, so `console_help` is only loaded when
    # this entry point is actually invoked.
    from fastcore.xtras import console_help as show_console_help

    package_name = "clean_plot"
    show_console_help(package_name)


In [None]:
chelp()

[1m[94mclean_file[0m                      Takes name of a txt file and writes the tokenized sentences into a
    new txt file
[1m[94mcorr_hm[0m                         Generates correlation plots from normalized SSMs
[1m[94mcp_help[0m                         Show help for all console scripts
[1m[94mheatmaps[0m                        Generates plots for embeddings in the folder
[1m[94mheatmaps_pkl[0m                    Generates SSMs from pkl files
[1m[94mhistograms[0m                      Generates histograms for embeddings in the folder
[1m[94mlex_ts[0m                          Generate lexical TS from Lexical SSM
[1m[94mmake_pkl[0m                        Create pkl for time series from embeddings
[1m[94mts_pkl[0m                          Plot timeseries from the pkl file


## Utils for cleaning text

Before using any of the cleaning utils in the file, please run `download_nltk_dep` first.

In [None]:
#|export
def download_nltk_dep():
    """
    Fetch the `nltk` resources that the cleaning utilities need
    """
    import nltk

    # Downloaded in this fixed order, one `nltk.download` call per resource.
    resources = (
        "punkt",
        "stopwords",
        "averaged_perceptron_tagger",
        "wordnet",
        "omw-1.4",
    )
    for resource in resources:
        nltk.download(resource)


In [None]:
#| hide
from nltk.corpus import stopwords
import nltk

In [None]:
#| hide
# `nltk.data.find` raises `LookupError` when a resource is not installed.
# Catch only that, so unrelated failures (including KeyboardInterrupt) are
# not silently turned into a download.
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('corpora/omw-1.4')
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    print('Downloading dependencies')
    download_nltk_dep()

Downloading dependencies


[nltk_data] Downloading package punkt to /home/deven/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/deven/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/deven/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/deven/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/deven/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
#|export
def split_by_newline(
    text: str,  # sentences separated by \n
) -> L:  # list of sentences
    """
    Split text on newline characters and drop the empty lines

    Meant for text that already holds one tokenized sentence per line.
    """
    # `filter(None, ...)` discards the empty strings produced by blank lines.
    non_empty = list(filter(None, text.split("\n")))
    return L(non_empty)


In [None]:
text = "Hello there!\nThis is how this functions works!"
split_by_newline(text)

(#2) ['Hello there!','This is how this functions works!']

In [None]:
#|export
def rm_useless_spaces(
    t: str,  # sentence with extra spaces
) -> str:  # sentence without extra spaces
    """
    Collapse each run of two or more spaces into one and trim both ends
    """
    collapsed = re.sub(" {2,}", " ", t)
    return collapsed.strip()


In [None]:
rm_useless_spaces('  This is      test sentence.  This removes  all the extra  spaces.  ')

'This is test sentence. This removes all the extra spaces.'

In [None]:
#|export
def make_sentences(
    text: str,  # bulk text
) -> L:  # list of sentences
    """
    Split a block of raw text into sentences

    Newlines are replaced with spaces, repeated spaces are collapsed, the
    result is passed through `unidecode`, and `sent_tokenize` does the split.
    """
    flattened = rm_useless_spaces(text.replace("\n", " ")).strip()
    decoded = unidecode.unidecode(flattened)
    return L(sent_tokenize(decoded))


In [None]:
#|export
def write_to_file_cleaned(
    sentences: list,  # list of sentences
    fname: str | Path,  # input file name; its stem names the output file
) -> None:
    """
    Writes the sentences, one per line, to `<stem of fname>_cleaned.txt`

    Only the stem of `fname` is used, so the output file is created relative
    to the current working directory. `fname` may be a `str` or a `Path`.
    """
    stem = Path(fname).stem  # works for both `str` and `Path` input
    # The `with` block closes the file on exit; no explicit `close()` needed.
    with open(f"{stem}_cleaned.txt", "w") as f:
        for line in sentences:
            f.write(f"{line}\n")


In [None]:
#|export
@call_parse
def clean(fname: str) -> None:  # name of input txt file
    """Takes name of a txt file and writes the tokenized sentences into a
    new txt file"""
    # The docstring above is shown as this console script's description
    # (see the `chelp` output), so it is kept verbatim.
    source = Path(fname)
    tokenized = make_sentences(get_data(source))
    print(f"{source.name} contains {len(tokenized)} sentences")
    write_to_file_cleaned(tokenized, source)


In [None]:
#|hide
#| local
%cd ~/projects/clean_plot/nbs

/home/deven/projects/clean_plot/nbs


In [None]:
fname = '../files/dummy.txt'
text = get_data(fname)
print(text)

MARLEY was dead: to begin with. There is no doubt
whatever about that. The register of his burial was
signed by the clergyman, the clerk, the undertaker,
and the chief mourner. Scrooge signed it: and
Scrooge's name was good upon 'Change, for anything he
chose to put his hand to. Old Marley was as dead as a
door-nail.

Mind! I don't mean to say that I know, of my
own knowledge, what there is particularly dead about
a door-nail. I might have been inclined, myself, to
regard a coffin-nail as the deadest piece of ironmongery
in the trade. But the wisdom of our ancestors
is in the simile; and my unhallowed hands
shall not disturb it, or the Country's done for. You
will therefore permit me to repeat, emphatically, that
Marley was as dead as a door-nail.

This is a new sentence.


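The raw text above is split into a list of tokenized sentences, one sentence per entry (see the `make_sentences` output further below).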

The `clean` function writes these sentences into a txt file with the name `<fname>_cleaned.txt` 

In [None]:
#|export
def get_wordnet_pos(
    word: str,  # input word token
) -> str:  # POS of the given word
    """Translate the tag `nltk.pos_tag` assigns to `word` into a WordNet POS constant

    Only the first letter of the tag is inspected; anything other than
    J/N/V/R falls back to `wordnet.NOUN`."""
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN


In [None]:
#| export
# NOTE(review): this cell repeats the `get_wordnet_pos` definition from the
# previous cell; the later definition shadows the earlier one.
def get_wordnet_pos(
    word: str,  # input word token
) -> str:  # POS of the given word
    """Translate the tag `nltk.pos_tag` assigns to `word` into a WordNet POS constant

    Only the first letter of the tag is inspected; anything other than
    J/N/V/R falls back to `wordnet.NOUN`."""
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN


In [None]:
#|export
def remove_stopwords(sentence: str,  # input sentence
                     ) -> str:    # output sentence
    """
    Drop every English stopword from a sentence

    Words are split on whitespace and compared case-insensitively against
    `stopwords.words("english")`; the survivors are re-joined with spaces.
    """
    stop_set = set(stopwords.words("english"))
    kept = [token for token in sentence.split() if token.lower() not in stop_set]
    return " ".join(kept)


In [None]:
#|export
def remove_punctuations(
    sentence: str,  # input sentence
) -> str:  # output sentence
    """
    Replace everything except ASCII letters, digits and spaces with a space,
    then collapse whitespace runs and trim the ends
    """
    # Step 1: each run of disallowed characters becomes a single space.
    spaced = re.sub(r"[^a-zA-Z0-9 ]+", " ", sentence)
    # Step 2: squeeze the whitespace introduced by step 1.
    squeezed = re.sub(r"[\s]+", " ", spaced)
    return squeezed.strip()


In [None]:
#|export
def remove_punc_clean(
    sentence: str,  # input sentence
    lemmatize: bool = False,  # flag to `lemmatize`
) -> str:
    """
    Strip punctuation and stopwords from a sentence

    When `lemmatize` is true, each remaining word is additionally lemmatized
    with `WordNetLemmatizer`, using the POS returned by `get_wordnet_pos`.
    """
    cleaned = remove_stopwords(remove_punctuations(sentence))
    if not lemmatize:
        return cleaned

    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(token, get_wordnet_pos(token)) for token in cleaned.split()]
    return " ".join(lemmas)


In [None]:
#|export
def process_for_lexical(
    fname: str,  # name of the input txt file
) -> L:  # indices of the sentences that were removed
    """Given an input txt file, return the indices of the removed sentences

    The file is read and split with `make_sentences`; a sentence counts as
    removed when `remove_punc_clean` leaves nothing of it (for example when
    it consists only of stopwords and punctuation)."""
    fname = Path(fname)
    sentences = make_sentences(get_data(fname))
    # Keep the position of every sentence that is empty after cleaning.
    removed_sentences = [
        i for i, sentence in enumerate(sentences)
        if len(remove_punc_clean(sentence)) == 0
    ]
    print("Done processing", fname.name)
    return L(removed_sentences)


In [None]:
data = get_data(fname)
sentences = make_sentences(data)
sentences

(#11) ['MARLEY was dead: to begin with.','There is no doubt whatever about that.','The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner.',"Scrooge signed it: and Scrooge's name was good upon 'Change, for anything he chose to put his hand to.",'Old Marley was as dead as a door-nail.','Mind!',"I don't mean to say that I know, of my own knowledge, what there is particularly dead about a door-nail.",'I might have been inclined, myself, to regard a coffin-nail as the deadest piece of ironmongery in the trade.',"But the wisdom of our ancestors is in the simile; and my unhallowed hands shall not disturb it, or the Country's done for.",'You will therefore permit me to repeat, emphatically, that Marley was as dead as a door-nail.'...]

Let's continue the same example from above

Here, the `remove_punc_clean` function removes punctuation and stopwords, optionally lemmatizes each word (when `lemmatize=True`), and returns the cleaned sentence.

:::{.callout-note}

It is possible that a sentence may be removed completely as it may contain only STOPWORDS.

:::
This method is to be used for methods involving lexical analysis.

Without lemmatization

In [None]:
for sentence in sentences:
    print(remove_punc_clean(sentence))

MARLEY dead begin
doubt whatever
register burial signed clergyman clerk undertaker chief mourner
Scrooge signed Scrooge name good upon Change anything chose put hand
Old Marley dead door nail
Mind
mean say know knowledge particularly dead door nail
might inclined regard coffin nail deadest piece ironmongery trade
wisdom ancestors simile unhallowed hands shall disturb Country done
therefore permit repeat emphatically Marley dead door nail
new sentence


With Lemmatization

In [None]:
for sentence in sentences:
    print(remove_punc_clean(sentence, lemmatize=True))

MARLEY dead begin
doubt whatever
register burial sign clergyman clerk undertaker chief mourner
Scrooge sign Scrooge name good upon Change anything chose put hand
Old Marley dead door nail
Mind
mean say know knowledge particularly dead door nail
might inclined regard coffin nail deadest piece ironmongery trade
wisdom ancestor simile unhallowed hand shall disturb Country do
therefore permit repeat emphatically Marley dead door nail
new sentence


In [None]:
clean('../files/dummy.txt')

dummy.txt contains 11 sentences


In [None]:
process_for_lexical('../files/dummy.txt')

Done processing dummy.txt


(#0) []

In [None]:
Path('../files/').ls()

(#2) [Path('../files/dummy.txt'),Path('../files/dummy_cleaned.txt')]

In [None]:
#| export
def num_words(
    sentence: str,  # input sentence
) -> int:  # number of words
    "Count the whitespace-separated tokens left after `remove_punctuations`"
    tokens = remove_punctuations(sentence).split()
    return len(tokens)


In [None]:
# NOTE(review): this cell repeats the `num_words` definition from the
# previous (exported) cell; the later definition shadows the earlier one.
def num_words(
    sentence: str,  # input sentence
) -> int:  # number of words
    "Count the whitespace-separated tokens left after `remove_punctuations`"
    tokens = remove_punctuations(sentence).split()
    return len(tokens)


In [None]:
print(sentences[1])
num_words(sentences[1])

There is no doubt whatever about that.


7

### Patches to `pathlib.Path`

In addition to the utility functions above, three properties are patched onto `pathlib.Path`: `shape` for a path that points to a `.npy` array, and `text` and `sentences` for a path that points to a `.txt` file.

In [None]:
#|export
@patch(as_prop=True)
def shape(self: Path):
    """Shape of the array stored in a `.npy` file

    The array is read with `np.load` and its `.shape` is returned. Raises
    `AssertionError` when the path does not end with `.npy`."""
    if str(self).endswith(".npy"):
        return np.load(self).shape
    raise AssertionError("not a npy array")


In [None]:
#|hide
from fastcore.utils import working_directory

In [None]:
#|local
with working_directory('/home/deven'):
    p = 'test.npy'
    arr = np.load(p)
arr.shape

(100, 100)

Instead of all of that, I can just call `Path().shape`, like this

In [None]:
#| local
with working_directory('/home/deven'): 
    shp = Path('test.npy').shape
    test_eq(arr.shape, Path('test.npy').shape)

In [None]:
#|hide
%cd ../files/

/home/deven/projects/clean_plot/files


In [None]:
#|export
@patch(as_prop=True)
def text(self: Path):
    """Contents of a `.txt` file as a single string

    Raises `AssertionError` when the path does not end with `.txt`."""
    if str(self).endswith(".txt"):
        with open(self) as f:
            return f.read()
    raise AssertionError("not a txt file")


In [None]:
Path('dummy.txt').text

"MARLEY was dead: to begin with. There is no doubt\nwhatever about that. The register of his burial was\nsigned by the clergyman, the clerk, the undertaker,\nand the chief mourner. Scrooge signed it: and\nScrooge's name was good upon 'Change, for anything he\nchose to put his hand to. Old Marley was as dead as a\ndoor-nail.\n\nMind! I don't mean to say that I know, of my\nown knowledge, what there is particularly dead about\na door-nail. I might have been inclined, myself, to\nregard a coffin-nail as the deadest piece of ironmongery\nin the trade. But the wisdom of our ancestors\nis in the simile; and my unhallowed hands\nshall not disturb it, or the Country's done for. You\nwill therefore permit me to repeat, emphatically, that\nMarley was as dead as a door-nail.\n\nThis is a new sentence."

In [None]:
#|export
@patch(as_prop=True)
def sentences(self: Path):
    """Sentences contained in a `.txt` file

    Files ending in `_cleaned.txt` are split on newlines with
    `split_by_newline`; any other `.txt` file goes through `make_sentences`.
    Raises `AssertionError` when the path does not end with `.txt`."""
    filename = str(self)
    if not filename.endswith(".txt"):
        raise AssertionError("not a txt file")
    if filename.endswith("_cleaned.txt"):
        return split_by_newline(self.text)
    return make_sentences(self.text)

In [None]:
# NOTE(review): this cell repeats the `sentences` patch from the previous
# (exported) cell; the later definition replaces the earlier one.
@patch(as_prop=True)
def sentences(self: Path):
    """Sentences contained in a `.txt` file

    Files ending in `_cleaned.txt` are split on newlines with
    `split_by_newline`; any other `.txt` file goes through `make_sentences`.
    Raises `AssertionError` when the path does not end with `.txt`."""
    filename = str(self)
    if not filename.endswith(".txt"):
        raise AssertionError("not a txt file")
    if filename.endswith("_cleaned.txt"):
        return split_by_newline(self.text)
    return make_sentences(self.text)

In [None]:
#|hide
from nbdev import nbdev_export; nbdev_export()