In [None]:
#|default_exp functions

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

# Functions

> This includes the majority of the functions for cleaning text files.

- hide_colab_badge: true

In [None]:
#|hide
from clean_plot.core import *
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#|export
from clean_plot.core import *
from pathlib import Path
import os
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import unidecode
import re

from fastcore.foundation import L
from fastcore.test import test_eq, test_ne
from fastcore.script import call_parse

In [None]:
#|export
from typing import Callable, Iterator, Union, Optional, List

In [None]:
#|export
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

## Methods

In [None]:
#|export
def download_ntlk_dep():
    """
    Fetches every `nltk` resource this module depends on
    """
    # Resources are requested one after another, in the order listed here
    resources = ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4')
    for resource in resources:
        nltk.download(resource)

In [None]:
#|export
def normalize(data: np.ndarray) -> np.ndarray:
    """
    Min-max scales an array or matrix so that its values lie between 0 and 1

    Args:
        data (ndarray): any 1-D or 2-D numpy array (array-likes such as
            lists are converted with `np.asarray`)

    Returns:
        (ndarray): array with the same shape as `data`, scaled by its global
        minimum and maximum. All zeros when every value is identical and an
        empty float array when `data` is empty.
    """
    arr = np.asarray(data)
    if arr.size == 0:
        # np.min / np.max raise on empty input and there is nothing to scale
        return arr.astype(float)
    lo = np.min(arr)
    span = np.max(arr) - lo
    if span == 0:
        # Constant input: dividing by the zero range would produce NaN
        return np.zeros(arr.shape, dtype=float)
    return (arr - lo) / span

In [None]:
normalize([1,2,3,4,5])

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [None]:
test_eq(normalize([1,2,3,4,5]), [0.  , 0.25, 0.5 , 0.75, 1.  ])

In [None]:
#|export
def split_by_newline(text: str) -> List[str]:
    """
    Breaks already sentence-tokenized text into its non-empty lines

    Args:
        text (str): tokenized string to be split by '\\n'

    Returns:
        list: the pieces of `text` between '\\n' characters that contain
        at least one character
    """
    # Empty strings are falsy, so this keeps exactly the lines with len > 0
    non_empty = [chunk for chunk in text.split('\n') if chunk]
    return L(non_empty)

In [None]:
text = "Hello there!\nThis is how this functions works!"
split_by_newline(text)

(#2) ['Hello there!','This is how this functions works!']

In [None]:
#|export
def rm_useless_spaces(t: str) -> str:
    """
    Collapses every run of two or more spaces into one and trims both ends
    """
    collapsed = re.sub(' {2,}', ' ', t)
    return collapsed.strip()

In [None]:
rm_useless_spaces('  This is      test sentence.  This removes  all the extra  spaces.  ')

'This is test sentence. This removes all the extra spaces.'

In [None]:
#|export
def make_sentences(text: str) -> List[str]:
    """
    Converts given bulk text into sentences

    Newlines are replaced by spaces, repeated spaces are collapsed, the text
    is passed through `unidecode` and finally split with `sent_tokenize`.

    Args:
        text (str): raw text, possibly spanning several lines

    Returns:
        list: the sentences found in `text`, wrapped in a fastcore `L`
    """
    try:
        # Probe the tokenizer so that missing nltk data is detected up front
        sent_tokenize('')
    except LookupError:
        # nltk reports missing resources with LookupError; fetch them and carry on
        print('nltk resources are missing, running download_ntlk_dep()')
        download_ntlk_dep()
    all_cleaned = text.replace('\n', ' ')
    # rm_useless_spaces also trims leading/trailing whitespace
    all_cleaned = rm_useless_spaces(all_cleaned)
    all_cleaned = unidecode.unidecode(all_cleaned)
    return L(sent_tokenize(all_cleaned))

In [None]:
#|export
def write_to_file_cleaned(sentences: List[str], fname: Union[str, Path]) -> None:
    """
    Writes the sentences to a .txt file, one sentence per line

    The output file is named `<stem of fname>_cleaned.txt`. Only the stem of
    `fname` is used, so the file is created relative to the current working
    directory.

    Args:
        sentences (list): sentences to write
        fname (str | Path): name or path of the original file
    """
    # `.stem` only exists on Path objects, so plain strings are converted first
    out_name = f'{Path(fname).stem}_cleaned.txt'
    # The context manager closes the file; no explicit close() is required
    with open(out_name, 'w') as f:
        f.writelines(f'{line}\n' for line in sentences)

In [None]:
#|export
@call_parse
def clean(fname: str) -> None:
    """
    Takes name of a txt file and writes the tokenized sentences into a new txt file
    """
    src = Path(fname)
    # Read the raw text and split it into sentences in one go
    sents = make_sentences(get_data(src))
    print(f'{src.name} contains {len(sents)} sentences')
    # Persist the sentences, one per line
    write_to_file_cleaned(sents, src)

All the functions above are combined into a single function called `clean`. 
You only need to call it with the name of the .txt file that you want to clean.

In [None]:
fname = '../files/dummy.txt'
text = get_data(fname)
print(text)

MARLEY was dead: to begin with. There is no doubt
whatever about that. The register of his burial was
signed by the clergyman, the clerk, the undertaker,
and the chief mourner. Scrooge signed it: and
Scrooge's name was good upon 'Change, for anything he
chose to put his hand to. Old Marley was as dead as a
door-nail.

Mind! I don't mean to say that I know, of my
own knowledge, what there is particularly dead about
a door-nail. I might have been inclined, myself, to
regard a coffin-nail as the deadest piece of ironmongery
in the trade. But the wisdom of our ancestors
is in the simile; and my unhallowed hands
shall not disturb it, or the Country's done for. You
will therefore permit me to repeat, emphatically, that
Marley was as dead as a door-nail.

This is a new sentence.


After sentence tokenization, the text above becomes:

In [None]:
make_sentences(get_data(fname))

(#11) ['MARLEY was dead: to begin with.','There is no doubt whatever about that.','The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner.',"Scrooge signed it: and Scrooge's name was good upon 'Change, for anything he chose to put his hand to.",'Old Marley was as dead as a door-nail.','Mind!',"I don't mean to say that I know, of my own knowledge, what there is particularly dead about a door-nail.",'I might have been inclined, myself, to regard a coffin-nail as the deadest piece of ironmongery in the trade.',"But the wisdom of our ancestors is in the simile; and my unhallowed hands shall not disturb it, or the Country's done for.",'You will therefore permit me to repeat, emphatically, that Marley was as dead as a door-nail.'...]

The `clean` function writes these sentences into a txt file with the name `<fname>_cleaned.txt` 

## Further cleaning

In [None]:
#|export
def get_wordnet_pos(word: str) -> str:
    """Map POS tag to first character lemmatize() accepts

    Args:
        word (str): a single word to tag

    Returns:
        str: one of the `wordnet` POS constants (ADJ, NOUN, VERB, ADV);
        `wordnet.NOUN` when the tag starts with none of J, N, V or R
    """
    try:
        tagged = nltk.pos_tag([word])
    except LookupError:
        # nltk reports missing tagger data with LookupError: fetch the
        # resources and retry instead of tagging every word twice as a probe
        download_ntlk_dep()
        tagged = nltk.pos_tag([word])
    # tagged is [(word, tag)]; only the first letter of the tag is needed
    tag = tagged[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
#|export
def remove_stopwords(sentence: str) -> str:
    """
    Takes a sentence and removes stopwords from it

    Words are obtained by splitting on whitespace and compared
    case-insensitively against the nltk English stopword list.

    Args:
        sentence (str): sentence to filter

    Returns:
        str: the remaining words joined by single spaces
    """
    try:
        stop_set = set(stopwords.words('english'))
    except LookupError:
        # nltk reports a missing corpus with LookupError: download and retry
        download_ntlk_dep()
        stop_set = set(stopwords.words('english'))
    return ' '.join(word for word in sentence.split() if word.lower() not in stop_set)

In [None]:
#|export
def remove_punctuations(sentence: str) -> str:
    """
    Takes a sentence and removes punctuations from it

    Every run of characters other than ASCII letters, digits and spaces is
    replaced by a single space; whitespace runs are then collapsed and the
    result is trimmed.

    Args:
        sentence (str): sentence to clean

    Returns:
        str: the sentence containing only letters, digits and single spaces
    """
    # Raw strings keep regex escapes such as \s from being parsed as
    # (invalid) Python string escapes
    doc = re.sub(r'[^a-zA-Z0-9 ]+', ' ', sentence)
    doc = re.sub(r'\s+', ' ', doc)
    return doc.strip()

In [None]:
#|export
def remove_punc_clean(sentence: str, lemmatize: bool = False) -> str:
    """
    Strips punctuations first and stopwords second from a sentence

    The remaining words are additionally lemmatized when `lemmatize = True`
    """
    cleaned = remove_stopwords(remove_punctuations(sentence))
    if not lemmatize:
        return cleaned

    wnl = WordNetLemmatizer()
    lemmas = []
    for word in cleaned.split():
        # The POS looked up for each word is passed along to the lemmatizer
        lemmas.append(wnl.lemmatize(word, get_wordnet_pos(word)))
    return ' '.join(lemmas)

In [None]:
#|export
def process(fname: str) -> List[int]:
    """
    Reads a txt file, cleans every sentence and reports the ones that end up empty

    Args:
        fname (str): name or path of the txt file to process

    Returns:
        list: positions (within the tokenized sentences) of the sentences
        that are empty after `remove_punc_clean`, wrapped in a fastcore `L`
    """
    fname = Path(fname)
    sentences = make_sentences(get_data(fname))
    # Only the positions of fully removed sentences are kept; the cleaned
    # sentences themselves are not stored or written anywhere by this function
    removed_sentences = [i for i, sentence in enumerate(sentences)
                         if len(remove_punc_clean(sentence)) == 0]
    print('Done processing', fname.name)
    return L(removed_sentences)

Let's continue the same example from above

In [None]:
data = get_data(fname)
sentences = make_sentences(data)
sentences

(#11) ['MARLEY was dead: to begin with.','There is no doubt whatever about that.','The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner.',"Scrooge signed it: and Scrooge's name was good upon 'Change, for anything he chose to put his hand to.",'Old Marley was as dead as a door-nail.','Mind!',"I don't mean to say that I know, of my own knowledge, what there is particularly dead about a door-nail.",'I might have been inclined, myself, to regard a coffin-nail as the deadest piece of ironmongery in the trade.',"But the wisdom of our ancestors is in the simile; and my unhallowed hands shall not disturb it, or the Country's done for.",'You will therefore permit me to repeat, emphatically, that Marley was as dead as a door-nail.'...]

Here, the `remove_punc_clean` function removes punctuation and stopwords, optionally lemmatizes each word (when `lemmatize=True`), and returns the cleaned sentence. 

> Note: A sentence may be removed completely if it contains only stopwords and punctuation. 
This function is intended for methods involving lexical analysis.

Without lemmatization

In [None]:
for sentence in sentences:
    print(remove_punc_clean(sentence))

MARLEY dead begin
doubt whatever
register burial signed clergyman clerk undertaker chief mourner
Scrooge signed Scrooge name good upon Change anything chose put hand
Old Marley dead door nail
Mind
mean say know knowledge particularly dead door nail
might inclined regard coffin nail deadest piece ironmongery trade
wisdom ancestors simile unhallowed hands shall disturb Country done
therefore permit repeat emphatically Marley dead door nail
new sentence


With Lemmatization

In [None]:
for sentence in sentences:
    print(remove_punc_clean(sentence, lemmatize=True))

MARLEY dead begin
doubt whatever
register burial sign clergyman clerk undertaker chief mourner
Scrooge sign Scrooge name good upon Change anything chose put hand
Old Marley dead door nail
Mind
mean say know knowledge particularly dead door nail
might inclined regard coffin nail deadest piece ironmongery trade
wisdom ancestor simile unhallowed hand shall disturb Country do
therefore permit repeat emphatically Marley dead door nail
new sentence


In [None]:
clean('../files/dummy.txt')

dummy.txt contains 11 sentences


In [None]:
process('../files/dummy.txt')

Done processing dummy.txt


(#0) []

In [None]:
Path('../files/').ls()

(#2) [Path('../files/dummy.txt'),Path('../files/dummy_cleaned.txt')]