In [None]:
# default_exp functions

# Functions
> This module includes the majority of the functions

In [None]:
#hide
from clean_plot import * 
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#export
#hide
import os
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
import unidecode
import re
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package punkt to C:\Users\Deven
[nltk_data]     Mistry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Deven
[nltk_data]     Mistry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Deven Mistry\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Deven
[nltk_data]     Mistry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Methods

In [None]:
#export
def normalize(data):
    """
    Rescale the values of an array or matrix linearly into the range [0, 1].

    Args:
        data (ndarray): any 1-D, or 2-D numpy array (a plain list of
            numbers works as well)

    Returns:
        (ndarray): normalized ndarray; the smallest value maps to 0 and the
        largest to 1. When all values are identical the value range is zero,
        so an array of zeros with the shape of `data` is returned instead of
        an array of NaNs.

    Raises:
        ValueError: if `data` is empty (raised by `np.min`).
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    if np.all(span == 0):
        # Constant input: dividing by the zero range would give NaN everywhere.
        return np.zeros(np.shape(data), dtype=float)
    return (data - lowest) / span

In [None]:
# A plain Python list is accepted; the result shown below is an ndarray.
normalize([1,2,3,4,5])

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [None]:
#export
def split_by_newline(all):
    """
    Break an already sentence-tokenized string into lowercase lines.

    Use this only when every sentence already sits on its own line;
    empty lines are dropped.

    Args:
        all (str): tokenized text whose sentences are separated by '\\n'

    Returns:
        list: the non-empty lines of `all`, lowercased
    """
    lowered_lines = []
    for piece in all.split('\n'):
        if piece:
            lowered_lines.append(piece.lower())
    return lowered_lines

In [None]:
# Named `text` rather than `all` so the builtin `all()` is not overwritten
# in the notebook namespace for every later cell.
text = "Hello there!\nThis is how this functions works!"
split_by_newline(text)

['hello there!', 'this is how this functions works!']

In [None]:
#export
def rm_useless_spaces(t):
    "Collapse every run of two or more spaces in `t` into a single space"
    return re.sub(' {2,}', ' ', t)

In [None]:
# Runs of two or more spaces collapse to one; the single space before the final '.' is kept.
rm_useless_spaces('This is      test sentence.  This removes  all the extra  spaces .')

'This is test sentence. This removes all the extra spaces .'

In [None]:
#export
def make_sentences(all):
    """
    Turn raw text into a list of sentences.

    Newlines become spaces, repeated spaces are collapsed, surrounding
    whitespace is stripped and the text is passed through `unidecode`
    before being handed to `sent_tokenize`.

    Args:
        all (str): raw text, possibly spanning several lines

    Returns:
        list: the sentences produced by `sent_tokenize`
    """
    flattened = rm_useless_spaces(all.replace('\n', ' ')).strip()
    return sent_tokenize(unidecode.unidecode(flattened))

In [None]:
#export
def write_to_file_cleaned(sentences, fname):
    """
    Write one sentence per line to `<fname without extension>_cleaned.txt`.

    Args:
        sentences (iterable of str): lines to write; a newline is appended
            to each one
        fname (str): path of the source file; its extension, whatever its
            length, is replaced by `_cleaned.txt`
    """
    # splitext removes the actual extension instead of blindly dropping the
    # last four characters, so names such as 'notes.md' or 'book' work too.
    stem, _ = os.path.splitext(fname)
    with open(stem + '_cleaned.txt', 'w') as f:
        # The with-block closes the file; no explicit close() is needed.
        f.writelines(line + '\n' for line in sentences)

In [None]:
#export
def clean(fname):
    """
    Read `fname`, split its text into sentences and save them to disk.

    The number of sentences is printed, then the sentences are handed to
    `write_to_file_cleaned` together with `fname`.

    Args:
        fname (str): name of the .txt file to clean
    """
    sentences = make_sentences(get_data(fname))
    # Drop the last four characters (the '.txt' suffix) for the printed label.
    label = fname[:-4].title()
    print(label + ' contains {} sentences'.format(len(sentences)))
    write_to_file_cleaned(sentences, fname)

All the functions defined above are combined into a single function called `clean`. 
You only need to call it with the name of the .txt file that you want to clean.

In [None]:
# NOTE(review): `get_data` is not defined in the cells shown here — presumably it
# is provided by `from clean_plot import *`; confirm.
fname = 'dummy.txt'
get_data(fname)

"MARLEY was dead: to begin with. There is no doubt\nwhatever about that. The register of his burial was\nsigned by the clergyman, the clerk, the undertaker,\nand the chief mourner. Scrooge signed it: and\nScrooge's name was good upon 'Change, for anything he\nchose to put his hand to. Old Marley was as dead as a\ndoor-nail.\n\nMind! I don't mean to say that I know, of my\nown knowledge, what there is particularly dead about\na door-nail. I might have been inclined, myself, to\nregard a coffin-nail as the deadest piece of ironmongery\nin the trade. But the wisdom of our ancestors\nis in the simile; and my unhallowed hands\nshall not disturb it, or the Country's done for. You\nwill therefore permit me to repeat, emphatically, that\nMarley was as dead as a door-nail."

The raw text above is turned into the following list of sentences:

In [None]:
# Same file as above: newlines are flattened and the text is split into sentences.
make_sentences(get_data(fname))

['MARLEY was dead: to begin with.',
 'There is no doubt whatever about that.',
 'The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner.',
 "Scrooge signed it: and Scrooge's name was good upon 'Change, for anything he chose to put his hand to.",
 'Old Marley was as dead as a door-nail.',
 'Mind!',
 "I don't mean to say that I know, of my own knowledge, what there is particularly dead about a door-nail.",
 'I might have been inclined, myself, to regard a coffin-nail as the deadest piece of ironmongery in the trade.',
 "But the wisdom of our ancestors is in the simile; and my unhallowed hands shall not disturb it, or the Country's done for.",
 'You will therefore permit me to repeat, emphatically, that Marley was as dead as a door-nail.']

The `clean` function writes these sentences into a txt file with the name `<fname>_cleaned.txt` 

## Let's move on to further cleaning

Let us first create a set of STOPWORDS which we can use

In [None]:
#export
# Exported because `remove_stopwords` (an exported function) reads this module-level set.
STOPWORDS = set(stopwords.words('english'))

In [None]:
#export
def get_wordnet_pos(word):
    """
    Map the POS tag of `word` to the WordNet constant that lemmatize() accepts.

    Args:
        word (str): a single word; it is tagged on its own via `nltk.pos_tag`

    Returns:
        the matching `wordnet` POS constant (ADJ, NOUN, VERB or ADV);
        `wordnet.NOUN` when the tag's first letter is none of J/N/V/R
    """
    # pos_tag returns a list of (word, tag) pairs; only the tag's first letter matters.
    _, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return first_letter_to_pos.get(initial, wordnet.NOUN)

In [None]:
#export
def remove_stopwords(text, stop_words=None):
    """
    Drop stop words from a whitespace-separated string.

    Args:
        text (str): text to filter; it is split on whitespace and words are
            compared exactly (case-sensitive)
        stop_words (collection of str, optional): words to remove.
            Defaults to the module-level `STOPWORDS` set.

    Returns:
        str: the remaining words joined by single spaces
    """
    if stop_words is None:
        # Fall back to the shared set so existing one-argument calls behave as before.
        stop_words = STOPWORDS
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
#export
def remove_punc_clean(sentence):
    """
    Normalise a sentence for lexical analysis.

    Every run of characters other than ASCII letters, digits and spaces is
    replaced by a space, whitespace is collapsed, the text is lowercased,
    stop words are removed via `remove_stopwords`, and each remaining word
    is lemmatized with the POS returned by `get_wordnet_pos`.

    Args:
        sentence (str): the sentence to clean

    Returns:
        str: the cleaned sentence; empty when no words remain after
        stop-word removal
    """
    non_alnum = re.compile(r'[^a-zA-Z0-9 ]+')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    whitespace = re.compile(r'\s+')

    doc = non_alnum.sub(' ', sentence)
    doc = whitespace.sub(' ', doc)
    doc = doc.strip().lower()

    doc = remove_stopwords(doc)

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in doc.split())

In [None]:
def process(fname):
    """
    Clean every sentence of `fname` and report which ones became empty.

    The file's text is passed through `unidecode`, split with
    `make_sentences`, and each sentence is cleaned with `remove_punc_clean`.

    Args:
        fname (str): name of the file to process

    Returns:
        list: indices (positions in the sentence list) of the sentences
        whose cleaned form is empty
    """
    text = unidecode.unidecode(get_data(fname))
    kept, dropped = [], []
    for idx, cleaned in enumerate(map(remove_punc_clean, make_sentences(text))):
        if cleaned:
            kept.append(cleaned)
        else:
            dropped.append(idx)

    # `kept` is collected but not written anywhere: the writer call is disabled.
    # write_to_file_lexical(kept, fname)
    print('Done processing', fname)
    return dropped

Let's continue the same example from above

In [None]:
# Rebuild the sentence list for `fname` (set in an earlier cell) for the examples below.
data = get_data(fname)
sentences = make_sentences(data)
sentences

['MARLEY was dead: to begin with.',
 'There is no doubt whatever about that.',
 'The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner.',
 "Scrooge signed it: and Scrooge's name was good upon 'Change, for anything he chose to put his hand to.",
 'Old Marley was as dead as a door-nail.',
 'Mind!',
 "I don't mean to say that I know, of my own knowledge, what there is particularly dead about a door-nail.",
 'I might have been inclined, myself, to regard a coffin-nail as the deadest piece of ironmongery in the trade.',
 "But the wisdom of our ancestors is in the simile; and my unhallowed hands shall not disturb it, or the Country's done for.",
 'You will therefore permit me to repeat, emphatically, that Marley was as dead as a door-nail.']

Here, the `remove_punc_clean` function removes punctuation and STOPWORDS, lemmatizes each remaining word, and returns the cleaned sentence. 

**NOTE** A sentence may be removed completely if it contains only STOPWORDS. 
This function is intended for methods involving lexical analysis.

In [None]:
# Print the cleaned form of each sentence, one per line.
for sentence in sentences:
    print(remove_punc_clean(sentence))

marley dead begin
doubt whatever
register burial sign clergyman clerk undertaker chief mourner
scrooge sign scrooge name good upon change anything chose put hand
old marley dead door nail
mind
mean say know knowledge particularly dead door nail
might inclined regard coffin nail deadest piece ironmongery trade
wisdom ancestor simile unhallowed hand shall disturb country do
therefore permit repeat emphatically marley dead door nail
