# Rule 3: If it is possible to cut a word out, always cut it out.

This list of unnecessary words comes from the Purdue Online Writing Lab articles on [eliminating words](https://owl.english.purdue.edu/owl/resource/572/02/) and [avoiding common pitfalls](https://owl.english.purdue.edu/owl/resource/572/04/). Here, we are simply calculating the number of occurrences of removable words and putting the counts in a nice data frame.

In [2]:
import re
import pandas as pd
import pprint as pp


# Path of the comma-separated file holding the removable words and phrases;
# it is passed to load_csv() wherever the phrase list is needed.
unnecessary_words_csv = 'unnecessary_words.csv'

def load_csv(filename):
    """Read a comma-separated list of phrases from ``filename``.

    The whole file is treated as one comma-delimited list. Surrounding
    whitespace (including the trailing newline most editors append) is
    stripped from every entry, and empty entries are dropped: an empty
    phrase used as a regex pattern would match at every position.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of phrase strings, or ``None`` (after printing a
        ``'Bad filename ...'`` message) if the file cannot be opened or read.
    """
    try:
        # The context manager closes the handle even if reading fails.
        with open(filename) as f:
            contents = f.read()
    except OSError:
        # Best effort: report the problem and let the caller deal with None.
        pp.pprint('Bad filename ' + filename)
        return None
    stripped = (word.strip() for word in contents.split(','))
    return [word for word in stripped if word]

def regex_for_word(word):
    """Turn a phrase into a regex pattern by expanding its ``*`` wildcards.

    Every ``*`` becomes ``[a-zA-Z]+`` (one or more ASCII letters). All other
    characters pass through unchanged, so they keep whatever meaning they
    have in regex syntax.
    """
    literal_parts = word.split('*')
    return '[a-zA-Z]+'.join(literal_parts)

# Pretty-print the loaded phrase list as a quick check of the CSV contents.
pp.pprint(load_csv(unnecessary_words_csv))

['kind of',
 'sort of',
 'type of',
 'really',
 'basically',
 'for all intents and purposes',
 'definitely',
 'actually',
 'generally',
 'individual',
 'specific',
 'particular',
 'past memories',
 'various differences',
 'each individual *',
 'basic fundamentals',
 'true facts',
 'important essentials',
 'future plans',
 'terrible tragedy',
 'end result',
 'final outcome',
 'free gift',
 'past history',
 'unexpected surprise',
 'sudden crisis',
 'large in size',
 'often times',
 'of a bright color',
 'heavy in weight',
 'period in time',
 'round in shape',
 'at an early time',
 'economics field',
 'of cheap quality',
 'honest in character',
 'of a * condition',
 'in a * state',
 '* in nature',
 '* in degree',
 'of a * type',
 'the reason for',
 'for the reason that',
 'due to the fact that',
 'in light of the fact that',
 'considering the fact that',
 'on the grounds that',
 'this is why',
 'owing to the fact',
 'on the occasion of',
 'in a situation in which',
 'under circumstances i

In [11]:
def remove_quotes_from_text(text):
    """Return ``text`` with double-quoted passages deleted.

    Both straight double quotes and curly double quotes are handled, and
    the quote marks are removed together with their contents. Matching is
    non-greedy, and because ``.`` does not match a newline, only quotations
    that open and close on the same line are removed.
    """
    return re.sub(r'"(.*?)"|“(.*?)”', '', text)

def find_phrases_in_text(text, phrases):
    """Count case-insensitive occurrences of each phrase in ``text``.

    Each phrase is converted to a regex pattern with ``regex_for_word`` and
    matched against ``text`` with ``re.IGNORECASE``.

    Args:
        text: The text to search.
        phrases: Iterable of phrase strings (``*`` acts as a word wildcard).

    Returns:
        A list of ``(phrase, count)`` tuples, in the order of ``phrases``,
        containing only the phrases that occur at least once.
    """
    phrase_list = []
    for phrase in phrases:
        matches = re.findall(regex_for_word(phrase), text, flags=re.IGNORECASE)
        # Test by value, not identity: `count is not 0` relies on small-int
        # caching and raises a SyntaxWarning on Python 3.8+.
        if matches:
            phrase_list.append((phrase, len(matches)))
    return phrase_list

def unnnecessary_phrase_count_in_text(text):
    """Build a table of the unnecessary phrases found in ``text``.

    Loads the phrase list from ``unnecessary_words_csv``, removes quoted
    passages from ``text``, and counts how often each phrase occurs in what
    remains.

    Args:
        text: The text to analyse.

    Returns:
        A ``pandas.DataFrame`` with columns ``PHRASE`` and ``COUNT``, one row
        per phrase that occurs at least once. The frame is empty (but still
        has both columns) when nothing matches or the phrase file could not
        be loaded.
    """
    # load_csv returns None when the file cannot be opened; treat that as
    # "no phrases" instead of failing while iterating over None.
    unnecessary_regexes = load_csv(unnecessary_words_csv) or []
    text = remove_quotes_from_text(text)
    text_phrases = find_phrases_in_text(text, unnecessary_regexes)
    # Name the columns in the constructor: assigning `frame.columns` after
    # the fact raises a length-mismatch ValueError when no phrase was found,
    # because an empty frame has zero columns.
    return pd.DataFrame(text_phrases, columns=['PHRASE', 'COUNT'])

# Run the Rule 3 analysis on a sample article and print the resulting table.
# This article can be found here:
# http://www.newyorker.com/magazine/2008/10/20/late-bloomers-malcolm-gladwell
# The text is read from a local file named gladwell_latebloomers.txt.
with open('gladwell_latebloomers.txt', 'r') as f:
    # rule3_count is the PHRASE/COUNT data frame for the whole article.
    rule3_count = unnnecessary_phrase_count_in_text(f.read())
    print(rule3_count)

      PHRASE  COUNT
0    kind of      5
1    type of      1
2   actually      1
3  generally      1
4   specific      1
