In [1]:
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import csv
import yaml

In [2]:
# Stemming
word_stemmer = PorterStemmer()
print(word_stemmer.stem('writing'))
print(word_stemmer.stem('eating'))

Lanc_stemmer = LancasterStemmer()
print(Lanc_stemmer.stem('eats'))

Reg_stemmer = RegexpStemmer('ing')
print(Reg_stemmer.stem('eating'))

SnowballStemmer.languages
French_stemmer = SnowballStemmer('french')
print(French_stemmer.stem('Bonjoura'))

write
eat
eat
eat
bonjour


In [3]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('eating'))
print(lemmatizer.lemmatize('books'))
print(lemmatizer.lemmatize('bookes'))
print(lemmatizer.lemmatize('bookies'))

eating
book
bookes
bookie


In [4]:
# Word Replacement
class REReplacer(object):
    def __init__(self, pattern=[(r'won\'t', 'will not'), (r'can\'t', 'cannot'),
                                (r'i\'m', 'i am'), (r'(\w+)\'ll', 'g<1> will'),
                                (r'(\w+)n\'t', 'g<1> not'), (r'(\w+)\'ve', 'g<1> have'),
                                (r'(\w+)\'s', 'g<1> is'), (r'(\w+)\'re', 'g<1> are')]):
        self.pattern = [(re.compile(regex), repl) for (regex, repl) in pattern]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.pattern:
            s = re.sub(pattern, repl, s)
        return s

rep_word = REReplacer()
print(rep_word.replace("I won't do it"))

I will not do it


In [5]:
from nltk.tokenize import word_tokenize
rep_word = REReplacer()
print(word_tokenize("I won't be able to do this now"))
print(word_tokenize(rep_word.replace("I won't be able to do this now")))

['I', 'wo', "n't", 'be', 'able', 'to', 'do', 'this', 'now']
['I', 'will', 'not', 'be', 'able', 'to', 'do', 'this', 'now']


In [6]:
class Rep_word_removal(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

rep_word = Rep_word_removal()
print(rep_word.replace("Hiiiiiiiiiiiiiiiiii"))
print(rep_word.replace('Helloooooooooooooo'))

Hi
Hello


In [7]:
class word_syn_replacer(object):
    def __init__(self, word_map):
        self.word_map = word_map

    def replace(self, word):
        return self.word_map.get(word, word)

rep_syn = word_syn_replacer({'bday': 'birthday'})
print(rep_syn.replace('bday'))

birthday


In [8]:
class CSVword_syn_replacer(word_syn_replacer):
    def __init__(self, fname):
        word_map = {}
        for line in csv.reader(open(fname)):
            word, syn = line
            word_map[word] = syn
        super(CSVword_syn_replacer, self).__init__(word_map)

rep_syn = CSVword_syn_replacer('syn.csv')
print(rep_syn.replace('bday'))

FileNotFoundError: [Errno 2] No such file or directory: 'syn.csv'

In [9]:
class YAMLword_syn_replacer(word_syn_replacer):
    def __init__(self, fname):
        word_map = yaml.load(open(fname))
        super(YAMLword_syn_replacer, self).__init__(word_map)

rep_syn = YAMLword_syn_replacer('syn.yaml')
print(rep_syn.replace('bday'))

FileNotFoundError: [Errno 2] No such file or directory: 'syn.yaml'

In [10]:
class word_antonym_replacer(object):
    def replace(self, word, pos=None):
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []
        while i < l:
            word = sent[i]
            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])
                if ant:
                    words.append(ant)
                i += 2
                continue
            words.append(word)
            i += 1
        return words

rep_antonym = word_antonym_replacer()
print(rep_antonym.replace('uglify'))

sentence = ["Let us", "not", "uglify", "our", "country"]
print(rep_antonym.replace_negations(sentence))

beautify
['Let us', 'beautify', 'our', 'country']
