-
Notifications
You must be signed in to change notification settings - Fork 838
/
normalization.py
104 lines (81 loc) · 3.2 KB
/
normalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 26 20:45:10 2016
@author: DIP
"""
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
# English stopwords read from the NLTK stopwords corpus when this module is
# imported; remove_stopwords() filters tokens against this list.
stopword_list = nltk.corpus.stopwords.words('english')
# Single WordNet lemmatizer instance created at import time and reused by
# lemmatize_text().
wnl = WordNetLemmatizer()
def tokenize_text(text):
    """Split *text* into tokens with ``nltk.word_tokenize``.

    Each token has surrounding whitespace stripped. The tokens are returned
    as a list, in the order the tokenizer produced them.
    """
    return [raw_token.strip() for raw_token in nltk.word_tokenize(text)]
def expand_contractions(text, contraction_mapping):
    """Replace every known contraction in *text* with its expanded form.

    Matching is case-insensitive. The first character of the matched text is
    kept in the result so the capitalisation found in the input survives
    (``"Don't"`` becomes ``"Do not"``). After expansion, every remaining
    apostrophe is removed from the text.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction to its expansion,
            e.g. ``{"can't": "cannot"}``. May be empty.

    Returns:
        The text with contractions expanded and apostrophes stripped.
    """
    # An empty key would make the pattern match the empty string at every
    # position, so such keys are ignored.
    keys = [key for key in contraction_mapping if key]
    if not keys:
        # Nothing to expand; apostrophes are still stripped as usual.
        return re.sub("'", "", text)

    # Regex alternation takes the first alternative that matches, so longer
    # contractions must come first ("can't've" before "can't").
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # Lower-cased view of the mapping, used as a last resort when the matched
    # text differs in case from a mixed-case key (e.g. "i'd" vs. key "I'd").
    folded_mapping = {}
    for key in keys:
        folded_mapping.setdefault(key.lower(), contraction_mapping[key])

    def expand_match(contraction):
        match = contraction.group(0)
        expansion = (contraction_mapping.get(match)
                     or contraction_mapping.get(match.lower())
                     or folded_mapping.get(match.lower()))
        if expansion is None:
            # No usable entry for this match: leave the text unchanged.
            return match
        # Keep the first character as written in the input.
        return match[0] + expansion[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)
from pattern.en import tag
from nltk.corpus import wordnet as wn
# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag *text* and translate each tag to a WordNet part-of-speech constant.

    The text is tagged with ``tag`` (imported from ``pattern.en``). Each
    resulting word is lower-cased and paired with ``wn.ADJ``, ``wn.VERB``,
    ``wn.NOUN`` or ``wn.ADV`` according to whether its tag starts with
    ``J``, ``V``, ``N`` or ``R``; any other tag maps to ``None``.

    Returns:
        A list of ``(lower-cased word, WordNet POS or None)`` tuples.
    """

    def _to_wordnet_pos(tag_label):
        # Only the leading letter of the tag decides the WordNet category.
        if tag_label.startswith('J'):
            return wn.ADJ
        if tag_label.startswith('V'):
            return wn.VERB
        if tag_label.startswith('N'):
            return wn.NOUN
        if tag_label.startswith('R'):
            return wn.ADV
        return None

    tagged_lower_text = []
    for word, tag_label in tag(text):
        tagged_lower_text.append((word.lower(), _to_wordnet_pos(tag_label)))
    return tagged_lower_text
# lemmatize text based on POS tags
def lemmatize_text(text):
    """Lemmatize the words of *text* using their part-of-speech tags.

    Words that ``pos_tag_text`` paired with a WordNet POS are lemmatized with
    the module-level ``wnl`` lemmatizer; words without one are kept as they
    are. The results are joined with single spaces.
    """
    lemmas = []
    for word, wordnet_pos in pos_tag_text(text):
        if wordnet_pos:
            lemmas.append(wnl.lemmatize(word, wordnet_pos))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)
def remove_special_characters(text):
    """Strip every ``string.punctuation`` character from the tokens of *text*.

    The text is tokenized, punctuation characters are deleted from each
    token, tokens that end up empty are dropped, and the rest are joined
    with single spaces.
    """
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned_tokens = [punctuation_pattern.sub('', token)
                      for token in tokenize_text(text)]
    # Tokens made up solely of punctuation are now empty strings; skip them.
    return ' '.join(token for token in cleaned_tokens if token)
def remove_stopwords(text):
    """Remove tokens of *text* that appear in the module-level stopword list.

    The comparison is exact (case-sensitive): a token is dropped only when it
    is equal to an entry of ``stopword_list``.

    Returns:
        The remaining tokens joined with single spaces.
    """
    tokens = tokenize_text(text)
    # Build the lookup set once per call so each membership test is a hash
    # lookup instead of a linear scan over the whole stopword list.
    stopwords = set(stopword_list)
    filtered_tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(filtered_tokens)
def normalize_corpus(corpus, tokenize=False):
    """Normalize every document in *corpus*.

    Each document goes through, in order: contraction expansion (using
    ``CONTRACTION_MAP``), lemmatization, special-character removal and
    stopword removal.

    Args:
        corpus: Iterable of text documents.
        tokenize: When true, each normalized document is returned as a list
            of tokens instead of a single string.

    Returns:
        A list with exactly one entry per input document: the normalized
        string, or its token list when ``tokenize`` is true.
    """
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        # Append either the token list or the string, never both; otherwise
        # the output would hold two entries per document.
        if tokenize:
            normalized_corpus.append(tokenize_text(text))
        else:
            normalized_corpus.append(text)
    return normalized_corpus