In [27]:
import nltk
import re
import string
from pprint import pprint

# Sample corpus: three raw strings containing contractions, punctuation and
# stray symbols; the cells below use it as input for each cleaning step.
corpus = ["The brown fox wasn't that quick and he couldn't win zhe race",
          "Hey that's a great deal!I just bought a phone for $199",
          "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]

<h3>1、文本清洗</h3>

In [14]:
# General-purpose text tokenization helper
def tokenize_text(text):
    """Tokenize ``text`` sentence by sentence.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split with ``nltk.word_tokenize``.

    Returns:
        A list with one list of word tokens per sentence.
    """
    tokens_per_sentence = []
    for sent in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sent))
    return tokens_per_sentence

In [16]:
# Tokenize every document in the corpus (document -> sentences -> word tokens)
token_list = list(map(tokenize_text, corpus))
print(token_list)

[[['The', 'brown', 'fox', 'was', "n't", 'that', 'quick', 'and', 'he', 'could', "n't", 'win', 'zhe', 'race']], [['Hey', 'that', "'s", 'a', 'great', 'deal', '!', 'I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']], [['@', '@', 'You', "'ll", '(', 'learn', ')', 'a', '**lot**', 'in', 'the', 'book', '.'], ['Python', 'is', 'an', 'amazing', 'language', '!'], ['@', '@']]]


In [43]:
# Remove special characters after tokenization
def remove_characters_after_tokenization(tokens):
    """Strip punctuation from already-tokenized text.

    Every character listed in ``string.punctuation`` is deleted from each
    token; tokens that become empty as a result are dropped.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list holding the non-empty cleaned tokens, in their original order.
    """
    punctuation_re = re.compile('[{}]'.format(re.escape(string.punctuation)))

    kept = []
    for tok in tokens:
        stripped = punctuation_re.sub('', tok)
        if stripped:  # a token made only of punctuation is now '' and is skipped
            kept.append(stripped)
    return kept

In [47]:
# Strip punctuation from each tokenized sentence of the first document and
# print the result, one sentence per line.
for tokens in token_list[0]:
    filterd = remove_characters_after_tokenization(tokens)
    print(filterd)

['The', 'brown', 'fox', 'was', 'nt', 'that', 'quick', 'and', 'he', 'could', 'nt', 'win', 'zhe', 'race']


In [52]:
# Clean every sentence of every document; sentences that end up with no
# tokens at all (e.g. one consisting only of symbols) are dropped.
filtered_list_1 = [
    [cleaned
     for cleaned in map(remove_characters_after_tokenization, sentence_tokens)
     if cleaned]
    for sentence_tokens in token_list
]
print(filtered_list_1)

[[['The', 'brown', 'fox', 'was', 'nt', 'that', 'quick', 'and', 'he', 'could', 'nt', 'win', 'zhe', 'race']], [['Hey', 'that', 's', 'a', 'great', 'deal', 'I', 'just', 'bought', 'a', 'phone', 'for', '199']], [['You', 'll', 'learn', 'a', 'lot', 'in', 'the', 'book'], ['Python', 'is', 'an', 'amazing', 'language']]]


In [54]:
# Remove special characters before tokenization
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    """Replace unwanted characters in a raw sentence with single spaces.

    Args:
        sentence: Raw text; leading and trailing whitespace is stripped
            before any substitution takes place.
        keep_apostrophes: When False (default) every character that is not
            an ASCII letter or digit becomes a space. When True only a fixed
            set of symbols is replaced, so apostrophes and ordinary
            punctuation such as "." and "!" survive.

    Returns:
        The sentence with each unwanted character replaced by one space.
        Because stripping happens first, spaces produced by symbols at the
        edges of the sentence remain in the result.
    """
    # NOTE: inside a character class "|" is a literal, so the first pattern
    # also replaces pipe characters in addition to ? $ & * % @ ( ) ~.
    unwanted = r'[?|$|&|*|%|@|(|)|~]' if keep_apostrophes else r'[^a-zA-Z0-9]'
    return re.sub(unwanted, r' ', sentence.strip())

In [55]:
# Default mode: every non-alphanumeric character is replaced by a space
filtered_list_2 = list(map(remove_characters_before_tokenization, corpus))
print(filtered_list_2)

['The brown fox wasn t that quick and he couldn t win zhe race', 'Hey that s a great deal I just bought a phone for  199', '  You ll  learn  a   lot   in the book  Python is an amazing language   ']


In [59]:
# Symbol-only cleaning: apostrophes are kept so contractions stay intact
cleaned_corpus = [
    remove_characters_before_tokenization(doc, keep_apostrophes=True)
    for doc in corpus
]
print(cleaned_corpus)

["The brown fox wasn't that quick and he couldn't win zhe race", "Hey that's a great deal!I just bought a phone for  199", "  You'll  learn  a   lot   in the book. Python is an amazing language!  "]


In [67]:
# Expand contractions
from contractions import contractions_dict

def expand_contractions(sentence, contraction_mapping):
    """Replace every contraction in ``sentence`` with its expanded form.

    Matching is case-insensitive, and the case of the contraction's first
    character is carried over to the expansion ("You'll" -> "You will").

    Args:
        sentence: Text to process.
        contraction_mapping: Dict mapping a contraction (e.g. "can't") to its
            expansion (e.g. "cannot"). Keys are treated as literal text, not
            as regular expressions.

    Returns:
        ``sentence`` with all known contractions expanded. The input is
        returned unchanged when the mapping has no usable keys.
    """
    # An empty alternation would match the empty string at every position,
    # so without keys there is nothing to substitute.
    keys = [key for key in contraction_mapping if key]
    if not keys:
        return sentence

    # Longest keys first so "can't've" is not shadowed by its prefix "can't";
    # escaping keeps characters such as "." from acting as regex operators.
    alternation = '|'.join(
        re.escape(key) for key in sorted(keys, key=len, reverse=True)
    )
    contractions_pattern = re.compile('({})'.format(alternation), flags=re.IGNORECASE)

    # Case-folded fallback for matches whose exact spelling is not a key
    # (e.g. "i'm" when only "I'm" is listed).
    folded_mapping = {}
    for key in keys:
        folded_mapping.setdefault(key.lower(), contraction_mapping[key])

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if expanded_contraction is None:
            expanded_contraction = contraction_mapping.get(match.lower())
        if expanded_contraction is None:
            expanded_contraction = folded_mapping.get(match.lower())
        if not expanded_contraction:
            # No usable expansion: leave the text exactly as it was.
            return match

        first_char = match[0]
        head, tail = expanded_contraction[0], expanded_contraction[1:]
        # Transfer only the *case* of the first character; copying the
        # character itself would turn "ain't" -> "is not" into "as not".
        if first_char.isupper():
            head = head.upper()
        elif first_char.islower():
            head = head.lower()
        return head + tail

    return contractions_pattern.sub(expand_match, sentence)

In [68]:
# Expand the contractions in every cleaned document
expanded_corpus = [
    expand_contractions(doc, contractions_dict)
    for doc in cleaned_corpus
]
print(expanded_corpus)

['The brown fox was not that quick and he could not win zhe race', 'Hey that is a great deal!I just bought a phone for  199', '  You will  learn  a   lot   in the book. Python is an amazing language!  ']
