# 编写代码清洗数据

和写代码处理异常一样，你也应该学习编写预防型代码来处理意外情况。


In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
def getNgrams(content,n):
    """Return every run of ``n`` consecutive tokens found in ``content``.

    The text is tokenised by splitting on single space characters only, so
    consecutive spaces yield empty-string tokens and other whitespace
    (newlines, non-breaking spaces) stays embedded in the tokens.

    Args:
        content: The raw text to tokenise.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-grams, each itself a list of ``n`` tokens, in the order
        they occur. The list is empty when there are fewer than ``n`` tokens.
    """
    tokens = content.split(" ")
    # Number of start positions from which a full window of n tokens fits.
    window_count = len(tokens) - n + 1
    return [tokens[start:start + n] for start in range(window_count)]
    
# Download the Wikipedia article and parse it. The response is used as a
# context manager so the underlying HTTP connection is closed as soon as
# BeautifulSoup has consumed the markup, instead of being left open.
with urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Extract the text of the div with id "mw-content-text"
# (presumably the article body container -- confirm against the live page).
content = bs.find("div",{"id":"mw-content-text"}).get_text()
# Build 2-grams from the extracted text.
ngrams = getNgrams(content,2)
# Bare expression: in a notebook cell this displays the resulting list.
ngrams

[['General-purpose', 'programming'],
 ['programming', 'language\n\n\nPythonParadigmMulti-paradigm:'],
 ['language\n\n\nPythonParadigmMulti-paradigm:', 'object-oriented,[1]'],
 ['object-oriented,[1]', 'procedural'],
 ['procedural', '(imperative),'],
 ['(imperative),', 'functional,'],
 ['functional,', 'structured,'],
 ['structured,', 'reflectiveDesigned\xa0byGuido'],
 ['reflectiveDesigned\xa0byGuido', 'van'],
 ['van', 'RossumDeveloperPython'],
 ['RossumDeveloperPython', 'Software'],
 ['Software', 'FoundationFirst\xa0appeared20\xa0February'],
 ['FoundationFirst\xa0appeared20\xa0February', '1991;'],
 ['1991;', '31'],
 ['31', 'years'],
 ['years', 'ago\xa0(1991-02-20)[2]Stable'],
 ['ago\xa0(1991-02-20)[2]Stable', 'release3.10.6[3]\xa0\n'],
 ['release3.10.6[3]\xa0\n', ''],
 ['', ''],
 ['', '/'],
 ['/', '2'],
 ['2', 'August'],
 ['August', '2022;'],
 ['2022;', '21'],
 ['21', 'days'],
 ['days', 'ago\xa0(2'],
 ['ago\xa0(2', 'August'],
 ['August', '2022)Preview'],
 ['2022)Preview', 'release3.11.0r

In [3]:
# Report how many 2-grams were produced by the previous cell.
print(f"2-grams count is:{len(ngrams)}")

2-grams count is:11633


In [4]:
'''我们首先用一些正则表达式来移除转义字符(如 \n)，
再把 Unicode 字符过滤掉。可以通过下面的函数对之前输出的结果进行清理:
'''
import re 
def getNgrams(content,n):
    """Return the word n-grams of ``content`` after basic cleaning.

    Cleaning steps, in order:

    1. Newlines and bracketed numeric citation markers such as ``[12]`` are
       replaced by a space, so the words around them stay separate.
    2. Every non-ASCII character is dropped.
    3. The text is split on spaces and empty tokens are discarded.

    Args:
        content: The raw text to clean and tokenise.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams, each a list of ``n`` words, in order of
        occurrence. Empty when fewer than ``n`` words remain.
    """
    # The brackets must be escaped: an unescaped "[[\d+\]]" is a character
    # class that deletes every digit, "+", "[" and "]" in the text instead of
    # only citation markers. The raw string keeps "\d" a regex escape.
    content = re.sub(r"\n|\[\d+\]", " ", content)
    # Round-trip through bytes to strip anything outside the ASCII range.
    content = content.encode("utf-8").decode("ascii", "ignore")
    words = [word for word in content.split(" ") if word != ""]
    return [words[i:i + n] for i in range(len(words) - n + 1)]


现在“清洗任务”列表变得越来越长，并且你还引入了“句子”的概念，使得你的程序变得更加复杂，因此最好把规则都移出来，创建 4 个不同的函数。

In [5]:
from collections import Counter
from time import sleep
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string


class Clean_data:
    """Clean raw text and count the word n-grams it contains.

    An instance stores one sentence, one body of text and the n-gram size.
    Each helper method accepts an optional explicit argument and otherwise
    falls back to the matching instance attribute, so the methods can be
    called with no arguments or chained internally on intermediate values.

    Attributes are documented in ``__init__``.
    """

    # Matches a newline or a bracketed numeric citation marker such as "[12]".
    # The brackets are escaped on purpose: an unescaped "[[\d+\]]" is a
    # character class that would strip every digit, "+", "[" and "]".
    _NOISE_PATTERN = re.compile(r"\n|\[\d+\]")

    def __init__(self,sentence,content,n):
        """Store the inputs.

        Args:
            sentence: A single sentence, used by ``cleanSentence`` when it is
                called without an argument.
            content: The full text to analyse.
            n: Number of words per n-gram.
        """
        self.content = content
        self.sentence = sentence
        self.n = n

    def cleanSentence(self, sentence=None):
        """Split a sentence into cleaned words.

        Each space-separated token has surrounding punctuation and whitespace
        stripped. Tokens of length one or zero are discarded, except the
        words "a" and "i" (case-insensitive).

        Args:
            sentence: The sentence to clean; defaults to ``self.sentence``.

        Returns:
            The list of remaining words, in order.
        """
        if sentence is None:
            sentence = self.sentence
        trimmed = (
            token.strip(string.punctuation + string.whitespace)
            for token in sentence.split(" ")
        )
        return [
            word for word in trimmed
            if len(word) > 1 or word.lower() in ("a", "i")
        ]

    def cleanInput(self, content=None):
        """Split text into sentences and clean each one.

        Newlines and citation markers become spaces, non-ASCII characters are
        dropped, and the result is split into sentences on ". ".

        Args:
            content: The text to clean; defaults to ``self.content``.

        Returns:
            A list with one word list (see ``cleanSentence``) per sentence.
        """
        if content is None:
            content = self.content
        content = self._NOISE_PATTERN.sub(" ", content)
        # Round-trip through bytes to strip anything outside the ASCII range.
        content = content.encode("utf-8").decode("ascii", "ignore")
        return [self.cleanSentence(sentence) for sentence in content.split(". ")]

    def getNgramsFromSentence(self, content=None, n=None):
        """Return every window of ``n`` consecutive items of ``content``.

        Args:
            content: A sliceable sequence, normally a list of words; defaults
                to ``self.content``.
            n: Window size; defaults to ``self.n``.

        Returns:
            A list of slices of length ``n``; empty when ``content`` holds
            fewer than ``n`` items.
        """
        if content is None:
            content = self.content
        if n is None:
            n = self.n
        return [content[i:i + n] for i in range(len(content) - n + 1)]

    def getNgrams(self):
        """Count the word n-grams of ``self.content``.

        N-grams are built per sentence, so none spans a sentence boundary.

        Returns:
            A ``Counter`` mapping each space-joined n-gram to the number of
            times it occurs.
        """
        ngrams = Counter()
        for sentence in self.cleanInput(self.content):
            ngrams.update(
                " ".join(ngram)
                for ngram in self.getNgramsFromSentence(sentence, self.n)
            )
        return ngrams




# 8.2 数据存储后再清洗