In [2]:
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Data Cleaning

## HTMLの解析とクリーニング
Stack Overflowのコンテンツは, questionクラスに質問, answerクラスに回答が記載されている.  このプログラムではurllibを用いてページを取得し, BeautifulSoupを用いてHTMLから質問と回答を抽出する処理を行う.

![image.png](attachment:de61de13-73f7-463f-aee9-8a7f74884fba.png)

![image.png](attachment:178239fb-b13c-49be-a2f9-5b21f61fb3e1.png)

In [7]:
# Page to scrape
url = "https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python"
# Fetch the raw HTML. The context manager closes the HTTP response once the
# body has been read, even if read() raises.
with urlopen(url) as response:
    html = response.read()
# Build the parse tree with Python's built-in HTML parser
soupified = BeautifulSoup(html, "html.parser")


def _find_post(soup, post_class):
    """Locate a post container and its body inside ``soup``.

    Args:
        soup: parsed page to search.
        post_class: CSS class of the post container (``"question"`` or
            ``"answer"`` in this notebook).

    Returns:
        ``(post, body)`` where ``post`` is the first ``div`` matching
        ``post_class`` and ``body`` is the first ``div`` with class
        ``"s-prose js-post-body"`` nested inside it.

    Raises:
        ValueError: if either element cannot be found, e.g. because the page
            markup differs from what this cell expects.
    """
    post = soup.find("div", {"class": post_class})
    if post is None:
        raise ValueError(f"no <div class={post_class!r}> found in the page")
    body = post.find("div", {"class": "s-prose js-post-body"})
    if body is None:
        raise ValueError(f"no post body found inside <div class={post_class!r}>")
    return post, body


# Extract the question
question, questiontext = _find_post(soupified, "question")
print("Question:\n", questiontext.get_text().strip())

# Extract the first answer that appears on the page
answer, answertext = _find_post(soupified, "answer")
print("Best Answer:\n", answertext.get_text().strip())

Question:
 What is the module/method used to get the current time?
Best Answer:
 Use:
>>> import datetime
>>> datetime.datetime.now()
datetime.datetime(2009, 1, 6, 15, 8, 24, 78915)

>>> print(datetime.datetime.now())
2009-01-06 15:08:24.789150

And just the time:
>>> datetime.datetime.now().time()
datetime.time(15, 8, 24, 78915)

>>> print(datetime.datetime.now().time())
15:08:24.789150

See the documentation for more information.
To save typing, you can import the datetime object from the datetime module:
>>> from datetime import datetime

Then remove the leading datetime. from all of the above.


## Unicodeの正規化

HTMLには記号や絵文字を代表とするUnicode文字が現れる. これらの文字はそのままではパイプラインにおいてエラーの原因になるため, コンピュータが扱えるバイト列に変換する正規化処理を行う. この処理をテキストエンコーディングといい, 以下の例ではUTF-8でエンコードする.

In [8]:
# Turn a sentence containing emoji into its UTF-8 byte representation.
text = "I love Pizza 🍕 ! Shall we book a cab 🚙 to gizza?".encode(encoding="utf-8")
# Bare expression so the notebook displays the resulting bytes object.
text

b'I love Pizza \xf0\x9f\x8d\x95 ! Shall we book a cab \xf0\x9f\x9a\x99 to gizza?'

## スペル修正
スペル修正として省略表記を直すことと, 指が太い人がタイプミスをするファットフィンガー問題に対する対応を行う. ここではスペル修正APIとしてBing Spell Check APIを用いる.  
Bing Spell Check APIの参考文献  
https://docs.microsoft.com/en-us/bing/search-apis/bing-spell-check/quickstarts/rest/python  
Microsoft Azure > リソース管理 > クイックスタート

In [25]:
import json
import os

import requests

# Read the subscription key from the environment so a real key never has to be
# saved in the notebook; fall back to the placeholder when the variable is unset.
api_key = os.environ.get("BING_SPELL_CHECK_API_KEY", "<ENTER_API_KEY>")
endpoint = "https://api.bing.microsoft.com/v7.0/SpellCheck"

example_text = "Hollo, wrld"  # text to spell-check
data = {"text": example_text}
params = {"mkt": "en-us",
          "mode": "proof"}

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'Ocp-Apim-Subscription-Key': api_key,
}
# requests has no default timeout, so without one a stalled connection would
# block this cell forever.
response = requests.post(endpoint, headers=headers, params=params, data=data, timeout=10)
# Fail loudly on 4xx/5xx (e.g. an invalid key) instead of pretty-printing an
# error payload as if it were a spell-check result.
response.raise_for_status()
json_response = response.json()
print(json.dumps(json_response, indent=4))

{
    "_type": "SpellCheck",
    "flaggedTokens": [
        {
            "offset": 0,
            "token": "Hollo",
            "type": "UnknownToken",
            "suggestions": [
                {
                    "suggestion": "Hello",
                    "score": 0.9086817909466962
                },
                {
                    "suggestion": "Hollow",
                    "score": 0.8172678761353781
                }
            ]
        },
        {
            "offset": 7,
            "token": "wrld",
            "type": "UnknownToken",
            "suggestions": [
                {
                    "suggestion": "world",
                    "score": 0.9086817909466962
                }
            ]
        }
    ]
}


## システム固有の誤り訂正
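このプログラムではPDFを画像化したデータからOCR (pytesseract) を用いて文字列を抽出する. OCRの出力には誤認識が含まれることがあり (例: 下の出力では文頭の「The」が「Tue」と認識されている), こうしたシステム固有の誤りが訂正の対象となる.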

In [1]:
from PIL import Image
from pytesseract import image_to_string

In [3]:
fname = "OpenSource.png"
# Open the image in a context manager so the underlying file handle is
# released as soon as OCR has finished.
with Image.open(fname) as image:
    text = image_to_string(image)
# Bare expression so the notebook displays the extracted string.
text

'Tue Belgian sociologist, Waxweiler, once said it was not the task of\nsociology to explain what “society” is. May I venture in the same\ndirection and say it is not the task of linguistics to say what “ language ” is.\n“ Personality ” is perhaps more manageable, though I do not propose\nto say in existential terms what that is either. Some understanding of\nthe relations suggested by the title, however, is attainable in the light of\nsociology, psychology, biology, and descriptive linguistics.\n\x0c'