##### Define Pages

In [1]:
# Endpoint that detail_search() POSTs an "org_id" form field to.
source_detail = "https://www.ird.gov.hk/charity/view_detail.php"
# Index page of the site; not referenced by any code visible in this notebook.
source_root = "https://www.ird.gov.hk/eng/tax/ach_index.htm"
# Endpoint that search_page() POSTs a "q" form field to.
source_search = "https://www.ird.gov.hk/charity/search_result.php"

##### Import Dependencies

In [55]:
import requests
from bs4 import BeautifulSoup as bs

In [3]:
def search_page(term, timeout=30):
    """
    POST a search query to the search endpoint (source_search).

    term: value to search for; converted with str() and sent as form field "q".
    timeout: seconds to wait on the server before requests raises
        requests.exceptions.Timeout. requests applies no timeout unless one
        is given, so an unresponsive server would otherwise block the
        kernel indefinitely.

    Returns the requests.Response for the search request.
    """
    data = {"q": str(term)}
    return requests.post(source_search, data=data, timeout=timeout)

In [47]:
def detail_search(orgId, timeout=30):
    """
    POST an organisation id to the detail endpoint (source_detail).

    orgId: organisation id (e.g. "S010351" or "91/04820"); converted with
        str() and sent as form field "org_id".
    timeout: seconds to wait on the server before requests raises
        requests.exceptions.Timeout. requests applies no timeout unless one
        is given, so an unresponsive server would otherwise block the
        kernel indefinitely.

    Returns the requests.Response for the detail request.
    """
    data = {"org_id": str(orgId)}
    return requests.post(source_detail, data=data, timeout=timeout)

In [56]:
def extract_html(response):
    """
    Parse a response's text as HTML and return its <body> element.

    response: object exposing the HTML document as a `.text` string
        (the callers in this notebook pass a requests response).

    Returns the parsed body element (a BeautifulSoup node, not a string),
    so callers can chain .find() / .find_all() on it.

    dependencies: BeautifulSoup as bs
    """
    return bs(response.text, "html.parser").body
    

##### Search with query "crossroads"

In [4]:
# POST the query "crossroads" to the search endpoint and keep the raw response.
r = search_page("crossroads")

In [5]:
# Raw (unparsed) HTML text of the search response.
html = r.text

Show search result for search page

In [53]:
# Run the search and parse the response down to its <body> element.
search_response = search_page("crossroads")
response_body = extract_html(search_response)

# The element with id "search_des" holds the result-count summary text.
response_body.find(id = "search_des").text

'Search result for "crossroads": 5 record(s) found搜尋結果 "crossroads": 共找到5個記錄'

In [40]:
# Debug helper: uncomment to pretty-print the parsed search-result body.
# print(bs(str(response_body), "html.parser").prettify())

#### Parse the org data to extract `orgId`, `english_name`, `chinese_name` and `isSubsiduary`

In [38]:
# Use the parsed search-result body produced by extract_html() in the search
# cell. The name `body` that was used here before is not defined anywhere in
# this notebook, so the cell failed with NameError on a fresh kernel.
orgs = response_body.find_all("form")

for org in orgs:
    # The form's first <input> carries the organisation id in its "value" attribute.
    orgId = org.find("input").attrs.get("value")
    # Children of the form's link: the first is the English name, the last the Chinese name.
    names = org.find("a").contents
    english_name = names[0]
    chinese_name = names[-1]
    # In the sample results, subsidiary ids contain "S" (e.g. S010351) while
    # the parent id (91/04820) does not -- TODO confirm this holds for all ids.
    isSubsiduary = "S" in orgId
    
    print(orgId, english_name, chinese_name, isSubsiduary)

S010351 Crossroads' Global X-perience 環球 X 體驗 True
91/04820 CROSSROADS FOUNDATION LIMITED 國際十字路協會有限公司 False
S006486 Crossroads Global Distribution 十字路會環球分派中心 True
S008168 Crossroads Global Handicrafts 十字路會環球工藝村 True
S008169 Crossroads Global Village 十字路基金會環球村 True


---

Goal: build a table like the following, with one row per organisation and each subsidiary linked to its parent:

| orgId | english_name | chinese_name | effective_date | active | isSubsiduary | parentCompany |
| --- | --- | --- | --- | --- | --- | --- |
| 91/04820 | CROSSROADS FOUNDATION LIMITED | 國際十字路協會有限公司 | 07.01.1997 | true | false | |
| S010353 | GoodCity.hk | 好人好市 | (from parent) 07.01.1997 | true | true | 91/04820 |



In [None]:
##### Look at the detail HTML body for a subsidiary

In [60]:
# subsiduary response
detail_response = detail_search("S010351")
detail_body = extract_html(detail_response)
# detail_body

<table cellpadding="5" cellspacing="5" width="100%">
<!--display its org name as it have grandpa-->
<tr><td align="left" colspan="2">Crossroads' Global X-perience</td></tr>
<tr><td align="left" colspan="2">環球 X 體驗</td></tr>
<tr><td align="center" colspan="2"><hr noshade="" size="1"/><div style="text-align:center">Operated by 由以下團體主辦:</div><hr noshade="" size="1"/></td></tr>
<tr><td align="left" width="200"><b>Name of organization:</b></td><td>CROSSROADS FOUNDATION LIMITED<td></td></td></tr>
<tr><td align="left"><b>慈善團體名字:</b></td><td>國際十字路協會有限公司</td></tr>
<tr><td align="left"><b>Alias:</b></td><td><br/></td></tr>
<tr><td align="left"><b>別名:</b></td><td><br/></td></tr>
<tr><td align="left"><b>Effective Date:</b><br/><b>生效日期:</b></td><td>07.01.1997</td></tr>
</table>

##### Look at the detail HTML body for a parent company

<table cellpadding="5" cellspacing="5" width="100%">
<tr><td align="left" width="200"><b>Name of organization:</b></td><td>CROSSROADS FOUNDATION LIMITED<td></td></td></tr>
<tr><td align="left"><b>慈善團體名字:</b></td><td>國際十字路協會有限公司</td></tr>
<tr><td align="left"><b>Alias:</b></td><td><br/></td></tr>
<tr><td align="left"><b>別名:</b></td><td><br/></td></tr>
<tr><td align="left"><b>Effective Date:</b><br/><b>生效日期:</b></td><td>07.01.1997</td></tr>
</table>
<p><table border="0" cellpadding="3" cellspacing="0" width="600">
<tr><td style="background:#cccccc"> </td><td style="background: #cccccc" width="50%"><b>Subsidiaries</b></td><td style="background: #cccccc" width="50%"><b>附屬團體</b></td></tr>
<tr style="background: #ffffff">
<td>1.</td><td width="50%">Crossroads Global Distribution</td>
<td width="50%">十字路會環球分派中心</td>
</tr>
<tr style="background: #eeeeee">
<td>2.</td><td width="50%">Crossroads Global Handicrafts</td>
<td width="50%">十字路會環球工藝村</td>
</tr>
<tr style="background: #ffffff">
<td>3.</td><td width="50%">Crossroads Global Village</td>
<td width="50%">十字路基金會環球村</td>
</tr>
<tr style="background: #eeeeee">
<td>4.</td><td width="50%">Crossroads' Global X-perience</td>
<td width="50%">環球 X 體驗</td>
</tr>
<tr style="background: #ffffff">
<td>5.</td><td width="50%">Global Hand</td>
<td width="50%">環球援手網絡</td>
</tr>
<tr style="background: #eeeeee">
<td>6.</td><td width="50%">GoodCity.hk</td>
<td width="50%">好人好市</td>
</tr>
<tr style="background: #ffffff">
<td>7.</td><td width="50%">Silk Road Café</td>
<td width="50%">絲路咖啡室</td>
</tr>
</table>

In [61]:
# org (parent) response
detail_response = detail_search("91/04820")
detail_body = extract_html(detail_response)
# detail_body