Permalink
Browse files

Add shop links parsing

  • Loading branch information...
enginebai committed Sep 3, 2016
1 parent bcec6da commit 6741fde67ea829544c01712d9f8aa76625789135
Showing with 99 additions and 0 deletions.
  1. +6 −0 .idea/encodings.xml
  2. +8 −0 .idea/iPeenCrawler.iml
  3. +14 −0 .idea/misc.xml
  4. +8 −0 .idea/modules.xml
  5. +42 −0 .idea/workspace.xml
  6. +21 −0 ipeen_crawler.py
View
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE encoding settings: declares UTF-8 as the charset for the "PROJECT" entry. -->
<project version="4">
<component name="Encoding">
<file url="PROJECT" charset="UTF-8" />
</component>
</project>
View
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE module descriptor: a PYTHON_MODULE whose content root is $MODULE_DIR$
     and whose SDK entry is named "Python 3.5.1 virtualenv at ~/py". -->
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.5.1 virtualenv at ~/py" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
View
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level IDE settings: VCS option/confirmation settings and the project SDK name.
     NOTE(review): the project SDK named below is Python 2.7.3, while the module file
     (.idea/iPeenCrawler.iml) names a Python 3.5.1 virtualenv; confirm which interpreter
     is intended. -->
<project version="4">
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
<OptionsSetting value="true" id="Add" />
<OptionsSetting value="true" id="Remove" />
<OptionsSetting value="true" id="Checkout" />
<OptionsSetting value="true" id="Update" />
<OptionsSetting value="true" id="Status" />
<OptionsSetting value="true" id="Edit" />
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (/usr/bin/python2.7)" project-jdk-type="Python SDK" />
</project>
View
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module registry: lists the single module file .idea/iPeenCrawler.iml for this project. -->
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/iPeenCrawler.iml" filepath="$PROJECT_DIR$/.idea/iPeenCrawler.iml" />
</modules>
</component>
</project>
View
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE workspace state: change-list options, patch path, VCS prompts, the default task,
     annotation settings and debugger managers.
     NOTE(review): this looks like per-user IDE state; confirm whether it should be kept
     under version control at all. -->
<project version="4">
<component name="ChangeListManager">
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CreatePatchCommitExecutor">
<option name="PATCH_PATH" value="" />
</component>
<!-- Same option/confirmation values as the ProjectLevelVcsManager component in misc.xml. -->
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
<OptionsSetting value="true" id="Add" />
<OptionsSetting value="true" id="Remove" />
<OptionsSetting value="true" id="Checkout" />
<OptionsSetting value="true" id="Update" />
<OptionsSetting value="true" id="Status" />
<OptionsSetting value="true" id="Edit" />
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ShelveChangesManager" show_recycled="false">
<option name="remove_strategy" value="false" />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<!-- created/updated: presumably epoch timestamps in milliseconds (TODO confirm). -->
<created>1472871486251</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1472871486251</updated>
</task>
<servers />
</component>
<component name="VcsContentAnnotationSettings">
<!-- presumably a duration in milliseconds (2678400000 ms would be 31 days); TODO confirm. -->
<option name="myLimit" value="2678400000" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager />
<watches-manager />
</component>
</project>
View
@@ -0,0 +1,21 @@
import requests
from bs4 import BeautifulSoup
# Parser backend name handed to BeautifulSoup.
HTML_PARSER = "html.parser"
# Site origin; prepended to the hrefs found on the listing page.
ROOT_URL = 'http://www.ipeen.com.tw'
# Search listing page that is fetched to discover shop links.
LIST_URL = ROOT_URL + '/search/taiwan/000/1-0-0-0/'
def get_shop_link_list():
    """Fetch the search listing page and collect the shop links found on it.

    Requests LIST_URL, parses the response with BeautifulSoup, and selects
    every ``<a>`` tag whose ``data-label`` attribute is '店名'. Each tag's
    ``href`` is prefixed with ROOT_URL, printed, and collected.

    Returns:
        list of str: the collected links (ROOT_URL + href) in page order;
        an empty list when the response status is not OK.
    """
    list_req = requests.get(LIST_URL)
    if list_req.status_code != requests.codes.ok:
        # Nothing to parse; keep the return type consistent for callers.
        return []

    soup = BeautifulSoup(list_req.content, HTML_PARSER)
    shop_links_a_tags = soup.find_all('a', attrs={'data-label': '店名'})
    shop_links = []
    for link in shop_links_a_tags:
        shop_link = ROOT_URL + link['href']
        print(shop_link)
        shop_links.append(shop_link)
    # The list was previously built and then discarded; hand it to the caller.
    return shop_links
if __name__ == '__main__':
    # Executed as a script: run the listing crawl once.
    get_shop_link_list()

0 comments on commit 6741fde

Please sign in to comment.