-
Notifications
You must be signed in to change notification settings - Fork 0
/
mango_scraper.py
61 lines (51 loc) · 2.11 KB
/
mango_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This code is written to run on both Python 2 and Python 3.
import os, csv
from selenium import webdriver
from bs4 import BeautifulSoup
class Mango(object):
    """Scraper that groups product links on the mango.com shop page by category.

    Creating an instance immediately fetches the shop page through
    chromedriver and parses it; the result is kept in ``self._products`` as a
    dictionary mapping each category name to a list of product names.
    """

    # Categories recognised in link paths; any other path segment is ignored.
    _CATEGORIES = ('damen', 'herren', 'violeta', 'kinder')

    def __init__(self):
        """Set up the scraper configuration and load the products."""
        # URL of the shop page that is scraped.
        self._URL = 'http://shop.mango.com/DE'
        # Location of the chromedriver executable.
        self._chromedriver = "./chromedriver"
        # Fetch the product info as soon as the object is created.
        self._products = self._getproducts()

    def _Soup(self, htm):
        """Return a BeautifulSoup tree for the HTML text ``htm``."""
        return BeautifulSoup(htm, 'html.parser')

    def _gethtml(self, url):
        """Return the page source of ``url`` as rendered by Chrome.

        The browser is always shut down, even when loading the page fails,
        so no chromedriver process is left behind.
        """
        os.environ["webdriver.chrome.driver"] = self._chromedriver
        # NOTE(review): passing the driver path positionally matches older
        # Selenium releases; newer releases may expect a Service object --
        # confirm against the installed Selenium version.
        driver = webdriver.Chrome(self._chromedriver)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            driver.quit()

    def _getproducts(self):
        """Return a dict mapping each category to its list of product names.

        Only links whose href contains "accessoires" or "artikel" are
        considered. The third-from-last path segment selects the category and
        the last segment is recorded as the product name.
        """
        soup = self._Soup(self._gethtml(self._URL))
        # Each category needs its own list; a chained ``a = b = []`` would
        # make all keys share a single list object.
        proddic = {category: [] for category in self._CATEGORIES}
        for link in soup.find_all('a', href=True):
            href = link['href']
            if "accessoires" not in href and "artikel" not in href:
                continue
            item = href.split('/')
            if len(item) < 3:
                # Too short to carry a category segment; skip instead of
                # raising IndexError.
                continue
            products = proddic.get(item[-3])
            if products is not None:
                products.append(item[-1])
        return proddic

    def _dict2csv(self, nm):
        """Write the products to the CSV file ``nm``, one column per category.

        The first row holds the category names. Columns shorter than the
        longest one are padded with empty cells so no product is dropped.
        """
        categories = list(self._products)
        columns = [self._products[category] for category in categories]
        depth = max(len(column) for column in columns) if columns else 0
        with open(nm, 'w') as f:  # Just use 'w' mode in 3.x
            w = csv.writer(f, lineterminator='\n')
            w.writerow(categories)
            for i in range(depth):
                w.writerow([column[i] if i < len(column) else ''
                            for column in columns])

    def display(self):
        """Print the products dictionary."""
        print(self._products)

    def getcsv(self, nm):
        """Create a CSV file named ``nm`` containing the products."""
        self._dict2csv(nm)