# -*- coding: utf-8 -*-
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
#scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
# Standard library
import re

# Third-party (this BeautifulSoup is the Python 2-era BeautifulSoup 3)
from BeautifulSoup import BeautifulSoup
from lxml import html
from lxml.etree import tostring
import requests
# morph.io helper used for scraperwiki.sqlite.save(); it was commented out,
# which made the save call at the bottom of the script fail with a NameError.
import scraperwiki
# ---------------------------------------------------------------------------
# Scrape every result page of a saved hasznaltauto.hu car search and store
# one row per advert in the morph.io SQLite database ("data" table).
# ---------------------------------------------------------------------------

# Identify as a desktop Firefox so the site serves the regular HTML listing.
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}

# Search-result base URL; the long token encodes the saved search filter
# (earlier experiments built this URL from brand/model path segments).
url_base = "http://www.hasznaltauto.hu/talalatilista/auto/MUFT5FYQ6QOR9FZQ6FSHWPZFSK600O9QY21UYHJQSK6R2ESPZ66SKRCFWYZKHIZLE44URK3K4MKIYZICE9OHZQJSYPTR8EMZIQE0TWRZWRT2JGSY4W4HJIQCQG3GLH4QQYJ3OEWTZJOPIZGQ6LWT6CC6OHRCEECWLZ2EE/"

# Fetch page 1 only to read the paginator ("oldalszam") and get the page count.
url_base_page = url_base + 'page1'
page = requests.get(url_base_page, headers=header)
tree = html.fromstring(page.content)
num_of_pages = int(tree.xpath('//*[@class="oldalszam"][last()]')[0].text)

# Loop-invariant regexes: compile once, as raw strings.
re_thousand_dot = re.compile(r'\.')          # thousands separator in prices
re_year = re.compile(r'(\d{4}|\d{4}/\d*) ')  # "2009 " or "2009/10 "
re_kobcenti = re.compile(r'(\d{4}) cm³')     # displacement, e.g. "1598 cm³"
re_loero = re.compile(r'(\d{2,3} LE)')       # horsepower, e.g. "105 LE"

for i in range(num_of_pages):
    # Progress indicator; a single pre-formatted string prints identically
    # under Python 2 and Python 3.
    print("%d / %d" % (i + 1, num_of_pages))
    url = url_base + "page" + str(i + 1)
    page = requests.get(url, headers=header)
    tree = html.fromstring(page.content)

    # Parallel node lists: element idx of each list belongs to advert idx.
    links = tree.xpath('//*[contains(concat(" ", normalize-space(@class), " "), " talalati_lista_head ")]')
    prices = tree.xpath('//*[contains(concat(" ", normalize-space(@class), " "), " arsor ")]')
    # The "infosor" row carries the year/fuel/engine text AND the odometer
    # <abbr>; query it once instead of running the identical XPath twice.
    infosor = tree.xpath('//*[contains(concat(" ", normalize-space(@class), " "), " talalati_lista_infosor ")]')
    felszereltseg = tree.xpath('//*[contains(concat(" ", normalize-space(@class), " "), " felszereltseg-nyomtatas ")]')

    for idx, price in enumerate(prices):
        try:
            # Price cell text ends with a 3-character unit suffix (" Ft").
            price_string = price[0].text[:-3]
        except IndexError:
            continue  # advert without a price ("Ar nelkul")
        try:
            act_price = int(re_thousand_dot.sub('', price_string))
        except ValueError:
            continue  # promotional, non-numeric price text ("Akcios")

        if idx >= len(links):
            continue  # malformed page: fewer head rows than price rows
        act_link = BeautifulSoup(tostring(links[idx])).find('a').get('href')

        try:
            text = BeautifulSoup(tostring(infosor[idx])).text
            # First 4-digit (optionally "YYYY/mm") token is the model year.
            matches = re_year.findall(text)
            act_year = matches[0] if matches else '0'
            # Fuel-type keywords as they appear in the Hungarian info row.
            if ';Dízel' in text:
                act_motor = 'Diesel'
            elif 'Benzin' in text:
                act_motor = 'Benzin'
            else:
                act_motor = 'nincs adat'  # "no data"
            matches = re_kobcenti.findall(text)
            act_kobcenti = matches[0] if matches else '0'
            # NOTE: the original indexed findall()[0] without a guard, so a
            # missing horsepower raised IndexError and threw away the year,
            # fuel and displacement already parsed above.
            matches = re_loero.findall(text)
            act_loero = matches[0].replace(' LE', '') if matches else '0'
        except IndexError:
            # No info row for this advert at all: store placeholders.
            act_year = '0'
            act_loero = '0'
            act_kobcenti = '0'
            act_motor = 'nincs adat'

        try:
            # [19:] strips a fixed-length label prefix from the equipment
            # text -- presumably "Felszereltseg: ..."; TODO confirm against
            # the live markup.
            act_felszereltseg = BeautifulSoup(tostring(felszereltseg[idx])).text[19:]
        except IndexError:
            act_felszereltseg = ''
        try:
            # Odometer reading sits in an <abbr> element, suffixed " km".
            act_kilometer = BeautifulSoup(tostring(infosor[idx])).find('abbr').string.replace(' km', '')
        except IndexError:
            act_kilometer = '0'

        data = {
            'index': i,                          # result-page number (0-based)
            'link': act_link,                    # advert URL, unique per car
            'year': act_year,
            'price': act_price,                  # integer, unit suffix stripped
            'kilometer': act_kilometer,
            'felszereltseg': act_felszereltseg,  # equipment description
            'motor': act_motor,                  # fuel type
            'kobcenti': act_kobcenti,            # engine displacement
            'loero': act_loero,                  # horsepower; the original key
                                                 # was the typo "loero: "
        }
        # Upsert keyed on the advert URL.  The original keyed on the *page*
        # number ('index'), so every advert on a page overwrote the previous
        # one and at most one row per page survived.
        scraperwiki.sqlite.save(unique_keys=['link'], data=data)