-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
100 lines (86 loc) · 2.84 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.log import *
from scrapy_rakitan.settings import *
from scrapy_rakitan.items import *
#import pprint
from MySQLdb import escape_string
#import urlparse
import hashlib
# Module-level cursor, created once when this module is imported and used by
# insert_table() for every INSERT.
# NOTE(review): CONN is not defined in this file -- presumably it is exported
# by the scrapy_rakitan.settings star import; confirm.
cursor = CONN.cursor() # important MySQLdb Cursor object
def encode(str):
    """Return ``str`` encoded as UTF-8 using the 'ignore' error handler."""
    # The parameter name shadows the builtin; it is kept so that keyword
    # callers (``encode(str=...)``) continue to work.
    utf8_bytes = str.encode('utf8', 'ignore')
    return utf8_bytes
def encode_post(text):
    """Clean raw scraped text so it can be embedded in a MySQL statement.

    INPUT  : Raw text (may contain HTML).
    OUTPUT : The text encoded to ASCII (unencodable characters become '?'),
             with surrounding whitespace stripped, every tab, newline and
             carriage return removed, and finally MySQL-escaped.

    Each failing step is reported on stdout and skipped (best effort), so
    the returned text may be only partially processed.
    """
    try:
        text = text.encode('ascii', 'replace')
    except UnicodeError:
        # A Python 2 byte string is implicitly decoded as ASCII before it is
        # encoded; non-ASCII bytes make that fail. Keep the text unchanged.
        print('\nDECODE ERROR Encode Post\n' + text)
    text = text.strip()
    # Literal replacements instead of re.sub(): the patterns are plain
    # characters, and this file does not import ``re`` by name.
    for control_char in ('\t', '\n', '\r'):
        text = text.replace(control_char, '')
    try:
        # Call the escape_string imported at the top of the file. The bare
        # ``MySQLdb`` module name is not imported here, and a NameError must
        # not be mistaken for an escaping failure.
        text = escape_string(text)
    except (TypeError, UnicodeError):
        print('\nESCAPE ERROR ENCODE Post\n' + text)
    return text
def _utf8(value):
    """Return ``value`` as a UTF-8 byte string; byte strings pass through."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf8')


def insert_table(datas):
    """Insert one scraped item into ``SQL_TABLE`` through the module cursor.

    ``datas`` must provide the keys ``item_name``, ``item_price``,
    ``item_link`` and ``item_category``. The ``Hash`` column receives the
    SHA-224 hex digest of the UTF-8 encoded item name.

    Prints "Inserted" when ``cursor.execute`` returns a truthy value and
    "Something wrong" otherwise. Nothing is committed here; committing is
    left to the caller.
    """
    # Encode first: hashing or escaping a unicode value that contains
    # non-ASCII characters raises UnicodeEncodeError under Python 2.
    name = _utf8(datas['item_name'])
    params = (
        hashlib.sha224(name).hexdigest(),
        name,
        _utf8(datas['item_price']),
        _utf8(datas['item_link']),
        _utf8(datas['item_category']),
    )
    # Values are bound by the driver instead of being spliced into the SQL
    # text. Placeholders cannot bind identifiers, so the table name is still
    # concatenated.
    # NOTE(review): SQL_TABLE is not defined in this file -- presumably a
    # trusted project setting; confirm.
    sql = ("INSERT INTO " + SQL_TABLE +
           " (Hash, item_name, item_price, item_link, item_category)"
           " VALUES (%s, %s, %s, %s, %s)")
    if cursor.execute(sql, params):
        print("Inserted")
    else:
        print("Something wrong")
def complete_url(string):
    """Prefix ``string`` with the rakitan.com site root and return the result."""
    site_root = "http://rakitan.com/"
    absolute = site_root + string
    return absolute
class RakitanSpider(CrawlSpider):
    """Spider that follows links from the rakitan.com start page and stores
    the rows of each fetched listing page through ``insert_table``.

    NOTE(review): this class overrides ``parse`` on a ``CrawlSpider``, which
    bypasses the rule handling of that base class. No ``rules`` are defined
    here, so the spider behaves like a plain spider -- confirm that the base
    class is intentional.
    """

    name = 'scrapy_rakitan_spider'
    start_urls = [
        'http://rakitan.com/'
    ]
    # Running count of rows handed to insert_table().
    total = 0

    def parse(self, response):
        """Yield one ``Request`` per ``//div/a/@href`` link on the page.

        Every request is handled by ``parse_category``.
        """
        hxs = HtmlXPathSelector(response)
        for href in hxs.select('//div/a/@href'):
            yield Request(complete_url(href.extract()),
                          callback=self.parse_category)
        # Runs when the generator is exhausted, i.e. after the last request
        # has been handed out; the count may still be growing at that point.
        print("Total = " + repr(self.total))

    def parse_category(self, response):
        """Store every item row of a category page and commit the inserts.

        The category name is the first text node matching
        ``//td[@align="CENTER"]/font/b``. Item rows are the ``<tr>`` elements
        whose ``bgcolor`` is ``#DDDDDD`` or ``#FFFFFF``; the first cell is
        the item name and the second the price. Pages without a category
        heading and rows missing either cell are skipped.

        Returns the running total of rows handed to ``insert_table``.
        """
        hxs = HtmlXPathSelector(response)
        headings = hxs.select('//td[@align="CENTER"]/font/b/text()').extract()
        if not headings:
            # parse() follows every link it finds, so some responses are not
            # category listings; there is nothing to store for those.
            return self.total
        category = headings[0]
        rows = hxs.select('//tr[@bgcolor="#DDDDDD" or @bgcolor="#FFFFFF"]')
        for row in rows:
            names = row.select('td[1]/text()').extract()
            prices = row.select('td[2]/text()').extract()
            if not names or not prices:
                # A row without both cells carries no item; skipping it keeps
                # one malformed row from aborting the whole page.
                continue
            rakitan = ScrapyRakitanItem()
            rakitan['item_name'] = names[0]
            rakitan['item_price'] = prices[0]
            rakitan['item_link'] = unicode(response.url,'utf_8')
            rakitan['item_category'] = category
            insert_table(rakitan)
            self.total += 1
        CONN.commit()
        # NOTE(review): Scrapy callbacks are expected to return requests,
        # items or None; the int return is kept for compatibility -- confirm
        # whether anything relies on it.
        return self.total