-
Notifications
You must be signed in to change notification settings - Fork 36
/
gsearch.py
120 lines (94 loc) · 3.24 KB
/
gsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
#
# Copyright © 2016 ethan-funny (https://github.com/ethan-funny)
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-06-18
# Updated on 2016-12-19
import socket
import urllib2
from re import match
from urllib import urlencode
from HTMLParser import HTMLParser
from PySocks import socks
from PySocks.sockshandler import SocksiPyHandler
class GoogleSearch:
    """Fetch the Google results page for a query, optionally via SOCKS5.

    Args:
        query: Search terms, either text or already UTF-8 encoded bytes.
        port: Port of a SOCKS5 proxy listening on 127.0.0.1, or 0 to
            connect directly without a proxy.
    """

    def __init__(self, query, port):
        # Only text needs encoding. Calling .encode() on a byte string makes
        # Python 2 decode it as ASCII first, which raises on non-ASCII input.
        if not isinstance(query, bytes):
            query = query.encode('utf-8')
        self.query = query
        self.url = u"http://www.google.com/search?" + \
            urlencode({'q': self.query}) + u"&pws=0&gl=us&gws_rd=cr"
        self.header = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101'
        self.SOCKS5_PROXY_HOST = '127.0.0.1'
        self.SOCKS5_PROXY_PORT = port

    def _open(self):
        """Open ``self.url`` directly (port 0) or through the SOCKS5 proxy."""
        if self.SOCKS5_PROXY_PORT == 0:
            request = urllib2.Request(self.url)
            request.add_header("User-Agent", self.header)
            return urllib2.urlopen(request)

        handler = SocksiPyHandler(
            socks.SOCKS5,
            self.SOCKS5_PROXY_HOST,
            self.SOCKS5_PROXY_PORT
        )
        opener = urllib2.build_opener(handler)
        opener.addheaders = [('User-agent', self.header)]
        return opener.open(self.url)

    def get_html_source(self):
        """Return the body of the results page.

        This is best-effort: any error raised while opening or reading the
        page is printed and an empty string is returned instead.
        """
        html_source = ''
        try:
            response = self._open()
            try:
                html_source = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as e:
            print(e)
        return html_source
class GoogleParser(HTMLParser):
    """Collect result titles and links from a Google results page.

    Feed the page HTML with ``feed()``. Afterwards ``result_info`` holds one
    ``{'title': ..., 'href': ...}`` dict for every ``<a>`` element that was
    found inside an ``<h3 class="r">`` heading, in document order.
    """

    # Parser state. These class-level values are only defaults; each one is
    # rebound on the instance the first time parsing assigns to it.
    h3_flag = False  # currently inside <h3 class="r">
    a_flag = False   # currently inside an <a> nested in such a heading
    b_flag = False   # set when a <b> is seen inside that <a>; never cleared
    title_part = ''  # text accumulated so far for the current <a>

    def __init__(self):
        HTMLParser.__init__(self)
        self.result_info = []
        self.link = ''
        self.title = ''

    @staticmethod
    def _extract_link(attrs):
        """Return the target URL from an anchor's attributes ('' if none).

        Redirect links of the form ``/url?q=<target>`` or
        ``/url?url=<target>`` are unwrapped and percent-decoded; any other
        href is returned unchanged.
        """
        link = ''
        for key, value in attrs:
            # A valueless attribute (e.g. ``<a href>``) arrives as None.
            if key != 'href' or value is None:
                continue
            # The target runs up to the next '&' or to the end of the href,
            # so a redirect without trailing parameters is unwrapped too.
            m = match(r'/url\?(url|q)=([^&]+)', value)
            link = urllib2.unquote(m.group(2)) if m else value
        return link

    def handle_starttag(self, tag, attrs):
        if tag == 'h3' and attrs == [('class', 'r')]:
            self.h3_flag = True
        if tag == 'a' and self.h3_flag:
            self.a_flag = True
            # Recompute the link for every result so that an anchor without
            # a usable href cannot inherit the previous result's link.
            self.link = self._extract_link(attrs)
        if tag == 'b' and self.a_flag:
            self.b_flag = True

    def handle_endtag(self, tag):
        if tag == 'h3':
            self.h3_flag = False
        if tag == 'a' and self.a_flag:
            self.a_flag = False
            self.result_info.append({
                'title': self.title_part,
                'href': self.link
            })
            self.title_part = ''

    def handle_data(self, data):
        # Text from nested tags (such as <b>) is part of the title as well.
        if self.a_flag:
            self.title_part += data
def search(query, port):
    """Run a Google search and return the parsed results.

    Builds a ``GoogleSearch`` from ``query`` and ``port``, downloads the
    results page and feeds it to a ``GoogleParser``. Returns the parser's
    ``result_info`` list.
    """
    html = GoogleSearch(query, port).get_html_source()

    parser = GoogleParser()
    parser.feed(html)
    parser.close()
    return parser.result_info
if __name__ == '__main__':
    # Manual smoke test: search for 'linux' using port 1234 and dump the
    # parsed results.
    hits = search('linux', 1234)
    print(hits)