forked from KishoreKonakanti/Regression
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pro_google.py
195 lines (173 loc) · 5.55 KB
/
pro_google.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 24 20:41:55 2018, revised Thu Oct 25 15:46:59 2018
@author: kkonakan
"""
from urllib import request as ureq
import bs4
import re
from numpy import NaN
import os
import csv
import time
def getAlexaRank(url):
    """Scrape the global Alexa rank for *url* from alexa.com/siteinfo.

    Returns the rank as a string, or None when the page could not be
    fetched or no rank is present in the HTML.
    """
    base_url = 'https://www.alexa.com/siteinfo/'
    comp_url = base_url + url
    html, _ = download(comp_url)
    try:
        # The rank is embedded in inline JSON, e.g. ...,"global":1234}...
        # Raw string so the escapes reach the regex engine intact.
        return re.findall(r'.*global\":([\d].*?)}.*', str(html))[0]
    except IndexError:
        # Pattern not found (download failed -> str(None), or markup changed).
        return None
def getCountryReg(url):
    '''whois limits 5 reqs per day'''
    # NOTE(review): intentionally disabled — whois.domaintools.com
    # rate-limits lookups (~5 requests/day per the docstring), so this
    # always returns None. Everything below the return is dead code,
    # kept for when an unthrottled whois source becomes available.
    return None
    addr = 'http://whois.domaintools.com/%s'%url
    _,WS = download(addr)
    lls = []
    # Collect the contents of every <td> inside the div with id="stats".
    for div in WS.find_all('div'):
        if(div.get('id') == 'stats'):
            for ch in div.findChildren('td'):
                lls.append(ch.contents)
    # The country value is the cell right after the 'Registrant Country'
    # label cell; raises ValueError if the label is absent — TODO confirm
    # intended, since the early return currently makes this unreachable.
    ind = lls.index(['Registrant Country'])
    return lls[ind+1] or None
def getHostCountry(url):
    """Look up the hosting country of *url* via Alexa's XML data API.

    Returns the ``name`` attribute of the last <country> element found,
    or None when the request fails or no country name is reported.
    """
    addr = 'http://data.alexa.com/data?cli=10&url=%s' % url
    _, S = download(addr)
    country = None
    # download() returns (None, None) on failure; the original crashed
    # with AttributeError here instead of degrading to None.
    if S is not None:
        for tag in S.find_all('country'):
            matches = re.findall(r'.*?name=\"(.*?)\".*', str(tag))
            if matches:  # guard: a <country> tag may lack a name attribute
                country = matches[0]
    return country
def isUsingJS(soup):
    """Return 1 when the parsed page contains any <script> tag, else 0."""
    scripts = soup.find_all('script')
    return 1 if scripts else 0
def isUsingCSS(html):
    """Return 1 when the raw HTML mentions 'css' anywhere, else 0.

    Crude substring test: it also matches 'css' inside ordinary text,
    not only stylesheet links.
    """
    # str.find() returns 0 for a match at the very start of the string;
    # the original '> 0' comparison wrongly treated that as "no CSS".
    return 1 if 'css' in str(html) else 0
def saveHTML(url, content):
    '''
    Saves the file to the disk and returns size of the html file in KB.

    Returns -1 when content is None, and 0 when writing failed.
    Depends on the module-level `path` directory setting.
    '''
    if content is None:
        print('THERE IS NOTHING TO WRITE, RECEIVED NONE')
        return -1
    global path
    size = 0
    try:
        baseName = getBaseName(url)
        fname = path + baseName + '.html'
        # Context manager guarantees the handle is closed even when the
        # write raises; the original leaked the open file in that case.
        # Explicit utf-8 avoids platform-default encoding failures.
        with open(fname, 'w', encoding='utf-8') as ufile:
            ufile.write(str(content))
        size = round(os.path.getsize(fname) / 1024)
    except Exception:
        # Best-effort: one failed save must not abort the whole crawl.
        print('Error occured during %s saving' % url)
    return size
def download(url):
    """Fetch *url* with a browser User-Agent and parse it with lxml.

    Returns a (raw_html, soup) pair; both elements are None when the
    fetch (or parse) failed.
    """
    html, soup = None, None
    # Spoof a browser UA: many sites reject the default Python agent.
    request = ureq.Request(url, headers={'User-agent': 'Mozilla/5.0'})
    try:
        html = ureq.urlopen(request, timeout=30).read()
        soup = bs4.BeautifulSoup(html, 'lxml')
    except Exception:
        print(url, ' site is not allowing bots')
    return html, soup
def fillNans(siteDetails, prop_names=None):
    """Ensure every expected column exists in *siteDetails*, in place.

    Missing properties are set to NaN so csv.DictWriter always receives
    a complete row. *prop_names* defaults to the module-level `props`
    list, keeping the original call signature working. Returns 0
    (legacy status code).
    """
    global props
    names = props if prop_names is None else prop_names
    for prop in names:
        # setdefault replaces the original's magic -12345 sentinel,
        # which would misfire if a value legitimately equalled it.
        siteDetails.setdefault(prop, NaN)
    return 0
def populateSiteDetails(url):
    '''
    Retrieves site Details.
    Saves the html content to disk for later processing.
    Returns a dict of site properties; when the page cannot be fetched
    (or has no <title>), only 'url' and a fallback 'title' are filled.
    '''
    siteDetails = {}
    html, contentSoup = download(url)

    # Guard clause: fetch/parse failed — record the bare minimum row.
    if contentSoup is None or contentSoup.title is None:
        siteDetails['url'] = url
        siteDetails['title'] = getBaseName(url).upper()
        return siteDetails

    siteDetails['url'] = url
    siteDetails['numLinks'] = len(contentSoup.find_all('a'))
    siteDetails['title'] = contentSoup.title.string
    siteDetails['hostedIn'] = getHostCountry(url)
    #siteDetails['RegIn'] = getCountryReg(url)
    siteDetails['AlexaRank'] = getAlexaRank(url)
    siteDetails['CSS'] = isUsingCSS(html)
    siteDetails['JS'] = isUsingJS(contentSoup)
    siteDetails['size'] = saveHTML(url, html)

    # Map the <meta name=...> tags we care about onto our column names.
    meta_columns = {'title': 'title', 'description': 'descr',
                    'keywords': 'kwords'}
    for tag in contentSoup.find_all('meta'):
        attrs = tag.attrs
        if 'name' in attrs and 'content' in attrs:
            column = meta_columns.get(attrs['name'])
            if column is not None:
                siteDetails[column] = attrs['content']
    return siteDetails
def getBaseName(url):
    """Extract the site name from a '.ai' domain URL.

    'http://www.example.ai' -> 'example'. An optional http/https scheme
    and a leading 'www.' are stripped. Raises IndexError when *url* has
    no '.ai' suffix (same failure mode as the original on bad input).
    """
    # The original pattern '[whtps:/]{0,11}.(...)' consumed one extra
    # character after the scheme, so hosts without a 'www.' prefix lost
    # their first letter ('example.ai' -> 'xample'). Strip the scheme
    # and 'www.' explicitly instead.
    pattern = r'(?:https?://)?(?:www\.)?(.+?)\.ai'
    return re.findall(pattern, url)[0]
def startPopulating(fileName):
    """Crawl every link listed in D:/AI/<fileName>.txt.

    Writes one CSV row per site through the module-level DictWriter
    `writer`; progress timing uses the module-level `start_time`.
    """
    global start_time
    cnt = 1
    # 'with' guarantees the link file is closed even when a crawl
    # iteration raises; the original leaked the handle in that case.
    with open('D:/AI/%s.txt' % fileName, 'r') as linkFile:
        # Iterate the file directly — same lines as readlines(), lazily.
        for link in linkFile:
            link = link.strip()
            now = time.time()
            print('Current Site (%d/1017):%s' % (cnt, link))
            print('Time elapsed:%d seconds' % (now - start_time))
            cnt += 1
            siteDets = populateSiteDetails(link)
            print(siteDets)
            writer.writerow(siteDets)
            print('Current count:', cnt)
# Driver: build one CSV of site features per input link list.
try:
    start_time = time.time()
    path = 'D:/AI/DataSet/'
    # Column order for the CSV; loop-invariant, so defined once.
    props = ['url', 'title', 'descr', 'numLinks', 'kwords', 'AlexaRank',
             'hostedIn', 'CSS', 'JS', 'size']
    # The original was missing the ':' after this 'for' (SyntaxError),
    # and never closed the CSV file when an exception escaped the loop.
    for inFile in ['AI', 'IO', 'ML']:
        dataFile = path + inFile + '.csv'
        # utf-8-sig adds a BOM so Excel auto-detects the encoding;
        # newline='' is required by the csv module to avoid blank rows
        # on Windows.
        with open(dataFile, 'w', encoding='utf-8-sig', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, props, restval=NaN)
            writer.writeheader()
            startPopulating(inFile)
except Exception as e:
    print('Exception {0} has been raised'.format(e))
finally:
    print('Time taken:%d seconds' % (time.time() - start_time))