# main.py
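"""Scrape dataset listings from the opendata.socrata.com catalogue, download
each dataset's raw CSV export, clean and deduplicate its rows, and wrap the
result in a Frictionless Data Package (datapackage.json plus a README.md)."""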
import csv
import json
import os
import shutil
from urllib.error import HTTPError
from urllib.request import urlopen

from bs4 import BeautifulSoup
from datapackage import Package

# Catalogue search endpoint and raw CSV export endpoint on Socrata.
url = 'https://opendata.socrata.com/browse?'
raw_data_url = 'https://opendata.socrata.com/api/views'
category = ''  # current category, updated as 'Category:' lines are read


def init(file='generated_links.txt'):
    """Process every scraped dataset link, regenerating the link file first
    if it is missing or empty."""
    if not os.path.exists(file) or os.path.getsize(file) == 0:
        generate_links()
    with open(file, "r") as links:
        for i, link in enumerate(links.readlines()):
            link = link.strip()
            print(str(i) + ': ' + link)
            try:
                generate_dataset(link)
            except HTTPError:
                print('Error occurred while fetching ' + link)


def generate_dataset(url):
    global category
    # Lines of the form 'Category:<name>' switch the current category.
    if 'Category' in url:
        category = url.split(':')[1]
        return
    # Socrata serves a raw CSV export keyed by the dataset id, which is the
    # last path segment of the catalogue URL.
    csv_url = raw_data_url + url[url.rindex('/'):] + '/rows.csv?accessType=DOWNLOAD'
    # Slice the dataset title out from between the category segment and the id.
    title = url[len('https://opendata.socrata.com/' + category + '/'): url.rindex('/')].lower()
    directory = "datasets/" + category + '/' + title
    if not os.path.exists(directory + "/data/"):
        os.makedirs(directory + "/data/")
    rows = set()  # already-written rows, used to drop duplicates
    with open(directory + "/data/" + title + '.csv', "w", encoding='utf-8') as output_file:
        number_of_values = 0
        for i, line in enumerate(urlopen(csv_url)):
            decoded_line = line.decode('ascii', 'ignore')
            decoded_line = decoded_line.replace('\n', '')
            decoded_line = decoded_line.replace('\t', '')
            value_list = decoded_line.split(',')
            if i == 0:
                # The header row fixes the expected column count.
                number_of_values = len(value_list)
            if len(value_list) != number_of_values:
                # Naive comma splitting misfires on quoted commas; skip those rows.
                continue
            if all(value == '' for value in value_list):
                continue
            decoded_line = decoded_line.strip().lower()
            decoded_line = decoded_line.replace('"', '')
            if decoded_line not in rows:
                output_file.write(decoded_line + '\n')
                rows.add(decoded_line)
    # Re-read the cleaned file and discard datasets with fewer than 50 rows.
    number_of_rows = 0
    with open(directory + "/data/" + title + '.csv', "r", encoding='utf-8') as output_file:
        reader = csv.DictReader(output_file)
        for row in reader:
            number_of_rows += 1
    if number_of_rows < 50:
        shutil.rmtree(directory)
        return
    data_valid = datapackage_creator(location="datasets/" + category + '/' + title,
                                     title=title.title().replace('-', ' '),
                                     name=title,
                                     source_title='Socrata - ' + title.title().replace('-', ' '),
                                     source_path=url)
    if not data_valid:
        shutil.rmtree(directory)
        return
    # Use the dataset's catalogue-page description as the README body.
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    for meta in soup.find_all('meta'):
        meta_name = meta.get('name')
        if meta_name and 'description' in meta_name:
            with open(directory + '/README.md', "w", encoding='utf-8') as output_file:
                output_file.write('## ' + title.title().replace('-', ' ') + '\n')
                description = meta.get('content').strip().replace('"', '')
                output_file.write(description + '\n')
                output_file.write("\nThis dataset was scraped from [Socrata - " + title + '](' + url + ')')


def datapackage_creator(location, title, name, source_title, source_path):
    """Build, validate, and write a datapackage.json for the downloaded CSV."""
    package = Package()
    package.descriptor['title'] = title
    package.descriptor['name'] = name
    package.descriptor['sources'] = [{'title': source_title, 'path': source_path}]
    package.descriptor['licenses'] = [{
        'name': 'odc-pddl',
        'title': 'Open Data Commons Public Domain Dedication and Licence (PDDL)',
        'path': 'http://opendatacommons.org/licenses/pddl/',
    }]
    package.commit()
    # Infer resource names and schemas from the CSV files on disk.
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']
    for resource in package_json['resources']:
        # Make resource paths relative to the package directory.
        resource['path'] = resource['path'][len(location) + 1:]
    if package.valid:
        with open(location + '/datapackage.json', 'w') as data_file:
            json.dump(package_json, data_file, indent=4, sort_keys=True)
        return True
    print('DATAPACKAGE IS NOT VALID')
    return False


def generate_links():
    # Truncate any previous link file before regenerating it.
    open('generated_links.txt', "w", encoding='ascii').close()
    categories = ['Business', 'Demo', 'Education', 'Fun', 'Government', 'Personal', 'Public+Safety']
    for category in categories:
        print(category)
        with open('generated_links.txt', "a") as generated_links:
            generated_links.write('Category:' + category + '\n')
        # Walk the first ten result pages of the category listing.
        for i in range(10):
            page_url = url + 'category=' + category + '&limitTo=datasets&page=' + str(i + 1)
            soup = BeautifulSoup(urlopen(page_url), 'html.parser')
            with open('generated_links.txt', "a", encoding='ascii') as generated_links:
                for a in soup.find_all('a'):
                    clazz = a.get('class')
                    if clazz and 'browse2-result-name-link' in clazz:
                        href = a.get('href')
                        try:
                            generated_links.write(href + '\n')
                            print(href)
                        except UnicodeEncodeError:
                            print('Error: ' + href)


if __name__ == '__main__':
    init()