Skip to content

Commit

Permalink
web crawler updated to drop unnecessary info
Browse files Browse the repository at this point in the history
  • Loading branch information
christophlandolt committed Dec 2, 2023
1 parent 0be9fb4 commit f2d25d3
Showing 1 changed file with 113 additions and 10 deletions.
123 changes: 113 additions & 10 deletions src/data_acquisition/web_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ def __init__(self, url_list, server_name, server_port, database_name, collection
self.collection_name = collection_name
self.server_name = server_name
self.server_port = server_port
self.query_strings = [
(78, 'Active wheelchair'),
(81, 'No limitations'),
(79, 'E-wheelchair'),
(80, 'Stroller'),
(3951, 'Scewo BRO')
]

def download_and_store(self) -> None:
"""
Expand All @@ -53,8 +60,96 @@ def download_and_store(self) -> None:
for url in self.url_list:
self._download_json(url)

def _replace_accessibility(self, grade):
    """
    Translate a numeric accessibility grade into its textual description.

    Parameters
    ----------
    grade
        Grade value to translate; 1, 2 and 3 are the known grades.

    Returns
    -------
    str
        The description belonging to the grade, or "Unknown accessibility"
        for every value that does not compare equal to a known grade.
    """
    known_grades = (
        (1, "Completely accessible"),
        (2, "Partially accessible"),
        (3, "Not easily accessible"),
    )
    # Compare with == (instead of hashing into a dict) so that unhashable
    # values simply fall through to the "unknown" description.
    for level, description in known_grades:
        if grade == level:
            return description
    return "Unknown accessibility"

def _traverse_and_replace(self, obj):
    """
    Recursively replace accessibility grades with their descriptions.

    Walks through nested dictionaries and lists. Wherever a dictionary has
    an "accessibility" entry that is itself a dictionary containing a
    "grade" key, that grade is replaced in place by the result of
    ``self._replace_accessibility``.

    Parameters
    ----------
    obj
        Arbitrarily nested structure of dicts and lists; values of any other
        type are ignored.
    """
    if isinstance(obj, dict):
        accessibility = obj.get("accessibility")
        # Only a dictionary can carry a grade. Without this check a null or
        # numeric value raises TypeError on the membership test, and a string
        # containing "grade" passes the substring test and then fails on
        # indexing.
        if isinstance(accessibility, dict) and "grade" in accessibility:
            accessibility["grade"] = self._replace_accessibility(accessibility["grade"])
        for value in obj.values():
            self._traverse_and_replace(value)
    elif isinstance(obj, list):
        for item in obj:
            self._traverse_and_replace(item)

def _remove_not_needed_keys(self, obj):
    """
    Drop the keys that are not needed from the top level of ``obj``.

    Only ``obj`` itself is processed; nested structures are left untouched.
    Afterwards, if the remaining "accessibility" entry is a dictionary, it is
    replaced by the value of its "grade" key (``None`` if there is none).

    Parameters
    ----------
    obj
        Object to clean up in place.
    """
    unwanted_keys = (
        "version",
        "createdAt",
        "updatedAt",
        "url",
        "accessUrl",
        "approval",
        "readyForApproval",
        "ratingProfileNotice",
        "status",
        "webUrl",
        "resourceUrl",
        "changesUrl",
        "attributionUrl",
        "isOpenData",
        "license",
        "position",
        "mainImage",
        "totalClassifications",
        "companyAssignment",
        "numberOfComments",
        "structure",
        "areaClassifications",
    )

    for name in unwanted_keys:
        if name not in obj:
            continue
        obj.pop(name)

    if 'accessibility' not in obj:
        return
    rating = obj['accessibility']
    if isinstance(rating, dict):
        # Collapse the accessibility object to its bare grade.
        obj['accessibility'] = rating.get('grade')

def _remove_property_values(self, obj):
    """
    Recursively delete every "propertyValues" key.

    Parameters
    ----------
    obj
        Arbitrarily nested structure of dicts and lists, modified in place;
        values of any other type are ignored.
    """
    if isinstance(obj, list):
        children = obj
    elif isinstance(obj, dict):
        # Remove the key on this level before descending into what is left.
        obj.pop("propertyValues", None)
        children = obj.values()
    else:
        return

    for child in children:
        self._remove_property_values(child)


def _modify_structure(self, obj):
    """
    Recursively simplify the structure of ``obj`` in place.

    On every dictionary level an "accessibility" entry that is a dictionary
    is replaced by the value of its "grade" key (``None`` if missing), and
    the keys "readyForApproval" and "images" are removed.

    Parameters
    ----------
    obj
        Arbitrarily nested structure of dicts and lists; values of any other
        type are ignored.
    """
    if isinstance(obj, list):
        for element in obj:
            self._modify_structure(element)
        return
    if not isinstance(obj, dict):
        return

    rating = obj.get("accessibility")
    if isinstance(rating, dict):
        obj["accessibility"] = rating.get("grade")

    for dropped in ("readyForApproval", "images"):
        obj.pop(dropped, None)

    for child in obj.values():
        self._modify_structure(child)

def _move_criterion_values(self, obj):
    """
    Flatten the "criterion" object of every path classification.

    For each dictionary in ``obj["pathClassifications"]`` whose "criterion"
    entry is itself a dictionary, the "criterion" key is removed and its
    key/value pairs are merged into the classification. Keys that already
    exist in the classification are overwritten by the criterion's values.

    Entries that cannot be flattened are left unchanged rather than raising:
    ``obj`` not being a dictionary, "pathClassifications" not being a list,
    list items that are not dictionaries, and "criterion" values that are not
    dictionaries (e.g. null).

    Parameters
    ----------
    obj
        Object whose "pathClassifications" list is modified in place.
    """
    if not isinstance(obj, dict):
        return
    classifications = obj.get("pathClassifications")
    if not isinstance(classifications, list):
        return

    for item in classifications:
        if not isinstance(item, dict):
            continue
        criterion = item.get("criterion")
        # Check the type before removing the key, so a null or scalar
        # criterion is kept instead of being popped and then failing in
        # update().
        if isinstance(criterion, dict):
            del item["criterion"]
            item.update(criterion)



def _download_json(self, url : str) -> None:
def _download_json(self, input_url : str) -> None:
"""
This private function downloads the JSON data from a specific internet resource and initiates its storage in MongoDB
Expand All @@ -63,15 +158,23 @@ def _download_json(self, url : str) -> None:
url : str
url to the JSON-File
"""
try:
response = requests.get(url)
if response.status_code == 200:
data = response.json()
self.store_in_mongodb(data)
else:
print(f"Failed to download JSON. Status code: {response.status_code}")
except requests.RequestException as e:
print(f"An error occurred: {e}")
for id, description in self.query_strings:
url = input_url + "?rating_profile_id="+str(id)
try:
response = requests.get(url, verify=False)
if response.status_code == 200:
data = response.json()
self._traverse_and_replace(data)
self._remove_not_needed_keys(data)
self._remove_property_values(data)
self._move_criterion_values(data)
self._modify_structure(data)
data["category"] = description
self.store_in_mongodb(data)
else:
print(f"Failed to download JSON. Status code: {response.status_code}")
except requests.RequestException as e:
print(f"An error occurred: {e}")

def store_in_mongodb(self, data : dict) -> None:
"""
Expand Down

0 comments on commit f2d25d3

Please sign in to comment.