This repository has been archived by the owner on Mar 16, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
49 lines (37 loc) · 1.76 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# This is the main entry point of our web crawler.
# The goal is to develop a crawler that can download any web page including its HTML, CSS, JS alongside
# all content (videos, images, etc.)
# Authors: Arneet Singh Kalra, Qadeer Assan, Cristian Ciungu and Moayad.
# Import statements
import urllib.request
from urllib.parse import urlparse
import youtubescraper as ys
from bs4 import BeautifulSoup as bs
# Provide link of web page to be stored locally.
# Example URLs used to exercise the crawler; ``crawl`` dispatches on the
# URL's domain (YouTube links get special handling).
youtubeLinkExample = "https://www.youtube.com/watch?v=VvGFL8yb9dM" # Example web page
wikipediaLinkExample = "https://en.wikipedia.org/wiki/5G"
tedTalkLinkExample = "https://www.ted.com/talks/julian_burschka_what_your_breath_could_reveal_about_your_health"
def crawl(link):
    """Download a local copy of the web page at *link*.

    YouTube pages ('www.youtube.com') are delegated to the project's
    ``YoutubeScraper``; every other domain has its raw page source
    written to ``source.html`` in the current working directory.

    :param link: absolute URL of the page to crawl.
    """
    # Check what type of link we are working with:
    domain = urlparse(link).netloc
    print(domain)
    # Depending on domain of link, decide which parser to run:
    # One big use case is youtube
    if domain == 'www.youtube.com':
        # The scraper does its work in its constructor — presumably it
        # downloads the video; TODO confirm against youtubescraper.py.
        scraper = ys.YoutubeScraper(link)
    # All other use cases (at the moment)
    else:
        # Create a copy of the source code of the web page.
        # BUG FIX: the original never closed the HTTP response, and wrote
        # ``f.close`` without parentheses, so the output file was never
        # explicitly closed either. Context managers guarantee both
        # handles are released even if read()/write() raises.
        with urllib.request.urlopen(link) as response:
            source_code = response.read()
        # Store it in a new file -> source.html
        with open('source.html', 'wb') as f:
            f.write(source_code)
        # TODO: Run an instance of Singlefile/Webarchive to download the HTML,CSS,JS of the webpage
        # TODO: Run through the original source_code to see if the website contains any videos
        # TODO: If the website contains a video, download the video and then embed it into the singlefile html code
        # urllib.request.urlretrieve(link, 'video.mp4')
# Example execution of crawler. Guarded so that importing this module
# does not trigger a network request as an import-time side effect;
# running the file directly behaves exactly as before.
if __name__ == "__main__":
    # crawl(tedTalkLinkExample)
    crawl(youtubeLinkExample)