Commit 33f8e80

remove multithreading
gravelcycles committed Jun 12, 2021
1 parent 2852d18 commit 33f8e80
Showing 5 changed files with 27 additions and 48 deletions.
7 changes: 0 additions & 7 deletions pywebcopy/core.py
@@ -15,7 +15,6 @@
 import shutil
 import zipfile
 from datetime import datetime
-import threading
 
 from .configs import config, SESSION
 from .globals import MARK, __version__, lru_cache
@@ -32,12 +31,6 @@ def zip_project(timeout=10):
     """
     # wait for the threads to finish downloading files
 
-    for thread in threading.enumerate():
-        if not thread or isinstance(thread, threading._MainThread):
-            continue
-        if thread.is_alive():
-            thread.join(timeout=timeout)
-
     zip_fn = os.path.abspath(config['project_folder']) + '.zip'
 
     with zipfile.ZipFile(zip_fn, 'w', zipfile.ZIP_DEFLATED) as archive:
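For reference, the deleted block is the standard join-all idiom: enumerate every live thread, skip the main one, and wait (with a timeout) for the rest to finish before zipping the project. A minimal standalone sketch of the same pattern, using the public threading.main_thread() rather than the private threading._MainThread the original touched:

    import threading

    def wait_for_workers(timeout=10):
        # Block until every worker thread has finished (or timed out),
        # so the archive is not written while downloads are in flight.
        for thread in threading.enumerate():
            if thread is threading.main_thread():
                continue
            if thread.is_alive():
                thread.join(timeout=timeout)

With no worker threads left anywhere in the package, this wait is pointless, which is why the commit drops it.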
23 changes: 9 additions & 14 deletions pywebcopy/crawler.py
@@ -17,7 +17,6 @@
 """
 import os
-import threading
 import warnings
 
 from .elements import TagBase
@@ -26,8 +25,6 @@
 
 
 INDEX = set()
-INDEX_LOCK = threading.Lock()
-
 
 class SubPage(TagBase):
     """Custom anchor tag handler.
@@ -93,13 +90,12 @@ def run(self):
         if _sub_page is None or not getattr(_sub_page, '_stack'):
             return
 
-        with INDEX_LOCK:
-            elements = list(_sub_page.elements)
-            for elem in elements:
-                if elem.url not in INDEX:
-                    INDEX.add(elem.url)
-                else:
-                    _sub_page.elements.remove(elem)
+        elements = list(_sub_page.elements)
+        for elem in elements:
+            if elem.url not in INDEX:
+                INDEX.add(elem.url)
+            else:
+                _sub_page.elements.remove(elem)
 
         _sub_page.save_complete()
 
@@ -150,10 +146,9 @@ def run(self):
 
         self.parse()
 
-        with INDEX_LOCK:
-            elements = list(self.elements)
-            for elem in elements:
-                INDEX.add(elem.url)
+        elements = list(self.elements)
+        for elem in elements:
+            INDEX.add(elem.url)
 
         self.save_complete()
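INDEX is the shared set that deduplicates asset URLs across sub-pages; the lock existed because several crawler threads mutated it concurrently, and it becomes dead weight once everything runs on one thread. A minimal sketch of the test-and-add pattern the lock was protecting (simplified, names hypothetical):

    import threading

    INDEX = set()
    INDEX_LOCK = threading.Lock()

    def claim_url(url):
        # Test-and-add atomically, so two threads can never both claim a URL.
        with INDEX_LOCK:
            if url in INDEX:
                return False
            INDEX.add(url)
            return True

Single-threaded, the plain `if url not in INDEX: INDEX.add(url)` in the new code is equivalent and needs no lock.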
36 changes: 17 additions & 19 deletions pywebcopy/elements.py
@@ -21,7 +21,7 @@
 from six.moves.urllib.request import pathname2url
 from .configs import config, SESSION
 from .core import is_allowed
-from .globals import CSS_IMPORTS_RE, CSS_URLS_RE, POOL_LIMIT, MARK, __version__, lru_cache
+from .globals import CSS_IMPORTS_RE, CSS_URLS_RE, MARK, __version__, lru_cache
 from .urls import URLTransformer, relate
 
 __all__ = ['TagBase', 'AnchorTag', 'ImgTag', 'ScriptTag', 'LinkTag', '_ElementFactory']
@@ -53,8 +53,7 @@ def __repr__(self):
 
     def run(self):
         # XXX: This could wait for any condition
-        with POOL_LIMIT:
-            self.download_file()
+        self.download_file()
 
     save_file = run
 
@@ -302,23 +301,22 @@ def run(self):
         Thus css file content needs to be searched for urls and then it will proceed
         as usual.
         """
-        with POOL_LIMIT:
-            if os.path.exists(self.file_path):
-                if not config['over_write']:
-                    LOGGER.info("File already exists at location: [%r]" % self.file_path)
-                    return
-            # LinkTags can also be specified for elements like favicon etc.
-            # Thus a check is necessary to validate it is a proper css file or not.
-            if not self._url.endswith('.css'):
-                super(LinkTag, self).run()
-
-            # Custom request object creation
-            req = SESSION.get(self.url, stream=True)
-
-            # if some error occurs
-            if not req or not req.ok:
-                LOGGER.error("URL returned an unknown response: [%s]" % self.url)
-                return
+        if os.path.exists(self.file_path):
+            if not config['over_write']:
+                LOGGER.info("File already exists at location: [%r]" % self.file_path)
+                return
+        # LinkTags can also be specified for elements like favicon etc.
+        # Thus a check is necessary to validate it is a proper css file or not.
+        if not self._url.endswith('.css'):
+            super(LinkTag, self).run()
+
+        # Custom request object creation
+        req = SESSION.get(self.url, stream=True)
+
+        # if some error occurs
+        if not req or not req.ok:
+            LOGGER.error("URL returned an unknown response: [%s]" % self.url)
+            return
 
         # Try to avoid pulling the contents in the ram
         # while substituting urls in the contents would NOT
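The stream=True request and the "avoid pulling the contents in the ram" comment refer to requests' streaming mode: the response body is consumed in chunks rather than loaded whole. A minimal sketch of that idiom, independent of pywebcopy (download is a hypothetical helper, not the library's API):

    import requests

    def download(url, file_path, chunk_size=8192):
        # stream=True defers the body; iter_content() then yields it chunk
        # by chunk, so a large asset never has to fit in memory at once.
        with requests.get(url, stream=True) as resp:
            if not resp.ok:
                return False
            with open(file_path, 'wb') as fh:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    fh.write(chunk)
        return True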
2 changes: 0 additions & 2 deletions pywebcopy/globals.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
 import textwrap
-import threading
 
 from . import __version__
 
@@ -94,7 +93,6 @@ def _cache_wrapper(self, caller, *args, **kwargs):
 # under control so that the resource overloading could
 # be prevented and the program remains memory efficient
 #: new in version: 6.0.0
-POOL_LIMIT = threading.Semaphore(5)
 
 MARK = textwrap.dedent("""
 {0}
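POOL_LIMIT was a counting semaphore capping concurrent workers at five, which is what the surrounding comment about keeping resource overloading under control describes. The throttling pattern itself, as a minimal standalone sketch (fetch is a hypothetical download function):

    import threading

    POOL_LIMIT = threading.Semaphore(5)  # at most 5 holders at a time

    def download_one(url):
        # Each caller blocks here until one of the 5 slots frees up.
        with POOL_LIMIT:
            fetch(url)

With the thread pool gone there is never more than one caller, so the semaphore no longer limits anything and can be deleted.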
7 changes: 1 addition & 6 deletions pywebcopy/webpage.py
@@ -32,10 +32,8 @@
 
 import logging
 import os
-import threading
 from operator import attrgetter
 
-from .globals import POOL_LIMIT
 from .configs import SESSION, config
 from .elements import _ElementFactory, LinkTag, ScriptTag, ImgTag, AnchorTag, TagBase
 from .exceptions import ParseError
@@ -228,10 +226,7 @@ def save_assets(self):
         LOGGER.log(100, "Queueing download of <%d> asset files." % len(elms))
 
         for elem in elms:
-            with POOL_LIMIT:
-                t = threading.Thread(name=repr(elem), target=elem.run)
-                t.start()
-                self._threads.append(t)
+            elem.run()
 
     def save_html(self, file_name=None, raw_html=False):
         """Saves the html of the page to a default or specified file.
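This is the core of the change: save_assets() used to start one thread per asset and now downloads each asset in order on the calling thread. A simplified sketch of the two shapes (elements stands in for the page's asset list; not pywebcopy's exact code):

    import threading

    def save_assets_threaded(elements, pool_limit):
        # Old shape: one thread per asset. The semaphore around start()
        # gates the spawn rate; the matching acquire inside run()
        # (see elements.py above) capped the actual concurrency.
        threads = []
        for elem in elements:
            with pool_limit:
                t = threading.Thread(name=repr(elem), target=elem.run)
                t.start()
                threads.append(t)
        return threads

    def save_assets_sequential(elements):
        # New shape: plain loop, no threads to track or join afterwards.
        for elem in elements:
            elem.run()

The sequential shape is also what lets core.py drop its join loop: there are no outstanding threads left to wait on before zipping.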
