Skip to content

Commit

Permalink
Ability to set default User-Agent for either fetching types directly …
Browse files Browse the repository at this point in the history
…in the UI (#2375)
  • Loading branch information
dgtlmoon committed May 20, 2024
1 parent a8959be commit f49eb45
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 20 deletions.
1 change: 0 additions & 1 deletion changedetectionio/content_fetchers/puppeteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError


class fetcher(Fetcher):
fetcher_description = "Puppeteer/direct {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
Expand Down
5 changes: 0 additions & 5 deletions changedetectionio/content_fetchers/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ def run(self,
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)

# Make requests use a more modern looking user-agent
if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

proxies = {}

# Allows overriding the proxy on a per-request basis
Expand Down
6 changes: 6 additions & 0 deletions changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,10 @@ class SingleExtraBrowser(Form):
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
# @todo do the validation here instead

class DefaultUAInputForm(Form):
    """Sub-form holding the default User-Agent override for each fetch type.

    ``html_requests`` is always defined. ``html_webdriver`` is only defined
    when a browser endpoint is configured, i.e. when either the
    ``PLAYWRIGHT_DRIVER_URL`` or ``WEBDRIVER_URL`` environment variable is
    set to a non-empty value at the time this class body is executed.
    Both inputs are optional.
    """

    # User-Agent for the plain (non-browser) fetcher.
    html_requests = StringField(
        'Plaintext requests',
        [validators.Optional()],
        render_kw={"placeholder": "<default>"},
    )

    # Only offer the browser User-Agent field when a browser backend is configured.
    if any(os.getenv(env_name) for env_name in ("PLAYWRIGHT_DRIVER_URL", "WEBDRIVER_URL")):
        html_webdriver = StringField(
            'Chrome requests',
            [validators.Optional()],
            render_kw={"placeholder": "<default>"},
        )

# datastore.data['settings']['requests']..
class globalSettingsRequestForm(Form):
Expand All @@ -537,6 +541,8 @@ class globalSettingsRequestForm(Form):
extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)

default_ua = FormField(DefaultUAInputForm, label="Default User-Agent overrides")

def validate_extra_proxies(self, extra_validators=None):
for e in self.data['extra_proxies']:
if e.get('proxy_name') or e.get('proxy_url'):
Expand Down
5 changes: 5 additions & 0 deletions changedetectionio/model/App.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
)

_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'

class model(dict):
base_config = {
Expand All @@ -22,6 +23,10 @@ class model(dict):
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
'default_ua': {
'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT),
'html_webdriver': None,
}
},
'application': {
# Custom notification content
Expand Down
4 changes: 4 additions & 0 deletions changedetectionio/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ def call_browser(self):
request_headers.update(self.datastore.get_all_base_headers())
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))

ua = self.datastore.data['settings']['requests'].get('default_ua')
if ua and ua.get(prefer_fetch_backend):
request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)})

# https://github.com/psf/requests/issues/4525
# Requests doesn't yet support brotli encoding, so don't put 'br' here; be totally sure that the user cannot
# do this by accident.
Expand Down
1 change: 0 additions & 1 deletion changedetectionio/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,6 @@ def has_extra_headers_file(self):
return os.path.isfile(filepath)

def get_all_base_headers(self):
from .model.App import parse_headers_from_text_file
headers = {}
# Global app settings
headers.update(self.data['settings'].get('headers', {}))
Expand Down
16 changes: 13 additions & 3 deletions changedetectionio/templates/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,6 @@
<p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
</span>
<br>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>
</div>
<fieldset class="pure-group" id="webdriver-override-options" data-visible-for="application-fetch_backend=html_webdriver">
<div class="pure-form-message-inline">
Expand All @@ -121,6 +119,18 @@
{{ render_field(form.application.form.webdriver_delay) }}
</div>
</fieldset>
<div class="pure-control-group inline-radio">
{{ render_field(form.requests.form.default_ua) }}
<span class="pure-form-message-inline">
Applied to all requests.<br><br>
Note: Simply changing the User-Agent often does not defeat anti-robot technologies, it's important to consider <a href="https://changedetection.io/tutorial/what-are-main-types-anti-robot-mechanisms">all of the ways that the browser is detected</a>.
</span>
</div>
<div class="pure-control-group">
<br>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>

</div>
</div>

<div class="tab-pane-inner" id="filters">
Expand Down Expand Up @@ -190,7 +200,7 @@ <h4>Chrome Extension</h4>
<a id="chrome-extension-link"
title="Try our new Chrome Extension!"
href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}">
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}" alt="Chrome">
Chrome Webstore
</a>
</p>
Expand Down
52 changes: 42 additions & 10 deletions changedetectionio/tests/test_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,40 @@ def test_method_in_request(client, live_server):
def test_headers_textfile_in_request(client, live_server):
#live_server_setup(live_server)
# Add our URL to the import page

webdriver_ua = "Hello fancy webdriver UA 1.0"
requests_ua = "Hello basic requests UA 1.1"

test_url = url_for('test_headers', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because it's no longer calling back to localhost but from the browser container, set in test-only.yml
test_url = test_url.replace('localhost', 'cdio')

print ("TEST URL IS ",test_url)
form_data = {
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"requests-default_ua-html_requests": requests_ua
}

if os.getenv('PLAYWRIGHT_DRIVER_URL'):
form_data["requests-default_ua-html_webdriver"] = webdriver_ua

res = client.post(
url_for("settings_page"),
data=form_data,
follow_redirects=True
)
assert b'Settings updated' in res.data

res = client.get(url_for("settings_page"))

# Only when some kind of real browser is setup
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
assert b'requests-default_ua-html_webdriver' in res.data

# Field should always be there
assert b"requests-default_ua-html_requests" in res.data

# Add the test URL twice, we will check
res = client.post(
url_for("import_page"),
Expand All @@ -272,15 +300,14 @@ def test_headers_textfile_in_request(client, live_server):

wait_for_all_checks(client)


# Add some headers to a request
res = client.post(
url_for("edit_page", uuid="first"),
data={
"url": test_url,
"tags": "testtag",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"headers": "xxx:ooo\ncool:yeah\r\n"},
"url": test_url,
"tags": "testtag",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"headers": "xxx:ooo\ncool:yeah\r\n"},
follow_redirects=True
)
assert b"Updated watch." in res.data
Expand All @@ -292,7 +319,7 @@ def test_headers_textfile_in_request(client, live_server):
with open('test-datastore/headers.txt', 'w') as f:
f.write("global-header: nice\r\nnext-global-header: nice")

with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f:
with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f:
f.write("watch-header: nice")

client.get(url_for("form_watch_checknow"), follow_redirects=True)
Expand All @@ -306,7 +333,7 @@ def test_headers_textfile_in_request(client, live_server):
# Not needed anymore
os.unlink('test-datastore/headers.txt')
os.unlink('test-datastore/headers-testtag.txt')
os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt')
os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt')
# The service should echo back the request verb
res = client.get(
url_for("preview_page", uuid="first"),
Expand All @@ -319,7 +346,12 @@ def test_headers_textfile_in_request(client, live_server):
assert b"Watch-Header:nice" in res.data
assert b"Tag-Header:test" in res.data

# Check the custom UA from system settings page made it through
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data
else:
assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data

#unlink headers.txt on start/stop
# unlink headers.txt on start/stop
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
assert b'Deleted' in res.data

0 comments on commit f49eb45

Please sign in to comment.