Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ability to set default UA for either fetching types #2375

Merged
merged 7 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion changedetectionio/content_fetchers/puppeteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError


class fetcher(Fetcher):
fetcher_description = "Puppeteer/direct {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
Expand Down
5 changes: 0 additions & 5 deletions changedetectionio/content_fetchers/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ def run(self,
if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)

# Make requests use a more modern looking user-agent
if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None):
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

proxies = {}

# Allows overriding the proxy on a per-request basis
Expand Down
6 changes: 6 additions & 0 deletions changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,10 @@ class SingleExtraBrowser(Form):
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
# @todo do the validation here instead

class DefaultUAInputForm(Form):
    """Sub-form with one optional default User-Agent text field per fetch backend."""

    html_requests = StringField(
        'Plaintext requests',
        validators=[validators.Optional()],
        render_kw={"placeholder": "<default>"},
    )

    # The browser field is only declared when a browser endpoint is configured
    # through the environment; the check runs once, when the class body executes.
    if any(os.getenv(env_name) for env_name in ("PLAYWRIGHT_DRIVER_URL", "WEBDRIVER_URL")):
        html_webdriver = StringField(
            'Chrome requests',
            validators=[validators.Optional()],
            render_kw={"placeholder": "<default>"},
        )

# datastore.data['settings']['requests']..
class globalSettingsRequestForm(Form):
Expand All @@ -537,6 +541,8 @@ class globalSettingsRequestForm(Form):
extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5)

default_ua = FormField(DefaultUAInputForm, label="Default User-Agent overrides")

def validate_extra_proxies(self, extra_validators=None):
for e in self.data['extra_proxies']:
if e.get('proxy_name') or e.get('proxy_url'):
Expand Down
5 changes: 5 additions & 0 deletions changedetectionio/model/App.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
)

_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6
DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'

class model(dict):
base_config = {
Expand All @@ -22,6 +23,10 @@ class model(dict):
'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
'default_ua': {
'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT),
'html_webdriver': None,
}
},
'application': {
# Custom notification content
Expand Down
4 changes: 4 additions & 0 deletions changedetectionio/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ def call_browser(self):
request_headers.update(self.datastore.get_all_base_headers())
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))

ua = self.datastore.data['settings']['requests'].get('default_ua')
if ua and ua.get(prefer_fetch_backend):
request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)})

# https://github.com/psf/requests/issues/4525
# Requests doesn't yet support brotli encoding, so don't put 'br' here; be totally sure that the user cannot
# do this by accident.
Expand Down
1 change: 0 additions & 1 deletion changedetectionio/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,6 @@ def has_extra_headers_file(self):
return os.path.isfile(filepath)

def get_all_base_headers(self):
from .model.App import parse_headers_from_text_file
headers = {}
# Global app settings
headers.update(self.data['settings'].get('headers', {}))
Expand Down
16 changes: 13 additions & 3 deletions changedetectionio/templates/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,6 @@
<p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
</span>
<br>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>
</div>
<fieldset class="pure-group" id="webdriver-override-options" data-visible-for="application-fetch_backend=html_webdriver">
<div class="pure-form-message-inline">
Expand All @@ -121,6 +119,18 @@
{{ render_field(form.application.form.webdriver_delay) }}
</div>
</fieldset>
<div class="pure-control-group inline-radio">
{{ render_field(form.requests.form.default_ua) }}
<span class="pure-form-message-inline">
Applied to all requests.<br><br>
Note: Simply changing the User-Agent often does not defeat anti-robot technologies; it's important to consider <a href="https://changedetection.io/tutorial/what-are-main-types-anti-robot-mechanisms">all of the ways that the browser is detected</a>.
</span>
</div>
<div class="pure-control-group">
<br>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using Bright Data and Oxylabs Proxies, find out more here.</a>

</div>
</div>

<div class="tab-pane-inner" id="filters">
Expand Down Expand Up @@ -190,7 +200,7 @@ <h4>Chrome Extension</h4>
<a id="chrome-extension-link"
title="Try our new Chrome Extension!"
href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}">
<img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}" alt="Chrome">
Chrome Webstore
</a>
</p>
Expand Down
52 changes: 42 additions & 10 deletions changedetectionio/tests/test_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,40 @@ def test_method_in_request(client, live_server):
def test_headers_textfile_in_request(client, live_server):
#live_server_setup(live_server)
# Add our URL to the import page

webdriver_ua = "Hello fancy webdriver UA 1.0"
requests_ua = "Hello basic requests UA 1.1"

test_url = url_for('test_headers', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because it's no longer calling back to localhost but from the browser container, set in test-only.yml
test_url = test_url.replace('localhost', 'cdio')

print ("TEST URL IS ",test_url)
form_data = {
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"requests-default_ua-html_requests": requests_ua
}

if os.getenv('PLAYWRIGHT_DRIVER_URL'):
form_data["requests-default_ua-html_webdriver"] = webdriver_ua

res = client.post(
url_for("settings_page"),
data=form_data,
follow_redirects=True
)
assert b'Settings updated' in res.data

res = client.get(url_for("settings_page"))

# Only when some kind of real browser is set up
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
assert b'requests-default_ua-html_webdriver' in res.data

# Field should always be there
assert b"requests-default_ua-html_requests" in res.data

# Add the test URL twice, we will check
res = client.post(
url_for("import_page"),
Expand All @@ -272,15 +300,14 @@ def test_headers_textfile_in_request(client, live_server):

wait_for_all_checks(client)


# Add some headers to a request
res = client.post(
url_for("edit_page", uuid="first"),
data={
"url": test_url,
"tags": "testtag",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"headers": "xxx:ooo\ncool:yeah\r\n"},
"url": test_url,
"tags": "testtag",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"headers": "xxx:ooo\ncool:yeah\r\n"},
follow_redirects=True
)
assert b"Updated watch." in res.data
Expand All @@ -292,7 +319,7 @@ def test_headers_textfile_in_request(client, live_server):
with open('test-datastore/headers.txt', 'w') as f:
f.write("global-header: nice\r\nnext-global-header: nice")

with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f:
with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f:
f.write("watch-header: nice")

client.get(url_for("form_watch_checknow"), follow_redirects=True)
Expand All @@ -306,7 +333,7 @@ def test_headers_textfile_in_request(client, live_server):
# Not needed anymore
os.unlink('test-datastore/headers.txt')
os.unlink('test-datastore/headers-testtag.txt')
os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt')
os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt')
# The service should echo back the request verb
res = client.get(
url_for("preview_page", uuid="first"),
Expand All @@ -319,7 +346,12 @@ def test_headers_textfile_in_request(client, live_server):
assert b"Watch-Header:nice" in res.data
assert b"Tag-Header:test" in res.data

# Check the custom UA from system settings page made it through
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data
else:
assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data

#unlink headers.txt on start/stop
# unlink headers.txt on start/stop
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
assert b'Deleted' in res.data