Skip to content

Commit

Permalink
Automatic cookies management
Browse files Browse the repository at this point in the history
  • Loading branch information
oltarasenko committed Feb 26, 2020
1 parent 4073eee commit 211acf4
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 3 deletions.
6 changes: 3 additions & 3 deletions documentation/basic_concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,11 @@ Built-in middlewares:
3. `Crawly.Middlewares.UniqueRequest` - this middleware ensures that crawly would not schedule the same URL(request) multiple times.
4. `Crawly.Middlewares.UserAgent` - this middleware is used to set a User Agent HTTP header. Allows to rotate UserAgents, if the last one is defined as a list.
5. `Crawly.Middlewares.RequestOptions` - allows setting additional request options, for example a timeout or a proxy string (at the moment the options should match the options of the individual fetcher, e.g. HTTPoison).


6. `Crawly.Middlewares.AutoCookiesManager` - turns on automatic cookie management. Useful for cases when you need to log in or submit form data used by a website.
Example:
```elixir
{Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15000]}
{Crawly.Middlewares.RequestOptions, [timeout: 30_000, recv_timeout: 15000]},
Crawly.Middlewares.AutoCookiesManager
```

### Item Pipelines
Expand Down
49 changes: 49 additions & 0 deletions lib/crawly/middlewares/auto_cookies_manager.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
defmodule Crawly.Middlewares.AutoCookiesManager do
  @moduledoc """
  Set/update cookies for requests. The cookies are automatically picked up
  from the `prev_response` stored on the request. Only name/value pairs are
  taken into account; all other attributes like domain, secure and others are
  ignored.

  A cookie received again under an already known name replaces the previously
  stored value. Headers already present on the request are preserved; only the
  `Cookie` header is (re)written.

  ### Example Declaration
  ```
  middlewares: [
    Crawly.Middlewares.AutoCookiesManager
  ]
  ```
  """

  # Key under which the set of known "name=value" pairs is kept in the state.
  @state_key :cookies_manager_seen_cookies

  @doc """
  Adds a `Cookie` header built from all cookies seen so far.

  Reads the known cookies from `state` (key `:cookies_manager_seen_cookies`,
  a `MapSet` of `"name=value"` strings), merges in the `Set-Cookie` values of
  `request.prev_response` (if any) and returns `{request, state}`.

  When no cookies are known, both `request` and `state` are returned
  unchanged. Otherwise the request gets a single `Cookie` header (any previous
  `Cookie` header is replaced, all other headers are kept) and the state gets
  the updated cookie set.
  """
  def run(request, state) do
    known_cookies = Map.get(state, @state_key, MapSet.new())

    cookies =
      request.prev_response
      |> set_cookie_values()
      |> Enum.reduce(known_cookies, &remember_cookie/2)

    if MapSet.size(cookies) == 0 do
      # No cookies required by the site
      {request, state}
    else
      cookie_header = cookies |> MapSet.to_list() |> Enum.join("; ")

      # Keep headers set by other middlewares (e.g. User-Agent); only a stale
      # Cookie header is dropped so that it is not sent twice.
      other_headers =
        request
        |> Map.get(:headers, [])
        |> Enum.reject(fn
          {name, _value} -> header_name?(name, "cookie")
          _other -> false
        end)

      new_request =
        Map.put(request, :headers, other_headers ++ [{"Cookie", cookie_header}])

      {new_request, Map.put(state, @state_key, cookies)}
    end
  end

  # Returns the values of all Set-Cookie headers of the previous response.
  # Header names are compared case-insensitively, as HTTP header names are
  # case-insensitive and some clients/servers emit them in lower case.
  defp set_cookie_values(nil), do: []

  defp set_cookie_values(prev_response) do
    for {name, value} <- prev_response.headers,
        header_name?(name, "set-cookie"),
        do: value
  end

  # Stores the leading name/value pair of a Set-Cookie value, replacing any
  # previously stored pair with the same cookie name. Values without a cookie
  # name (empty or malformed) are ignored.
  defp remember_cookie(set_cookie, known) do
    pair =
      set_cookie
      |> String.split(";", parts: 2)
      |> hd()
      |> String.trim()

    case String.split(pair, "=", parts: 2) do
      [name, _value] when name != "" ->
        prefix = name <> "="

        known
        |> Enum.reject(&String.starts_with?(&1, prefix))
        |> MapSet.new()
        |> MapSet.put(pair)

      _malformed ->
        known
    end
  end

  # Case-insensitive header name comparison; `expected` must be lower case.
  defp header_name?(name, expected) when is_binary(name),
    do: String.downcase(name) == expected

  defp header_name?(_name, _expected), do: false
end
59 changes: 59 additions & 0 deletions test/middlewares/auto_cookies_manager_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
defmodule Middlewares.AutoCookiesManagertest do
  # Exercises Crawly.Middlewares.AutoCookiesManager through Crawly.Utils.pipe/3,
  # always starting from an empty middleware state.
  use ExUnit.Case, async: false

  test "Cookies are not added when there is no prev_response data" do
    processed = run_cookies_manager(Crawly.Request.new("http://example.com"))

    assert [] == processed.headers
  end

  test "Cookies are not added when there is no set cookie in prev response" do
    response = %HTTPoison.Response{
      body: "test",
      headers: [
        {"Date", "Wed, 26 Feb 2020 21:06:52 GMT"},
        {"Content-Type", "text/html;charset=utf-8"},
        {"Transfer-Encoding", "chunked"},
        {"Connection", "keep-alive"}
      ]
    }

    processed = response |> request_after() |> run_cookies_manager()

    assert [] == processed.headers
  end

  test "Cookies are taken into account" do
    response = %HTTPoison.Response{
      body: "test",
      headers: [
        {"Set-Cookie", "bucket=desktop; Domain=.example.com; path=/;"},
        {"Set-Cookie", "OT_1073742440=72; SameSite=None; Secure"}
      ]
    }

    processed = response |> request_after() |> run_cookies_manager()

    cookie_header = :proplists.get_value("Cookie", processed.headers, [])
    sent_pairs = cookie_header |> String.split("; ") |> Enum.sort()

    # The order of pairs inside the header is not asserted, hence the sort
    assert sent_pairs == Enum.sort(["bucket=desktop", "OT_1073742440=72"])
  end

  # Builds a request to example.com carrying `response` as its previous response.
  defp request_after(response) do
    Map.put(Crawly.Request.new("http://example.com"), :prev_response, response)
  end

  # Pipes the request through the cookies middleware and returns the resulting
  # request, discarding the state.
  defp run_cookies_manager(request) do
    {processed, _state} =
      Crawly.Utils.pipe([Crawly.Middlewares.AutoCookiesManager], request, %{})

    processed
  end
end

0 comments on commit 211acf4

Please sign in to comment.