From aed3eb71927d28b2023bce79ef95976f0f12ad23 Mon Sep 17 00:00:00 2001
From: Oleg Tarasenko
Date: Sat, 7 Dec 2019 22:09:17 +0100
Subject: [PATCH] Migrate the static documentation to ex_doc

---
 docs/README.md                      |   2 +-
 documentation/assets/logo.png       | Bin 0 -> 32038 bytes
 documentation/basic_concepts.md     | 154 ++++++++++++
 documentation/ethical_aspects.md    |  12 +
 documentation/http_api.md           |  34 +++
 documentation/installation_guide.md |  14 ++
 documentation/introduction.md       | 152 ++++++++++++
 documentation/quickstart.md         |  79 ++++++
 documentation/settings.md           | 214 ++++++++++++++++
 documentation/tutorial.md           | 371 ++++++++++++++++++++++++++++
 mix.exs                             |  35 ++-
 11 files changed, 1065 insertions(+), 2 deletions(-)
 create mode 100644 documentation/assets/logo.png
 create mode 100644 documentation/basic_concepts.md
 create mode 100644 documentation/ethical_aspects.md
 create mode 100644 documentation/http_api.md
 create mode 100644 documentation/installation_guide.md
 create mode 100644 documentation/introduction.md
 create mode 100644 documentation/quickstart.md
 create mode 100644 documentation/settings.md
 create mode 100644 documentation/tutorial.md

diff --git a/docs/README.md b/docs/README.md
index 164831d7..07a52990 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,4 +1,4 @@
-# Crawly into
+# Crawly intro
 ---
 
 Crawly is an application framework for crawling web sites and
diff --git a/documentation/assets/logo.png b/documentation/assets/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..68992e3a095e25b0573d7b63ca545737afbd7f23
GIT binary patch
literal 32038
[binary PNG data for documentation/assets/logo.png (32038 bytes) and the diffs for documentation/basic_concepts.md, documentation/ethical_aspects.md and the beginning of documentation/http_api.md omitted]

+```
+curl -v localhost:4001/spiders/<spider_name>/schedule
+```
+
+## Stopping a spider
+
+The following command will stop a given Crawly spider:
+
+```
+curl -v localhost:4001/spiders/<spider_name>/stop
+```
+
+## Getting currently running spiders
+
+```
+curl -v localhost:4001/spiders
+```
+
+## Getting spider stats
+
+```
+curl -v localhost:4001/spiders/<spider_name>/scheduled-requests
+curl -v localhost:4001/spiders/<spider_name>/scraped-items
+```
diff --git a/documentation/installation_guide.md b/documentation/installation_guide.md
new file mode 100644
index 00000000..b5b1abc3
--- /dev/null
+++ b/documentation/installation_guide.md
@@ -0,0 +1,14 @@
+# Installation guide
+---
+
+Crawly requires Elixir v1.7 or higher. In order to create a Crawly
+project, execute the following steps:
+
+1. Generate a new Elixir project: `mix new <project_name> --sup`
+2. Add Crawly to your mix.exs file
+   ```elixir
+   def deps do
+     [{:crawly, "~> 0.6.0"}]
+   end
+   ```
+3.
Fetch crawly: `mix deps.get` \ No newline at end of file diff --git a/documentation/introduction.md b/documentation/introduction.md new file mode 100644 index 00000000..500e1fee --- /dev/null +++ b/documentation/introduction.md @@ -0,0 +1,152 @@ +# Crawly intro +--- + +Crawly is an application framework for crawling web sites and +extracting structured data which can be used for a wide range of +useful applications, like data mining, information processing or +historical archival. + +## Walk-through of an example spider + +In order to show you what Crawly brings to the table, we’ll walk you +through an example of a Crawly spider using the simplest way to run a spider. + +Here’s the code for a spider that scrapes blog posts from the Erlang +Solutions blog: https://www.erlang-solutions.com/blog.html, +following the pagination: + +```elixir +defmodule Esl do +@behaviour Crawly.Spider + + @impl Crawly.Spider + def base_url(), do: "https://www.erlang-solutions.com" + + def init() do + [ + start_urls: ["https://www.erlang-solutions.com/blog.html"] + ] + end + + @impl Crawly.Spider + def parse_item(response) do + # Getting new urls to follow + urls = + response.body + |> Floki.find("a.more") + |> Floki.attribute("href") + |> Enum.uniq() + + # Convert URLs into requests + requests = + Enum.map(urls, fn url -> + url + |> build_absolute_url(response.request_url) + |> Crawly.Utils.request_from_url() + end) + + # Extract item from a page, e.g. + # https://www.erlang-solutions.com/blog/introducing-telemetry.html + title = + response.body + |> Floki.find("article.blog_post h1:first-child") + |> Floki.text() + + author = + response.body + |> Floki.find("article.blog_post p.subheading") + |> Floki.text(deep: false, sep: "") + |> String.trim_leading() + |> String.trim_trailing() + + time = + response.body + |> Floki.find("article.blog_post p.subheading time") + |> Floki.text() + + url = response.request_url + + %Crawly.ParsedItem{ + :requests => requests, + :items => [%{title: title, author: author, time: time, url: url}] + } + end + + def build_absolute_url(url, request_url) do + URI.merge(request_url, url) |> to_string() + end +end +``` + +Put this code into your project and run it using the Crawly REST API: +`curl -v localhost:4001/spiders/Esl/schedule` + +When it finishes you will get the ESL.jl file stored on your +filesystem containing the following information about blog posts: + +```json +{"url":"https://www.erlang-solutions.com/blog/erlang-trace-files-in-wireshark.html","title":"Erlang trace files in Wireshark","time":"2018-06-07","author":"by Magnus Henoch"} +{"url":"https://www.erlang-solutions.com/blog/railway-oriented-development-with-erlang.html","title":"Railway oriented development with Erlang","time":"2018-06-13","author":"by Oleg Tarasenko"} +{"url":"https://www.erlang-solutions.com/blog/scaling-reliably-during-the-world-s-biggest-sports-events.html","title":"Scaling reliably during the World’s biggest sports events","time":"2018-06-21","author":"by Erlang Solutions"} +{"url":"https://www.erlang-solutions.com/blog/escalus-4-0-0-faster-and-more-extensive-xmpp-testing.html","title":"Escalus 4.0.0: faster and more extensive XMPP testing","time":"2018-05-22","author":"by Konrad Zemek"} +{"url":"https://www.erlang-solutions.com/blog/mongooseim-3-1-inbox-got-better-testing-got-easier.html","title":"MongooseIM 3.1 - Inbox got better, testing got easier","time":"2018-07-25","author":"by Piotr Nosek"} +.... +``` + +## What just happened? 
+
+When you ran the curl command:
+```curl -v localhost:4001/spiders/Esl/schedule```
+
+Crawly scheduled the Esl spider: it looked up the spider definition and
+ran it through its crawler engine.
+
+The crawl started by making requests to the URLs defined in the
+start_urls attribute of the spider's init, calling the default
+callback `parse_item` with the response object as an
+argument. In the parse callback we:
+1. Look through all the pagination elements using a Floki selector and
+extract absolute URLs to follow. The URLs are converted into requests
+using the `Crawly.Utils.request_from_url()` function
+2. Extract item(s) (items are defined in separate modules; this part
+will be covered later on)
+3. Return a Crawly.ParsedItem structure which contains the new
+requests to follow and the items extracted from the given page. All
+following requests are processed by the same `parse_item` function.
+
+Crawly is fully asynchronous. Once the requests are scheduled, they
+are picked up by separate workers and are executed in parallel. This
+also means that other requests can keep going even if some request
+fails or an error happens while handling it.
+
+While this enables you to do very fast crawls (sending multiple
+concurrent requests at the same time, in a fault-tolerant way), Crawly
+also gives you control over the politeness of the crawl through a few
+settings. You can do things like setting a download delay between each
+request, limiting the amount of concurrent requests per domain or
+respecting robots.txt rules.
+
+This example uses the JSON export to generate the JSON lines file, but
+you can easily extend it to change the export format (XML or CSV, for
+example).
+
+## What else?
+
+You’ve seen how to extract and store items from a website using
+Crawly, but this is just a basic example. Crawly provides a lot of
+powerful features for making scraping easy and efficient, such as:
+
+1. Flexible request spoofing (for example, user-agent rotation; cookie
+management is planned)
+2. Item validation, using a pipelines approach.
+3. Filtering already seen requests and items.
+4. Filtering out all requests targeted at other domains.
+5. Robots.txt enforcement.
+6. Concurrency control.
+7. HTTP API for controlling crawlers.
+8. Interactive console, which allows you to create and debug spiders more easily.
diff --git a/documentation/quickstart.md b/documentation/quickstart.md
new file mode 100644
index 00000000..2d40ff71
--- /dev/null
+++ b/documentation/quickstart.md
@@ -0,0 +1,79 @@
+# Quickstart
+
+In this section we will show how to bootstrap a small project and how
+to set up Crawly for proper data extraction.
+
+1. Create a new Elixir project: `mix new crawly_example --sup`
+2. Add Crawly to the dependencies (mix.exs file):
+```elixir
+defp deps do
+  [
+    {:crawly, "~> 0.6.0"}
+  ]
+end
+```
+3. Fetch dependencies: `mix deps.get`
+4.
Define Crawling rules (Spider) +```elixir +cat > lib/crawly_example/esl_spider.ex << EOF +defmodule EslSpider do + @behaviour Crawly.Spider + alias Crawly.Utils + + @impl Crawly.Spider + def base_url(), do: "https://www.erlang-solutions.com" + + @impl Crawly.Spider + def init(), do: [start_urls: ["https://www.erlang-solutions.com/blog.html"]] + + @impl Crawly.Spider + def parse_item(response) do + hrefs = response.body |> Floki.find("a.more") |> Floki.attribute("href") + + requests = + Utils.build_absolute_urls(hrefs, base_url()) + |> Utils.requests_from_urls() + + title = response.body |> Floki.find("article.blog_post h1") |> Floki.text() + + %{ + :requests => requests, + :items => [%{title: title, url: response.request_url}] + } + end +end +EOF +``` + +5. Configure Crawly: +By default Crawly does not require any configuration. But obviously you will need +a configuration for fine tuning the Crawls: + +```elixir +config :crawly, + closespider_timeout: 10, + concurrent_requests_per_domain: 8, + follow_redirects: true, + closespider_itemcount: 1000, + output_format: "csv", + item: [:title, :url], + item_id: :title, + middlewares: [ + Crawly.Middlewares.DomainFilter, + Crawly.Middlewares.UniqueRequest, + Crawly.Middlewares.UserAgent + ], + pipelines: [ + Crawly.Pipelines.Validate, + Crawly.Pipelines.DuplicatesFilter, + Crawly.Pipelines.CSVEncoder, + Crawly.Pipelines.WriteToFile + ] +``` + + +6. Start the Crawl: + - `iex -S mix` + - `Crawly.Engine.start_spider(EslSpider)` + +7. Results can be seen in: `cat /tmp/EslSpider.csv` diff --git a/documentation/settings.md b/documentation/settings.md new file mode 100644 index 00000000..58d1c523 --- /dev/null +++ b/documentation/settings.md @@ -0,0 +1,214 @@ +# Crawly settings + +The Crawly settings allows you to customize the behaviour of all +Crawly components, including crawling speed, used pipelines and middlewares. + +Here’s a list of all available Crawly settings, along with their +default values and the scope where they apply. + +The scope, where available, shows where the setting is being used, if +it’s tied to any particular component. In that case the module of that +component will be shown, typically an extension, middleware or +pipeline. It also means that the component must be enabled in order +for the setting to have any effect. + +The settings are defined in the Elixir config style. For example: + +```elixir +config :crawly, + # The path where items are stored + base_store_path: "/tmp/", + # Item definition + item: [:title, :author, :time, :url], + # Identifier which is used to filter out duplicates + item_id: :title +``` + +### base_store_path :: binary() [DEPRECATED in 0.6.0] + +default: "/tmp" + +Defines the path where items are stored in the filesystem. This setting +is used by the Crawly.DataStorageWorker process. + +### user_agents :: list() + +default: ["Crawly Bot 1.0"] + +Defines a user agent string for Crawly requests. This setting is used +by the `Crawly.Middlewares.UserAgent` middleware. When the list has more than one +item, all requests will be executed, each with a user agent string chosen +randomly from the supplied list. + +### item :: [atom()] + +default: [] + +Defines a list of required fields for the item. When none of the default +fields are added to the following item (or if the values of +required fields are "" or nil), the item will be dropped. 
This setting is used by the `Crawly.Pipelines.Validate` pipeline.
+
+### item_id :: atom()
+
+default: nil
+
+Defines a field which will be used in order to identify if an item is
+a duplicate or not. On most ecommerce websites the desired id
+field is the SKU. This setting is used in
+the `Crawly.Pipelines.DuplicatesFilter` pipeline. If unset, the related
+pipeline is effectively disabled.
+
+### pipelines :: [module()]
+
+default: []
+
+Defines a list of pipelines responsible for pre-processing all the scraped
+items. All items not passing any of the pipelines are dropped. If
+unset, all items are stored without any modifications.
+
+Example configuration of item pipelines:
+```
+config :crawly,
+  pipelines: [
+    Crawly.Pipelines.Validate,
+    Crawly.Pipelines.DuplicatesFilter,
+    Crawly.Pipelines.JSONEncoder,
+    Crawly.Pipelines.WriteToFile [NEW IN 0.6.0]
+  ]
+```
+
+#### CSVEncoder pipeline
+
+It's possible to export data in CSV format if the pipelines are
+defined in the following way:
+```
+config :crawly,
+  pipelines: [
+    Crawly.Pipelines.Validate,
+    Crawly.Pipelines.DuplicatesFilter,
+    Crawly.Pipelines.CSVEncoder,
+    Crawly.Pipelines.WriteToFile [NEW IN 0.6.0]
+  ]
+```
+
+**NOTE**: Set the file extension config for `WriteToFile` to "csv"
+
+#### JSONEncoder pipeline
+
+It's possible to export data in JSON lines format if the pipelines are
+defined in the following way:
+```
+config :crawly,
+  pipelines: [
+    Crawly.Pipelines.Validate,
+    Crawly.Pipelines.DuplicatesFilter,
+    Crawly.Pipelines.JSONEncoder,
+    Crawly.Pipelines.WriteToFile [NEW IN 0.6.0]
+  ]
+```
+
+**NOTE**: Set the file extension config for `WriteToFile` to "jl"
+
+#### WriteToFile pipeline
+
+Writes a given item to a file.
+```
+config :crawly,
+  pipelines: [
+    ...
+    Crawly.Pipelines.JSONEncoder,
+    Crawly.Pipelines.WriteToFile
+  ]
+
+config :crawly, Crawly.Pipelines.WriteToFile,
+  folder: "/tmp",
+  extension: "jl"
+```
+
+**NOTE**: Set the file extension config for `WriteToFile` to "jl"
+
+### middlewares :: [module()]
+
+default: [
+    Crawly.Middlewares.DomainFilter,
+    Crawly.Middlewares.UniqueRequest,
+    Crawly.Middlewares.RobotsTxt,
+    Crawly.Middlewares.UserAgent
+  ]
+
+Defines a list of middlewares responsible for pre-processing
+requests. Any request coming from a `Crawly.Spider` that does not
+pass one of the middlewares is dropped.
+
+### closespider_itemcount :: pos_integer()
+
+default: 5000
+
+An integer which specifies a number of items. If the spider scrapes
+more than that amount, and those items are passed by the item pipeline,
+the spider will be closed. If set to nil the spider will not be
+stopped.
+
+### closespider_timeout :: pos_integer()
+
+default: nil
+
+Defines a minimal number of items which needs to be scraped by the
+spider within the given timeframe (30s). If the limit is not reached,
+the spider will be stopped.
+
+### follow_redirects :: boolean()
+
+default: false
+
+Defines whether the Crawly spider is supposed to follow HTTP redirects or not.
+
+### concurrent_requests_per_domain :: pos_integer()
+
+default: 4
+
+The maximum number of concurrent (i.e. simultaneous) requests that will
+be performed by the Crawly workers.
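The settings above are normally combined into a single `config :crawly` block in `config/config.exs`. The sketch below is illustrative only: the keys are the ones documented in this section, while the concrete values (counts, timeouts, required fields, output folder) are placeholder choices you would tune for your own crawl, not recommended defaults.

```elixir
# config/config.exs -- illustrative combination of the settings described
# above; the values are placeholders, not defaults.
config :crawly,
  # Politeness / speed control
  concurrent_requests_per_domain: 2,
  follow_redirects: true,
  # Stop conditions
  closespider_itemcount: 500,
  closespider_timeout: 10,
  # Request pre-processing
  middlewares: [
    Crawly.Middlewares.DomainFilter,
    Crawly.Middlewares.UniqueRequest,
    Crawly.Middlewares.RobotsTxt,
    Crawly.Middlewares.UserAgent
  ],
  # Item definition and post-processing
  item: [:title, :url],
  item_id: :title,
  pipelines: [
    Crawly.Pipelines.Validate,
    Crawly.Pipelines.DuplicatesFilter,
    Crawly.Pipelines.JSONEncoder,
    Crawly.Pipelines.WriteToFile
  ]

# Pipeline-specific options for WriteToFile, as described above
config :crawly, Crawly.Pipelines.WriteToFile,
  folder: "/tmp",
  extension: "jl"
```

With a configuration along these lines, Validate and DuplicatesFilter use the `item` and `item_id` values, and WriteToFile should write one JSON line per scraped item into a file named after the spider under the configured folder.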
+ +### using crawly with a proxy + +Now it's possible to direct all Crawly's requests through a proxy, +it's possible to set proxy using the proxy value of Crawly config, for example: +``` +config :crawly, + proxy: ":", +``` + +Example usage: +``` +iex(3)> Crawly.fetch("http://httpbin.org/ip") +{:ok, + %HTTPoison.Response{ + body: "{\n \"origin\": \"101.4.136.34, 101.4.136.34\"\n}\n", + headers: [ + {"Server", "nginx/1.7.10"}, + {"Date", "Sat, 03 Aug 2019 18:57:20 GMT"}, + {"Content-Type", "application/json"}, + {"Content-Length", "45"}, + {"Connection", "keep-alive"}, + {"Access-Control-Allow-Credentials", "true"}, + {"Access-Control-Allow-Origin", "*"}, + {"Referrer-Policy", "no-referrer-when-downgrade"}, + {"X-Content-Type-Options", "nosniff"}, + {"X-Frame-Options", "DENY"}, + {"X-XSS-Protection", "1; mode=block"} + ], + request: %HTTPoison.Request{ + body: "", + headers: [], + method: :get, + options: [false, {:proxy, "101.4.136.34:81"}], + params: %{}, + url: "http://httpbin.org/ip" + }, + request_url: "http://httpbin.org/ip", + status_code: 200 + }} +``` diff --git a/documentation/tutorial.md b/documentation/tutorial.md new file mode 100644 index 00000000..d7c650fe --- /dev/null +++ b/documentation/tutorial.md @@ -0,0 +1,371 @@ +# Crawly tutorial +--- + +In this tutorial, we’ll assume that Elixir is already installed on +your system. If that’s not the case, see Installation guide: +https://elixir-lang.org/install.html + +We are going to scrape `https://www.homebase.co.uk`, a website that +contains products of different types. + +This tutorial will walk you through these tasks: +1. Creating a new Crawly project. +2. Writing a spider to crawl a site and extract data. +3. Exporting the scraped data. + +Crawly is written in Elixir. If you’re new to the language you might +want to start by getting an idea of what the language is like, to get +the most out of Crawly. + +If you’re already familiar with other languages, and want to learn +Elixir quickly, the Elixir website +https://elixir-lang.org/learning.html is a good resource. + +## Creating a project + +Before you start crawling, you will have to set up a new Crawly +project. Enter a directory where you’d like to store your code and +run: + +```mix new tutorial --sup``` + +This will create a tutorial directory with the following contents: +```bash +tutorial +├── README.md +├── config +│   └── config.exs +├── lib +│   ├── tutorial +│   │   └── application.ex +│   └── tutorial.ex +├── mix.exs +└── test + ├── test_helper.exs + └── tutorial_test.exs + +``` + +Switch to the project folder: `cd ./tutorial` and update the mix.exs +file with the following code: +```elixir + def deps do + [{:crawly, "~> 0.5.0"}] + end +``` +Now run `mix deps.get` + + +## Our first spider + +Spiders are behaviours which you defined and that Crawly uses to +extract information from a given website. The spider must implement +the spider behaviour (it's required to implement `parse_item/1`, `init/0`, +`base_url/0` callbacks) + +This is the code for our first spider. Save it in a file name +homebase.ex under the lib/tutorial/spiders directory of your project. 
+ +```elixir +defmodule Homebase do + @behaviour Crawly.Spider + + @impl Crawly.Spider + def base_url(), do: "https://www.homebase.co.uk" + + @impl Crawly.Spider + def init() do + [ + start_urls: [ + "https://www.homebase.co.uk/our-range/tools" + ] + ] + end + + @impl Crawly.Spider + def parse_item(_response) do + %Crawly.ParsedItem{:items => [], :requests => []} + end +end +``` + +As you can see, our Spider implements the Spider behaviour and defines +some functions: + +1. base_url: method which returns base_urls for the given Spider, used in +order to filter out all irrelevant requests. In our case we don't want +our crawler to follow links going to social media sites and other +partner sites (which are not related to the homebase website themselves) + +2. init(): must return a KW list which contains start_urls list which +Crawler will begin to crawl from. Subsequent requests will be +generated from these initial urls. + +3. parse_item(): function which will be called to handle response +downloaded by Crawly. It must return the `Crawly.ParsedItem` structure. + + +## How to run our spider + +To put our spider to work, go to the project’s top level directory and +run: + +1. iex -S mix - It will start the Elixir application which we have +created, and will open interactive console +2. Execute the following command in the opened Elixir console: +```Crawly.Engine.start_spider(Homebase)``` + +You will get an output similar to this: + + ```elixir +iex(2)> Crawly.Engine.start_spider(Homebase) + +15:03:47.134 [info] Starting the manager for Elixir.Homebase + +=PROGRESS REPORT==== 23-May-2019::15:03:47 === + supervisor: {<0.415.0>,'Elixir.Crawly.ManagerSup'} + started: [{pid,<0.416.0>}, + {id,'Elixir.Homebase'}, + {mfargs, + {'Elixir.DynamicSupervisor',start_link, + [[{strategy,one_for_one}, + {name,'Elixir.Homebase'}]]}}, + {restart_type,permanent}, + {shutdown,infinity}, + {child_type,supervisor}] + +15:03:47.137 [debug] Starting requests storage worker for +Elixir.Homebase.. + +15:04:06.698 [debug] No work, increase backoff to 2400 +15:04:06.699 [debug] No work, increase backoff to 4800 +15:04:06.699 [debug] No work, increase backoff to 9600 +15:04:07.973 [debug] No work, increase backoff to 19200 +15:04:17.787 [info] Stopping Homebase, itemcount timeout achieved +``` + +## What just happened under the hood? + +Crawly schedules the Request objects returned by the init function of +the Spider. Upon receiving a response for each one, it instantiates +Response objects and calls the callback function associated with the +request passing the response as argument. + +In our case we have not defined any data to be returned by the +`parse_item` callback. And in our the Crawly worker processes +(processes responsible for downloading requests) did not have work +to do. And in the cases like that, they will slow down progressively, +until the switch off (which happened because the Spider was not +extracting items fast enough). + +And if you're wondering how to extract the data from the response, +please hold on. We're going to cover it in the next section. + +## Extracting data + +The best way to learn how to extract data with Crawly is trying the +selectors in Crawly shell. + +1. Start the Elixir shell using `iex -S mix` command +2. 
Now you can fetch a given HTTP response using the following + command: + `{:ok, response} = Crawly.fetch("https://www.homebase.co.uk/our-range/tools")` + +You will see something like: + +``` +{:ok, + %HTTPoison.Response{ + body: "[response body here...]" + headers: [ + {"Date", "Fri, 24 May 2019 02:37:26 GMT"}, + {"Content-Type", "text/html; charset=utf-8"}, + {"Transfer-Encoding", "chunked"}, + {"Connection", "keep-alive"}, + {"Cache-Control", "no-cache, no-store"}, + {"Pragma", "no-cache"}, + {"Expires", "-1"}, + {"Vary", "Accept-Encoding"}, + {"Set-Cookie", "Bunnings.Device=default; path=/"}, + {"Set-Cookie", + "ASP.NET_SessionId=bcb2deqlapednir0lysulo1h; path=/; HttpOnly"}, + {"Set-Cookie", "Bunnings.Device=default; path=/"}, + {"Set-Cookie", + "ASP.NET_SessionId=bcb2deqlapednir0lysulo1h; path=/; HttpOnly"}, + {"Set-Cookie", "Bunnings.UserType=RetailUser; path=/"}, + ...., + {"Set-Cookie", + "__AntiXsrfToken=fd198cd78d1b4826ba00c24c3af1ec56; path=/; HttpOnly"}, + {"Server", "cloudflare"}, + {"CF-RAY", "4dbbe33fae7e8b20-KBP"} + ], + request: %HTTPoison.Request{ + body: "", + headers: [], + method: :get, + options: [], + params: %{}, + url: "https://www.homebase.co.uk/our-range/tools" + }, + request_url: "https://www.homebase.co.uk/our-range/tools", + status_code: 200 + }} +``` + +Using the shell, you can try selecting elements using Floki with the +response. Lets say that we want to extract all product categories links from the +page above: + +``` +response.body |> Floki.find("div.product-list-footer a") |> +Floki.attribute("href") + +"/our-range/tools/power-tools/drills", "/our-range/tools/power-tools/saws", + "/our-range/tools/power-tools/sanders", + "/our-range/tools/power-tools/electric-screwdrivers", + "/our-range/tools/power-tools/tools-accessories", + "/our-range/tools/power-tools/routers-and-planers", + "/our-range/tools/power-tools/multi-tools", + "/our-range/tools/power-tools/impact-drivers-and-wrenches", + "/our-range/tools/power-tools/air-compressors", + "/our-range/tools/power-tools/angle-grinders", + "/our-range/tools/power-tools/heat-guns", + "/our-range/tools/power-tools/heavy-duty-construction-tools", + "/our-range/tools/power-tools/welding" ...] +``` + +The result of running the command above is a list of elements which +contain href attribute of links selected with +`a.category-block-heading__title` css selector. These URLs will be +used in order to feed Crawly with requests to follow. + +In order to find the proper CSS selectors to use, you might find +useful opening the target page from the shell in your web browser. You +can use your browser developer tools to inspect the HTML and come up +with a selector. + +Now let's navigate to one of the Homebase product pages and extract +data from it. + +``` +{:ok, response} = +Crawly.fetch("https://www.homebase.co.uk/4-tier-heavy-duty-shelving-unit_p375180") + +``` + +Extract the `title` with: +``` +response.body |> Floki.find(".page-title h1") |> Floki.text() +"4 Tier Heavy Duty Shelving Unit" +``` + +Extract the `SKU` with: + +``` +response.body |> Floki.find(".product-header-heading span") |> Floki.text +"SKU: 375180" +``` + +Extract the `price` with: +``` +response.body |> Floki.find(".price-value [itemprop=priceCurrency]") |> Floki.text +"£75" +``` + +## Extracting data in our spider + +Let’s get back to our spider. Until now, it doesn’t extract any data, +just makes an `empty run`. Let’s integrate the extraction logic above +into our spider. 
+ +```elixir +defmodule Homebase do + @behaviour Crawly.Spider + + @impl Crawly.Spider + def base_url(), do: "https://www.homebase.co.uk" + + @impl Crawly.Spider + def init() do + [ + start_urls: [ + "https://www.homebase.co.uk/our-range/tools" + ] + ] + end + + @impl Crawly.Spider + def parse_item(response) do + # Extract product category URLs + product_categories = + response.body + |> Floki.find("div.product-list-footer a") + |> Floki.attribute("href") + + # Extract individual product page URLs + product_pages = + response.body + |> Floki.find("a.product-tile ") + |> Floki.attribute("href") + + urls = product_pages ++ product_categories + + # Convert URLs into Requests + requests = + urls + |> Enum.uniq() + |> Enum.map(&build_absolute_url/1) + |> Enum.map(&Crawly.Utils.request_from_url/1) + + # Create item (for pages where items exists) + item = %{ + title: response.body |> Floki.find(".page-title h1") |> Floki.text(), + sku: + response.body + |> Floki.find(".product-header-heading span") + |> Floki.text(), + price: + response.body + |> Floki.find(".price-value [itemprop=priceCurrency]") + |> Floki.text() + } + + %Crawly.ParsedItem{:items => [item], :requests => requests} + end + + defp build_absolute_url(url), do: URI.merge(base_url(), url) |> to_string() +end + +``` + +If you run this spider, it will output the extracted data with the log: +``` +17:23:42.536 [debug] Scraped %{price: "£3.99", sku: "SKU: 486386", title: "Bon Safety EN20471 Hi viz Yellow Vest, size XL"} +17:23:43.432 [debug] Scraped %{price: "£3.99", sku: "SKU: 486384", title: "Bon Safety EN20471 Hi viz Yellow Vest, size L"} +17:23:42.389 [debug] Scraped %{price: "£5.25", sku: "SKU: 414464", title: "Toughbuilt 24in Wall Organizer"} +``` + +Also you will see messages like: +``` +17:23:42.435 [debug] Dropping request: https://www.homebase.co.uk/bon-safety-rain-de-pro-superlight-weight-rainsuit-xxl_p275608, as it's already processed +17:23:42.435 [debug] Dropping request: https://www.homebase.co.uk/bon-safety-rain-de-pro-superlight-weight-rainsuit-l_p275605, as it's already processed +17:23:42.435 [debug] Dropping request: https://www.homebase.co.uk/bon-safety-rain-de-pro-superlight-weight-rainsuit-xl_p275607, as it's already processed +``` +That's because Crawly filters out requests which it has already +visited during the current run. + +## Where the data is stored afterwords? + +You might wonder where is the resulting data is located by default? +Well the default location of the scraped data is under the /tmp +folder. This can be controlled by the `base_store_path` variable in +the Crawly configuration (`:crawly`, `:base_store_path`). + + +## Next steps + +This tutorial covered only the basics of Crawly, but there’s a lot of +other features not mentioned here. + +You can continue from the section Basic concepts to know more about +the basic Crawly features. 
diff --git a/mix.exs b/mix.exs
index 3eb8d052..5d53db49 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,10 +1,12 @@
 defmodule Crawly.Mixfile do
   use Mix.Project
 
+  @version "0.7.0-dev"
+
   def project do
     [
       app: :crawly,
-      version: "0.6.0",
+      version: @version,
       name: "Crawly",
       source_url: "https://github.com/oltarasenko/crawly",
       elixir: "~> 1.7",
@@ -13,6 +15,7 @@ defmodule Crawly.Mixfile do
       test_coverage: [tool: ExCoveralls],
       start_permanent: Mix.env() == :prod,
       elixirc_paths: elixirc_paths(Mix.env()),
+      docs: docs(),
       elixirc_options: [warnings_as_errors: true],
       deps: deps()
     ]
@@ -60,4 +63,34 @@ defmodule Crawly.Mixfile do
       {:excoveralls, "~> 0.10", only: :test}
     ]
   end
+
+  defp docs do
+    [
+      source_ref: "v#{@version}",
+      logo: "documentation/assets/logo.png",
+      extra_section: "documentation",
+      main: "quickstart",
+# assets: "guides/assets",
+      formatters: ["html", "epub"],
+# groups_for_modules: groups_for_modules(),
+      extras: extras()
+# groups_for_extras: groups_for_extras()
+    ]
+  end
+
+  defp extras do
+    [
+      "documentation/quickstart.md",
+      "documentation/introduction.md",
+      "documentation/ethical_aspects.md",
+      "documentation/installation_guide.md",
+      "documentation/tutorial.md",
+      "documentation/basic_concepts.md",
+      "documentation/settings.md",
+      "documentation/http_api.md"
+
+    ]
+  end
+
+
 end
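A quick way to check the result of the new `docs` configuration locally (assuming `ex_doc` is present among the project's dev dependencies) is to build the documentation with the standard ex_doc Mix task:

```
mix deps.get
mix docs
```

By default ExDoc writes the HTML output to the doc/ directory (open doc/index.html to review the guides listed in extras()); with the "epub" formatter enabled it should also produce an EPUB file in the same directory.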