From dd28e51fef60a968df6f5cf0322c67f6475bb45b Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Sun, 29 Oct 2017 02:09:02 +0530 Subject: [PATCH] Initial commit for Youtube support --- README.md | 4 ++-- app/scrapers/__init__.py | 6 ++++-- app/scrapers/youtube.py | 28 ++++++++++++++++++++++++++++ app/server.py | 7 ++----- app/static/images/youtube_icon.png | Bin 0 -> 7858 bytes app/templates/index.html | 1 + 6 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 app/scrapers/youtube.py create mode 100644 app/static/images/youtube_icon.png diff --git a/README.md b/README.md index fb52494b..9e13505c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Dependency Status](https://david-dm.org/fossasia/query-server.svg)](https://david-dm.org/ossasia/query-server) [![Join the chat at https://gitter.im/fossasia/query-server](https://badges.gitter.im/fossasia/query-server.svg)](https://gitter.im/fossasia/query-server?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -> The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask, DuckDuckGo, Yandex, Baidu and Exalead) and get the results as `json` or `xml`. The tool also stores the searched query string in a MongoDB database for analytical purposes. (The search engine scrapper is based on the scraper at [fossasia/searss](https://github.com/fossasia/searss).) +> The query server can be used to search a keyword/phrase on a search engine (Google, Yahoo, Bing, Ask, DuckDuckGo, Yandex, Baidu, Exalead, Quora and Youtube) and get the results as `json` or `xml`. The tool also stores the searched query string in a MongoDB database for analytical purposes. (The search engine scrapper is based on the scraper at [fossasia/searss](https://github.com/fossasia/searss).) [![Deploy to Docker Cloud](https://files.cloud.docker.com/images/deploy-to-dockercloud.svg)](https://cloud.docker.com/stack/deploy/?repo=https://github.com/fossasia/query-server) [![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy?template=https://github.com/fossasia/query-server) [![Deploy on Scalingo](https://cdn.scalingo.com/deploy/button.svg)](https://my.scalingo.com/deploy?source=https://github.com/fossasia/query-server#master) [![Deploy to Bluemix](https://bluemix.net/deploy/button.png)](https://bluemix.net/deploy?repository=https://github.com/fossasia/query-server&branch=master) @@ -23,7 +23,7 @@ The API(s) provided by query-server are as follows: ` GET /api/v1/search/?query=query&format=format ` -> *search-engine* : [`google`, `ask`, `bing`, `duckduckgo`, `yahoo`, `yandex`, `baidu`, `exalead`] +> *search-engine* : [`google`, `ask`, `bing`, `duckduckgo`, `yahoo`, `yandex`, `baidu`, `exalead`, `quora`, `youtube`] > *query* : query can be any string diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 535e9042..9c19bb4d 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -10,6 +10,7 @@ from baidu import Baidu from exalead import Exalead from quora import Quora +from youtube import Youtube scrapers = { 'g': Google(), @@ -20,7 +21,8 @@ 'yd': Yandex(), 'u': Baidu(), 'e': Exalead(), - 'q': Quora() + 'q': Quora(), + 't': Youtube() } @@ -34,7 +36,7 @@ def small_test(): def feedgen(query, engine, count=10): - if engine == 'q': + if engine in ['q', 't']: urls = scrapers[engine].search_without_count(query) else: urls = scrapers[engine].search(query, count) diff --git a/app/scrapers/youtube.py b/app/scrapers/youtube.py new file mode 100644 index 00000000..0397637e --- /dev/null +++ b/app/scrapers/youtube.py @@ -0,0 +1,28 @@ +from __future__ import print_function +from generalized import Scraper + + +class Youtube(Scraper): + """Scraper class for Youtube""" + + def __init__(self): + self.url = 'https://www.youtube.com/results' + self.queryKey = 'search_query' + + def parseResponse(self, soup): + """ Parse the response and return list of urls + Returns: urls (list) + [[Tile1,url1], [Title2, url2],..] + """ + urls = [] + for a in soup.findAll('a'): + if a.get('href').startswith('/watch?'): + link = 'https://www.youtube.com' + str(a.get('href')) + if not a.getText().startswith('\n\n'): + urls.append({'title': a.getText(), 'link': link}) + else: + continue + + print('Youtube parsed: ' + str(urls)) + + return urls diff --git a/app/server.py b/app/server.py index 24e367ae..3b839c21 100644 --- a/app/server.py +++ b/app/server.py @@ -40,7 +40,7 @@ def search(search_engine): engine = search_engine if engine not in ('google', 'bing', 'duckduckgo', 'yahoo', 'ask', - 'yandex', 'ubaidu', 'exalead', 'quora'): + 'yandex', 'ubaidu', 'exalead', 'quora', 'tyoutube'): err = [404, 'Incorrect search engine', qformat] return bad_request(err) @@ -49,10 +49,7 @@ def search(search_engine): err = [400, 'Not Found - missing query', qformat] return bad_request(err) - if engine[0] == 'q': - result = feedgen(query, engine[0]) - else: - result = feedgen(query, engine[0], count) + result = feedgen(query, engine[0], count) if not result: err = [404, 'No response', qformat] return bad_request(err) diff --git a/app/static/images/youtube_icon.png b/app/static/images/youtube_icon.png new file mode 100644 index 0000000000000000000000000000000000000000..5924c8d653dbaaf8b603bf33f21435fd381681b4 GIT binary patch literal 7858 zcmZ8`2|SeB|NnWOd4?J5G)f|7>|~qDQr3q`NTrl5H>#Vh1<6e!4+$+dQ@UuewU83p zloX?6DOoBKVM5u;knGI-&gkCP_y79G>#U#U{ds>r=ZrI*a@w?BQcPY903c~^w{|N4 z7#yOF6cX%y1}h`5!+fnbS_5z|QG9k68P1WAt?O+-e%;c307$^eaoak$_)bNofvNmL zPO)sU;#WP*-|g=~ld8&VMMjs72K;@i#C*9#sYPA&le#Krv0}TH+GkC*ayi9T4K+A@ zqsZ*h)r5l{9Su0>(ba(CI)?H`HPuRarC0JwaHUgQ{e_%xT)mV57rD#Sp$z=3R8Z>B zQirCX40QBcw@QJ*)mHy9FY{WF*{vfq_*GW}u0U*1{mU{9hzAbZwbY?HcbP`T65--+ zo2{B^5XKuNp(2!rl3(Fek@-nO6;7cShy>!PlxIM77&~+a6`_S%B_{L>!N36w5L$p} zKWV^7m0IU@p)WXvTQG1K9aM*$K$~rvY8^1ZB}#8$)at6x6vWW1rV0TxDk}>F0w_?U z$b>Rbpi+SWEkm*2#)aVJl|E~!3xO>WMpVyKhPri3Wf*&tnkqC^wUqHiM;J29myl{X zMF^=uS-D$R143<7R(Yks_@Ju#UR9+s5Ppnd{9+sP*!=bq5@-tm^+uL zzmjKkLiXg9x^*>QD>2)(;fj#DJ}$RMPnbj)4rBmULM=lXCJ_eMrK{1bt_n%}Rb4$= zRXtu)-QU&4pPTFb@L{Z``e#+u?8wNM%a^9QyT?XGdfDucW~S4VlS6%dlVfANfq~hX znV&;Lqt(@eJv~Etd7Uva-)d^UoQP>MFlaM3`yAlk=Df9IUqH)pgYJ9xn$0cV>l?f` zGyk-8YuDk!U!Om3-|pP%>eiiovr*5W-QArVa=6XSy=?`nbN{}cd-pmI9j@2a|5{z$ zbN}AA>gw;`zJ0%w*W%zX-qghP^&Lo0|9b!4=g`pM;-{0}zmIfv^~A@0iH+&WzS$KP zrd*!A4W5Cp!&^5w101IjI9^7S;W&#)FbN#~u*%>++`BW#yR6JR!--eVD`RDJ2Gs}g zXgpp9;Y3R$3hFcPg2YZMRJ#D8M^A6Ilz7|F{Eye69Uj>uq7JKicE3czzA#eA(xs<$c%f;RV|5 zLVBjdA_|*Kp|vk`b__HgYTuN;zCSpe_k1G#^R^uPg{}Gv|8u#K2=e z{0D2le@Hu4Hd-nm-WHF}jn}4`UDqXLwuC2M_5Zr1>DiD@D=&1r;G>Jnq@a25Lb+kV z)rhq&n^&)Wzp3pGUu*SyzIe{;$AWb8RM+qBBa?e;?`Tv$d2XY3Bc{%5 zHsA{X@w93016KZ(QrD%WemhG)2uvcrOHZC_{P=EJ@3JL#Yz_L)Y3sz!Dfcl(Yoe-u z^mgoZ^ia%`%ynB8q1Yh#st)RgO=j(R+~;zx{px6XT34rY%kuHV7u(M2?H?PA<5w7X zmCn{$5p;!cNohUG;4`mVGc7!~<7p|9)Talhq_l4x8g`XVI~Mi*)R)>UdY){|d4=%7 zbFE*bEv;Q+6xZK9bF91Y$k_PTounrl3L`@wid`)oTIS2p*v`EjniIcz_TwCj)gIcv zKJ&=fp^r;$d!!GyMR(kBDdw=wY6UKY)#OmcP|GxIUd?Z4#`1I;?^^NZ zvVKgU4u{NwzDRu(Gxc6iwOnXESGh7up)Wz<3_rA7)wS5JVfh`gIZN)ZQBUQ*Tcf8- zu)LzVH92X;Pcuyf$tA`t(d5M=K3vl%{;qS$MChwy6M}Azr*=yk(Xm~hTAQ_f-Spec zrhL)bdri$z!^W9|g90Oc^5Jr$=O>@c@N96`h1~aq!XM@)$_-;W3BR1t@PZ3IR=#VYgjXZY3tnr@H~!kv+3t%EYNU`)?F8d zsHb17({4RWsV5^>qJIV)Ag}6GlZmC9<=!+Prw0&D0Xap#*5KbnZ#KW)OiDWt;dIFN z@i2|uZ(U;~U^UphJn?SdbP}heR?2~w??d9Q#I?$jE}IKX6cp9C-!!7c8hV&W%2P5W zTJP6FW<}`r9EjB?UF5rwvEAAN19Xk2f&~VGjrdxePi52_*S-W|2Mkx**}mimcnnG_Z46BNT(ZDi7JY%affd!p|KpaTKS&=WHhJ-0i_m z-7p}~%gi8cb)Mj%X#pmT{=G81k=Rg6dC`Hn`K@Iao-Cz+;C4?lP++2L*D|52H0io?uc zqan2P73>3ZN0w&x<6Qyf;**N_!0k?wp@QAcrV?V}vZNZX<2D0Ll~gMjFo`iRRl)b-0bJX2O1%CG}T;u;;IMau27W z0A;xs3BhSjCpDwb%>*#xx_v~rKeAElXOTWY zg8YI8(b-F*m#MMKyX`Yy`Jln1Vd4@lCtVvRRqe<;t1h%XB(gG^)zN2-Ck(Da05ZIk zwpIlPRmp2C@~;WS#8hY)W%7Do(?F_{&Q)&d+zSy5*r@uen?$7l^}ne1+e#Jkq@ zG6nmYor|z&lyy+XR?jeT_tZPhx5T-Dw57zlZ(DSu#=Hco$l-S?h}I$Sm(^^iKnJPa z0)~tduaUhch9;zszh?Gbbj~rS?pJyx&KZgq^e?Aou1TigjU?7XZ1w@h5=Dl$6T3yc z{ixN_rQS;)c1osW`qm&Wg_!MBCyK88)Ey2kv(0kEyu+w+4#fF}ozCd6{HWG`V4FiBr4gBk+#l%ADl31(hgRT>&Zw09>a0=US!tabh38@l?^Y#rdChbgS#T!joDl714j>3jmUPKejF`vJpxI6i1jE*-JqgQ z1Hf@7YR)(dV#PTA-WbyZWa}byd?gab(LMbw za6p=KQJv$8-D)Fscao>cV%g_`Bjlm}xp%~eZa2hE4LGL&Dh_s&y<6#QM)*5Qt{Dhq zoq-nN9CGNhoL@jn_mll9&G`j=r@lIcvbTY317xauvw|8*(*P@z32SQRIq2>1m$>fD!cOFP^^ z#{s&%sQ1K6fm0Tk+C!(cmsA~>;;6lo!vBFv*UYwn7%cNO#5v2}B2IuC3?cv)gFlr9 zk|;DdCa_p(&XtWSHsH2CGL@w%X@0{bQG~;US_4832{y)b0k;hhIul?Nd~6=!p2dJJ zjsLCg?S$Hha2-M@iwJpW2@CmJ&5KT5lLtPy!R#`V*P*l(z$h8ii;9C9(adH#6G8Ex z$gM}%-%};GGml7Y5V!KHs(rnba5al_Ig)=&VV&u>Kl`g8?^GjoA-AdrQG{ z+D2zfA8%2&ZO?X_TF2m12_H8|lv2msm%@A6HYQD#m=cT;{CS=5~YCc!*jurz7-ZoFpRtdaaNRCW?8en^X z0WWxA;K0rRjLqlr#$sYR1>O?ci=BX-o^TOX4-?#wB}9I6JAcpRccAl`@&teJ_-#*9 zerFP1idM~Km>o=^2axxGsm6idixcal@HHEV-VYy8(~1PVhO)3HnPhc%p^MIA z(a!~JCm`?n8_O~v2~!RUUPOq@WBIrp*l7tDr+S50wZPy1?c2hDxgq)afbA$`_ouHK z+}8w>^RcKcBr4%PJDwyiLwH>abC4CzBRW;{4^2=BaQHX#X~0l;ge|`~hy#Bt^OCT7 zF=89qP&;3i%Md>wYPD#_p<*&&C|qPm?S%Vq_~F0(0C>4u_Fg-p2@GbIoX)95{0>s9 zGmh5n%?L4&0f^S?c|m!^?+_HU=z1p(sKNpe9(fc;5x>9YE8r9`ESbkaX2Sd72vzPP zKxjlYJU+16lg54n@wVKtBMX%r3GnIJJq)jn_*5lLUh+H-l6D{})ezw#F~XqJqF7y3v>YxtfSQ7hB83Y1KsoH z*Gj<8JmHo#sqomuhI4Ss2uL6Qtj_ZQ$Io{iWi^i+DKA+I|Ez#^u*?nPQ8lk*L5zsE zU{wDRpcw+8hA}v%7=s1#Atrf#Vmt>{kOu#EUUKF&I^`C7%%- zN`{}SFe;+(-!^QxJO9rA%>6bx^RH1WUbq;J7A*5SK zi{RNA4S-1sZN~oFLRb1JLV(Al%?AUhVEH_p;Q4kJX}CP`$tDzEZ@z7^TX3N}!w!@t zbsfDmT-9mh5Dn8Hqck0(qXtcK)(E<$DgOqu>tF3c!BJORK#>PkUVx(TyCt3{mS~`O z1Nd$+oVP&*R7m`XCL;{2aHaw2InjUM0fPnbZ{uh_HV7HoLgHo#SjgnT)4f>-8Br-r z3|1aAOe0Rk^UMC6owq57{UXdbcT!Y>V^PTj*Ei3Z^2k3KEztv$7`Qh3BXr0_#WfcMPP2z7M!wpeEAEiaPav=#zK$X?AJs5L8SQ=G{ z4qJ-lyoa=2TQX{iBrGB3GFZDXrXSqP#<+u8SL02?W;doRS4CF6SPE;E7oE67KVDH8 zWTD(iRUo8E6~Mn#_f|2G1bHIGGT;^kGP8=mO%nH!Iwi^($+_Z!4dhjry)M82Twu_foAI3a9DSjny4sS2to*RT#1DXAw~4YL%jM)@Pa#M z6?*KWADC5hGKJE)oy$?DZO6=i)n6rZH)~(E*r*s4a(w{`@6@GHmnArn8e68p_o%rm}^CUz|!COE?<4DdU;CX<4Q4)F8_Y%$E3o7yQVvV1LPNSp&;bK{5Ka5 zEkIT3I(x|2d&?>cUW^u2jjV^Kz`OoTWz2jrZ>~ZMIl5v>pekd02TO?v%=8|2jO^>g z{j3nJbr68*QsU0rYr-ShfAOW2D0{EZ7Fd6wv;C=#0Dh!A7q#3&hgS*=rFJ_QGCX^Y z44Hp9bd|yx$=H76xZq%0~Aie zOfAW#IPegVO%;v3k3&t{UvTtptT8$78_YI&mj5(8n zhCYJet2*%M63-wTE};A=gtC;Mf;eJbBFvQwC~7j$`O6_gI;Hz!KjIdsL8}ahFQC*g z*%tUX8q5*pTqkH%#X^%?aEhN${p{dM8A|s+E!=R?fR@v-n6RM*AYtEJE%+?hh)Yb= zOK@DAqnSlQ_&u7@j^vmp8{spMypq)5yNp&5cL}M6Yt_Q{((5F=5>?Iav9 z*cZW`fDceJ9Z~OS*#E;IRZlyLpe;_2F*x5|R|X9cdQE6h*(dzxP=PGSAlXb8xrlQP z_>gK!?s|X-3DAFMT$xCOa<_Y}b_nj^tE32&%Sb&TT%5BJP$GnF!h9tV zU97fujVe-oa5*bsTFSf6g2O}lpJd*H9+%tM?css@`%}(aulUO-jQ?Dec)|nEM2TwQ z>YK;F-w%Erk^IBZ$gZNU5_$<>CzMu_*zLRavt!LJL$--xYr4o~7?-w)D0S7%oJ-)( zeH*3dg+;le83$Oa?KpAYVATRUPO?UQSsTG7qT%;U#>|U5lOBV1k=TWK2c~CBBq=u^ zXH9`Hd78t}IoUcW=X8j47h|^VC7gDbc$f-G7fReLdhiJR^8B>}cmLzVdk=7b-&>l) zV5@9>*&=`%0QaKig;_V6iz4_BES+O74D7Q2E+C-cjO{IB2CAM_{aRUBca_-?J@yG>}Yi#PMXwRE>+Zc(Y~6Ss91Dt_t;0<4U8kTaRuw zgF*k9>7-)vXl&Lua)@Me)bqo~4lt=jbLeT!Yu{wNP3u%r+j0Jbo7rOO_a4q6!y4d4 zDI#}NX7~p_lJ@!hXojAVyH@ngG%7y6>H5hR$9$W$O_t~{{Jc@oOyFLx-+j`it?7L+ zcp?d`1HoC3Jscg9(-&1_HSJ4_7zj;K&hT&y*%x6k~UZ+5Ce5(C^6bO zUWMqdIH+~;OtQ}$)SYL+XwG#pv^5lk0%F`fzyjW0t`I---Ry{)u^3xyL z`e_2)L6eBS`y~&C+(Rx;tuYz%ZGU(;NeBE9(t0_sBs{s8(r|aQ%y97Ojz2{DPJbN# zlN?m6fHaVg_`c#E02Rk(c%yh<$8?#Y4*O<7=sItykAn0(H(i9~J1F?-{>2UTKBznR z=Ct3af0V@G7XK>`Vg`!anxgBL^<@aQ=p+~84dqsXefM|ocu=!0MymLcrjGezG;?K} z;B=lu_;jFPLFRMgyEU(E(9GutdAu5j8@_KfK1%FX5;V3&zwtDogwL*r-($adbRx4V zswa)(!^1MkR~#Z{tD1ocmbJRKpoU)fP0W=p|5x*A-31o%;P1JS&d*6VZ}cAGku&R# z@Y|Aos2;6RrTfGmwWrq&c=<1A(Lib60|mb*VRjUsC%Z$uWM7?r>+4SD7Ve$XxgX@R zzoY2;xK(U}@gECDhBrS`m8;`u7?5Jp=St6hNu;>^nlLcy+z=y|{6}M%&&fSryBKm( zvOMJ(B4nik?&eSSF6k~(wAh|&*}W=iKro>#*tJhXU~)_n*LiHDdBwH7@olSd!Tq-@ j1xMyWMYK){L`(W3^}n8yc?SP)4cM>Sv^L+yJLdlYv5Ye} literal 0 HcmV?d00001 diff --git a/app/templates/index.html b/app/templates/index.html index 057961f5..61dccc50 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -64,6 +64,7 @@

query-server

+