We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
With concurrency set to 50, the crawler produces a huge number of duplicated links.
If you don't believe it, try it yourself.
# Crawl video detail pages with the `gain` spider framework and store one row
# per page in a local SQLite database through SQLAlchemy.
import ast
import re

import requests
from gain import Css, Item, Parser, Spider, cssParser, Xpath
from pyquery import PyQuery as pq
from sqlalchemy import Column, Integer, String
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Seconds to wait for the play-data request. The call is blocking, so without
# a limit a single stalled response would hold up the whole crawl.
_REQUEST_TIMEOUT = 10

# Attributes that must be present on an item before it is persisted; these are
# also the column names of `videoInfo`.
_REQUIRED_FIELDS = (
    'videoTitle',
    'videoType',
    'videoAuthor',
    'videoNotes',
    'videoLang',
    'videoRegion',
    'videoPlayPage',
    'videoPlayLink',
)

engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()


class videoInfo(Base):
    """ORM row holding the scraped fields of one video page."""

    # NOTE(review): the table is named 'users' although it stores video
    # records; kept as-is so existing databases keep working.
    __tablename__ = 'users'

    id = Column(Integer, primary_key=True)
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)
    videoPlayLink = Column(String)


Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)


def _link_texts(index):
    """Build a processor joining the link texts of the paragraph at *index*.

    When that paragraph contains no ``<a>`` elements the processor falls back
    to the text of the whole selection.
    """
    def process(paragraphs):
        links = pq(paragraphs[index])('a')
        if len(links) > 0:
            return ' '.join(pq(link).text() for link in links)
        # NOTE(review): this returns the text of *all* matched paragraphs, not
        # just the one at `index` - confirm that is intended.
        return pq(paragraphs).text()
    return process


def _paragraph_text(index):
    """Build a processor returning the text of the paragraph at *index*."""
    def process(paragraphs):
        return pq(paragraphs[index]).text()
    return process


def _format_play_pages(anchors):
    """Render each anchor as ``link----<text>----<href>``, one per line."""
    return '\n'.join(
        'link----' + pq(anchor).text() + '----' + pq(anchor).attr('href')
        for anchor in anchors
    )


class getVideoInfo(Item):
    """Item describing one video page; `save` writes it to the database."""

    def filterPlayLink(link):
        """Download the play-data script and list its play entries.

        Used as a plain function (no ``self``) while the class body is being
        evaluated. ``link`` is indexed with ``[0]`` to obtain the script path,
        which is appended to the site root.

        Returns one ``player----<player>----<line>----<url>`` line per entry,
        each terminated by a newline.

        Raises:
            IndexError: if the downloaded text contains no ``[[...]]]`` array.
            ValueError / SyntaxError: if that array is not a plain literal.
        """
        url = 'http://www.xinxin46.com%s' % link[0]
        content = requests.get(url, timeout=_REQUEST_TIMEOUT).text
        raw_array = re.findall(r'\[\[.*?\]\]\]', content)[0]
        # SECURITY: this text comes from a remote server. It used to be passed
        # to eval(), which would execute arbitrary code embedded in the page.
        # ast.literal_eval accepts only literals and rejects anything else.
        entries = ast.literal_eval(raw_array)[0][1]
        rows = []
        for entry in entries:
            line, play_url, player = entry.split('$')
            rows.append(
                'player----{}----{}----{}\n'.format(player, line, play_url)
            )
        return ''.join(rows)

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) p', process_func=_link_texts(0))
    videoAuthor = Css('.intro > li:nth-child(1) p', process_func=_link_texts(1))
    videoNotes = Css('.intro > li:nth-child(1) p', process_func=_paragraph_text(2))
    videoLang = Css('.intro > li:nth-child(1) p', process_func=_paragraph_text(3))
    videoRegion = Css('.intro > li:nth-child(1) p', process_func=_paragraph_text(4))
    videoPlayPage = Css('.play-list li a[href^="/player/"]',
                        process_func=_format_play_pages)
    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src',
                          process_func=filterPlayLink)

    async def save(self):
        """Insert this item as a `videoInfo` row and commit.

        Does nothing unless every name in `_REQUIRED_FIELDS` is present on the
        item. A failed commit is rolled back before the error is re-raised so
        the shared module-level session stays usable for later items.
        """
        if not all(hasattr(self, field) for field in _REQUIRED_FIELDS):
            return
        record = videoInfo(
            **{field: getattr(self, field) for field in _REQUIRED_FIELDS}
        )
        session.add(record)
        try:
            session.commit()
        except SQLAlchemyError:
            session.rollback()
            raise


class MySpider(Spider):
    """Spider configuration: start page, request settings and link parsers."""

    concurrency = 50
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
        cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
    ]


# Close the session even when the crawl aborts with an exception.
try:
    MySpider.run()
finally:
    session.close()

# Debugging aid: fetch a player page with requests and inspect
# pq(html)('script[src^="/playdata/"]') to see which script tag is matched.
The text was updated successfully, but these errors were encountered:
No branches or pull requests
With concurrency set to 50, the crawler produces a huge number of duplicated links.
If you don't believe it, try it yourself.
The text was updated successfully, but these errors were encountered: