Repeated bug #39

allphfa · 2017-12-21T02:11:23Z

With concurrency set to 50, the spider produces an extremely large number of duplicated links.

If you don't believe it, try it yourself with the script below:

from gain import Css, Item, Parser, Spider, cssParser,Xpath
from pyquery import PyQuery as pq
import re
import requests

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String


# SQLite database file (absolute path) that receives the scraped rows.
DATABASE_URL = 'sqlite:////home/dde/test.db'

# Declarative base class the ORM model in this script inherits from.
Base = declarative_base()
# echo=False: do not log the emitted SQL statements.
engine = create_engine(DATABASE_URL, echo=False)

class videoInfo(Base):
    """ORM model for one scraped video record.

    Every text column is filled from the same-named field of a
    ``getVideoInfo`` item when that item is saved.

    NOTE(review): the table is named 'users' although the columns describe
    videos -- confirm whether this is intentional before renaming, since any
    existing database file already uses this table name.
    """
    __tablename__ = 'users'
    # Integer primary key of the row.
    id = Column(Integer, primary_key=True)
    # Scraped text fields; all are plain strings without length limits,
    # uniqueness constraints or indexes, so identical rows can be stored
    # more than once.
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    # Newline-separated 'link----<text>----<href>' entries.
    videoPlayPage = Column(String)
    # Newline-terminated 'player----<player>----<line>----<url>' entries.
    videoPlayLink = Column(String)

# Create the tables of every model registered on Base if they are missing.
Base.metadata.create_all(engine)

# Session factory bound to the engine, plus the single module-wide session
# that the item's save() uses and that is closed at the end of the script.
Session = sessionmaker(bind=engine)
session = Session()



class getVideoInfo(Item):
    """Item scraped from one player page and stored as a ``videoInfo`` row.

    Each ``Css``/``Xpath`` attribute declares one scraped field; the field
    names match the ``videoInfo`` columns that ``save`` writes.
    """

    def filterPlayLink(link):
        """Download the play-data script and format its play entries.

        Used as the ``process_func`` of ``videoPlayLink``, so it is a plain
        function receiving the selector result, not a method.

        Args:
            link: Value handed over by the Xpath selector (presumably the
                list of matched ``src`` values -- confirm against gain).
                Only ``link[0]`` is used; it is appended to the site root.

        Returns:
            One ``player----<player>----<line>----<url>`` line (newline
            terminated) per play entry, concatenated into a single string.

        Raises:
            IndexError: If ``link`` is empty or the page contains no
                play-data array.
            ValueError: If the array is not a plain literal, or an entry
                does not consist of exactly three ``$``-separated parts.
            SyntaxError: If the array text cannot be parsed at all.
        """
        # Function-scope import keeps this block self-contained.
        import ast

        url = 'http://www.xinxin46.com%s' % link[0]
        # Without a timeout a stalled server would block this call forever.
        # NOTE(review): this is a blocking request made from code driven by
        # an async spider -- confirm that is acceptable at concurrency 50.
        content = requests.get(url, timeout=30).text
        rawArray = re.findall(r'\[\[.*?\]\]\]', content)[0]
        # SECURITY: this text comes from a remote server. It used to be
        # passed to eval(), which would execute arbitrary code embedded in
        # the response; literal_eval only accepts plain literals.
        entries = ast.literal_eval(rawArray)[0][1]
        formatted = []
        for entry in entries:
            line, playUrl, player = entry.split('$')
            formatted.append(
                'player----{}----{}----{}\n'.format(player, line, playUrl))
        return ''.join(formatted)

    def _linkTextOrAll(index):
        """Build a process_func joining the <a> texts of paragraph ``index``.

        When that paragraph contains no <a>, the text of the whole selection
        (all matched paragraphs) is returned instead.
        """
        def process(pqObj):
            anchors = pq(pqObj[index])('a')
            if len(anchors) > 0:
                return ' '.join([pq(x).text() for x in anchors])
            return pq(pqObj).text()
        return process

    def _paragraphText(index):
        """Build a process_func returning the text of paragraph ``index``."""
        def process(pqObj):
            return pq(pqObj[index]).text()
        return process

    def _playPageLinks(pqObj):
        """Format every matched link as ``link----<text>----<href>``."""
        return '\n'.join(
            ['link----' + pq(x).text() + '----' + pq(x).attr('href')
             for x in pqObj])

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    # The following five fields read successive <p> elements (indexes 0-4)
    # of the same intro list item.
    videoType = Css('.intro > li:nth-child(1)  p',
                    process_func=_linkTextOrAll(0))
    videoAuthor = Css('.intro > li:nth-child(1)  p',
                      process_func=_linkTextOrAll(1))
    videoNotes = Css('.intro > li:nth-child(1)  p',
                     process_func=_paragraphText(2))
    videoLang = Css('.intro > li:nth-child(1)  p',
                    process_func=_paragraphText(3))
    videoRegion = Css('.intro > li:nth-child(1) p',
                      process_func=_paragraphText(4))
    videoPlayPage = Css('.play-list li a[href^="/player/"]',
                        process_func=_playPageLinks)

    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src',
                          process_func=filterPlayLink)

    async def save(self):
        """Insert the scraped fields as one ``videoInfo`` row and commit.

        Nothing is written unless every field attribute is present on the
        instance (presumably the framework leaves a field unset when its
        selector yields nothing -- confirm against gain).

        Raises:
            Exception: Whatever the database operation raised; the shared
                session is rolled back before the error is re-raised.
        """
        fieldNames = (
            'videoTitle',
            'videoType',
            'videoAuthor',
            'videoNotes',
            'videoLang',
            'videoRegion',
            'videoPlayPage',
            'videoPlayLink',
        )
        if not all(hasattr(self, name) for name in fieldNames):
            return

        row = videoInfo(**{name: getattr(self, name) for name in fieldNames})
        try:
            session.add(row)
            session.commit()
        except Exception:
            # A failed flush/commit leaves the module-wide session in an
            # error state; without a rollback every later save() would fail.
            session.rollback()
            raise




class MySpider(Spider):
    """Spider configuration for crawling the listing at ``start_url``.

    The issue text reports that this configuration, with concurrency 50,
    yields many duplicated links.
    """
    # Setting under which the duplicated links are reported.
    concurrency = 50
    # Encoding name handed to the framework (presumably used to decode
    # fetched pages -- confirm against gain).
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    # Three link selectors, each reading the href attribute:
    #   1. pagination links of the listing,
    #   2. links to individual video pages,
    #   3. links to player pages; only this one is given the getVideoInfo
    #      item class.
    # NOTE(review): how gain follows and de-duplicates these links is not
    # visible here -- verify against the library.
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]',attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]',attr='href'),
               cssParser('.play-list a[href^="/player/"]',getVideoInfo,attr='href'),
               ]


# Run the crawl. The module-wide session is closed afterwards even when the
# crawl raises, so the database connection is not leaked on failure.
try:
    MySpider.run()
finally:
    session.close()

# Disabled scratch snippet (a bare string, never executed): fetches a single
# player page and prints its play-data <script> tag.
'''
import requests

a= requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''

allphfa closed this as completed Feb 8, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repeated bug #39

Repeated bug #39

allphfa commented Dec 21, 2017

Repeated bug #39

Repeated bug #39

Comments

allphfa commented Dec 21, 2017