-
Notifications
You must be signed in to change notification settings - Fork 1
/
tudouscraper.py
54 lines (47 loc) · 1.77 KB
/
tudouscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import json
import os
import re
import subprocess
import sys
import urllib.parse
import urllib.request

import wget
from bs4 import BeautifulSoup
# --- Script setup: ask for a video id, fetch its page and read the title ---

# Surrounding whitespace is never part of an identifier and would only
# produce an invalid URL, so drop it right away.
vidid = input("Enter a Tudou identifier: ").strip()
url = 'http://video.tudou.com/v/' + vidid + '.html'

# Suffix of the metadata file written by Tudou(), and the encoder used for it.
fnameappend = '.info.json'
jsonenc = json.JSONEncoder()

# BeautifulSoup consumes the response while constructing the tree, so the
# connection can be released as soon as parsing is finished.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

# The title is taken from the "title" attribute of <span id="subtitle">.
# Stop with a readable message instead of a TypeError when it is absent
# (wrong identifier, removed video, or a changed page layout).
title_tag = soup.find('span', attrs={'id': 'subtitle'})
if title_tag is None or not title_tag.get('title'):
    sys.exit('Could not find the video title on ' + url)
title = title_tag['title']
def _safe_filename(name):
    """Return *name* with characters unsafe in file names replaced by '-'."""
    return name.translate(str.maketrans('*/\\<>:"|', '--------')).strip()


def Tudou():
    """Scrape the video's metadata, save its thumbnail and write an info file.

    Works on the module-level ``soup`` (parsed video page), ``vidid``,
    ``url`` and ``title``.  The function:

    1. collects description, uploader name, channel link and upload date
       from the page and prints them;
    2. downloads the thumbnail named by ``<meta name="thumb">`` and renames
       it to ``<title>.jpg``;
    3. writes ``<title>-<vidid>.info.json`` containing ``{vidid: metadata}``.

    Both output file names use the title with ``* / \\ < > : " |`` replaced
    by ``-`` so a title containing such characters cannot break the rename
    or the file creation.

    Returns:
        None.

    Raises:
        AttributeError, TypeError: if one of the expected page elements is
            missing (``soup.find`` then yields ``None``).
        OSError: if the thumbnail cannot be renamed or the info file cannot
            be written.
    """
    info = {
        'vidid': vidid,
        'origurl': url,
        'title': title,
        'description': None,
        'uploader': None,
        'channel': None,
        'uploaded': None,
    }

    info['description'] = soup.find(
        'div', attrs={'class': 'td-play__videoinfo__details-box__desc'}
    ).text.strip()

    uploader = soup.find('a', attrs={'class': 'td-play__userinfo__name'})
    info['uploader'] = uploader.text.strip()
    # Resolve the link against the page URL: a protocol-relative href
    # ("//host/path") still becomes "http://host/path", and an href that is
    # already absolute is no longer mangled by a blind 'http:' prefix.
    info['channel'] = urllib.parse.urljoin(url, uploader['href'])

    info['uploaded'] = soup.find(
        'meta', attrs={'name': 'publishedtime'}
    )['content']

    print('Title: ' + info['title'])
    print('Uploader: ' + info['uploader'])
    print('Channel link: ' + info['channel'])
    print('Upload date: ' + info['uploaded'])
    print('Description: ' + info['description'])
    print('Original url: ' + info['origurl'])

    safe_title = _safe_filename(title)

    print('Downloading thumbnail...')
    thumbloc = soup.find('meta', attrs={'name': 'thumb'})['content']
    # wget.download's return value is used as the path of the saved file.
    thumbdown = wget.download(thumbloc)
    os.rename(thumbdown, safe_title + '.jpg')

    filename = safe_title + "-" + vidid + fnameappend
    print(filename)
    print(info)
    # The encoder escapes non-ASCII characters by default, so the explicit
    # encoding does not change the bytes written; it only makes the result
    # independent of the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(jsonenc.encode({info['vidid']: info}))
# --- Script driver: save metadata and thumbnail, then download the video ---
Tudou()

print('\n' + 'Downloading video...')
# Hand you-get an argument list instead of a shell command string, so that
# quotes, spaces or shell metacharacters in the scraped title or the typed
# identifier can neither break the command line nor inject extra commands.
# The output name is "<title>-<vidid>", the same value the shell form
# produced for well-behaved titles.
try:
    result = subprocess.run(['you-get', url, '-O', title + '-' + vidid])
except FileNotFoundError:
    sys.exit('you-get was not found on PATH; the video was not downloaded.')

# Only report success when the downloader actually succeeded.
if result.returncode != 0:
    sys.exit(
        'you-get exited with status ' + str(result.returncode)
        + '; the video was not downloaded.'
    )
print('Done!')