/
slidesaver.py
83 lines (78 loc) · 3.41 KB
/
slidesaver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# pip3 install pyquery requests reportlab
import json
import pyquery
import re
import requests
import reportlab.lib.utils
import reportlab.pdfgen.canvas
import sys
def _safe_title(raw_title, unsafe_pattern):
    """Return raw_title with every match of unsafe_pattern replaced by a space.

    A missing (None) or blank title falls back to 'slides' so the output
    file is never named just '.pdf'.
    """
    cleaned = re.sub(unsafe_pattern, ' ', raw_title or '')
    return cleaned if cleaned.strip() else 'slides'


def _write_pdf(image_urls, pdf_path):
    """Write a PDF with one page per image URL, each page sized to its image.

    Every URL is printed after its page has been added.
    """
    pdf = reportlab.pdfgen.canvas.Canvas(pdf_path)
    for image_url in image_urls:
        page = reportlab.lib.utils.ImageReader(image_url)
        width, height = page.getSize()
        pdf.setPageSize((width, height))
        pdf.drawImage(page, 0, 0, width, height)
        pdf.showPage()
        print(image_url)
    pdf.save()


def _save_slideshare_video(html):
    """Download the SD video referenced by a SlideShare page to output.mp4."""
    # Adjacent raw literals are used instead of backslash-newline: inside a
    # raw string a backslash followed by a newline stays in the pattern, so
    # the regex would demand a literal line break that the page's JSON does
    # not contain.
    match = re.search(
        r'"ppt_location":"(?P<filename>.+?)".+?'
        r'"video_bucket":"(?P<domain>.+?)".+?'
        r'"video_extension":"(?P<extension>.+?)"',
        html)
    if match is None:
        return
    # this combination rule comes from
    # http://public.slidesharecdn.com/b/slideview/scripts/combined_video_init.js
    # line 232: var
    # d=this.config.videoBucket+"/"+this.config.pptLocation+"-SD."+this.config.videoExtension
    # NOTE(review): the bucket value presumably starts with '//' -- confirm.
    video_url = 'http:%s/%s-SD.%s' % (
        match.group('domain'),
        match.group('filename'),
        match.group('extension'))
    # stream=True plus an explicit chunk size: iter_content() defaults to
    # one byte per chunk, which makes large downloads needlessly slow.
    with requests.get(video_url, stream=True) as video:
        if video.status_code != 200:
            print('HTTP %d for %s' % (video.status_code, video_url),
                  file=sys.stderr)
            return
        with open('output.mp4', 'wb') as f:
            for chunk in video.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    print(video_url)


def _save_slideshare(html):
    """Save a SlideShare page as '<title>.pdf' and/or 'output.mp4'."""
    query = pyquery.PyQuery(html)
    # The hyphen sits last in the class so it is a literal '-'. Written as
    # '!-_' it formed a range that also let '/', '\\' and ':' through into
    # the file name.
    title = _safe_title(query('h1:first').text(), r'[^a-zA-Z0-9%!_*.-]')
    if '"type":"presentation"' in html:
        slide_urls = [image.attrib['data-full']
                      for image in query('img.slide_image')]
        _write_pdf(slide_urls, title + '.pdf')
    if '"type":"video"' in html:
        _save_slideshare_video(html)


def _save_speakerdeck(html):
    """Save every Speaker Deck embed found on the page as '<title>.pdf'.

    All embeds share the same output name, so a later embed overwrites the
    PDF written for an earlier one.
    """
    query = pyquery.PyQuery(html)
    headings = query('h1')
    # The title is read from the second <h1>; tolerate pages that lack it.
    raw_title = headings[1].text if len(headings) > 1 else None
    title = _safe_title(raw_title, r'[^a-zA-Z0-9]')
    for embed in query('div.speakerdeck-embed'):
        player = requests.get(
            'https://speakerdeck.com/player/%s' % embed.attrib['data-id'])
        if player.status_code != 200:
            continue
        match = re.search(r'"slides":(?P<list>\[.+?\])', player.text)
        if match is None:
            # Player page without a slide list: nothing to save.
            continue
        slides = json.loads(match.group('list'))
        _write_pdf([slide['original'] for slide in slides], title + '.pdf')


def main():
    """Fetch the URL given in sys.argv[1] and save its slides or video.

    SlideShare presentations and Speaker Deck decks are written to
    '<title>.pdf' in the current directory; SlideShare videos are written
    to 'output.mp4'. Each downloaded asset URL is printed to stdout.
    A non-200 response for the page is reported on stderr.
    """
    url = sys.argv[1]
    resp = requests.get(url)
    if resp.status_code != 200:
        print('HTTP %d for %s' % (resp.status_code, url), file=sys.stderr)
        return
    # Dispatch on the original URL; it is never rebound to an asset URL.
    if 'slideshare.net' in url:
        _save_slideshare(resp.text)
    if 'speakerdeck.com' in url:
        _save_speakerdeck(resp.text)
# Script entry point: run the downloader when a URL argument is present,
# otherwise print the usage line.
if __name__ == '__main__':
    if len(sys.argv) >= 2:
        main()
    else:
        print('python slidesaver.py [URL]')