# Eric's approach for quizzes

In [148]:
import requests
import os.path
from lxml import html
import time
from collections import defaultdict

In [4]:
def download_quizzes(topic, output_dir):
    """Download every quiz answer key found under ``topic`` into ``output_dir``.

    The output directory is created when missing. Each concept returned by
    ``get_concepts(topic)`` is expanded with ``get_quizzes`` and every
    resulting quiz is passed to ``download_quiz``.
    """
    os.makedirs(output_dir, exist_ok=True)
    for concept_name in get_concepts(topic):
        for quiz_name in get_quizzes(concept_name):
            download_quiz(output_dir, quiz_name)


def get_concepts(topic):
    """Return the concept slugs listed on the ck12.org page for ``topic``.

    Fetches ``http://www.ck12.org/<topic>/`` (raising on an HTTP error
    status) and, for every ``<a>`` directly inside an
    ``<li class="concepts">``, keeps the second-to-last ``/``-separated
    piece of its ``href``.
    """
    page = requests.get('http://www.ck12.org/%s/' % topic)
    page.raise_for_status()
    document = html.fromstring(page.content)
    links = document.xpath('//li[@class="concepts"]/a')
    return [link.attrib['href'].split('/')[-2] for link in links]


def get_quizzes(concept):
    """Return the quiz slugs that the modalities API lists for ``concept``.

    Queries the ``api-prod.ck12.org`` "minimal modalities" endpoint (raising
    on an HTTP error status) and keeps, for every modality whose
    ``artifactType`` is ``'quiz'``, the last ``/``-separated piece of its
    ``perma`` value.
    """
    endpoint = 'http://api-prod.ck12.org/flx/get/minimal/modalities/at%20grade/'
    query = '?pageSize=13&pageNum=0&ownedBy=ck12&modalities=concept%2Clessonplan%2Clessonplanans%2Csimulationint%2Clessonplanx%2Crubric%2Cactivityans%2Clesson%2Cpostreadans%2Cprepostread%2Cweb%2Ccthink%2Crwaans%2Csection%2Cplix%2Cwhileread%2Cquiz%2Clessonplanxans%2Cpreread%2Cattachment%2Clecture%2Cpresentation%2Cimage%2Cquizdemo%2Crwa%2Cwhilereadans%2Cprereadans%2Cpostread%2Cexerciseint%2Clab%2Cflashcard%2Cstudyguide%2Cquizans%2Casmtpractice%2Cprepostreadans%2Clabans%2Casmtquiz%2Cworksheet%2Chandout%2Csimulation%2Cexercise%2Cactivity%2Cworksheetans%2Caudio%2Cconceptmap%2Cenrichment%2Cinteractive&level=at%2Bgrade&expirationAge=daily'
    reply = requests.get(endpoint + concept + query)
    reply.raise_for_status()
    modalities = reply.json()['response']['domain']['modalities']
    return [
        entry['perma'].split('/')[-1]
        for entry in modalities
        if entry['artifactType'] == 'quiz'
    ]


def download_quiz(output_dir, quiz):
    """Download the answer-key document for ``quiz`` into ``output_dir``.

    The file is saved as ``<quiz>-Answer-Key.docx``. Nothing happens when
    that file already exists or when the server answers with a status code
    other than 200.

    Args:
        output_dir: directory the document is written to.
        quiz: quiz slug used both in the request URL and in the file name.

    Raises:
        ValueError: if a 200 response does not carry the Word (.docx) media
            type, so that e.g. an HTML page is never stored under a
            ``.docx`` name. (This replaces a bare ``assert``, which is
            skipped when Python runs with ``-O``.)
    """
    docx_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    filename = os.path.join(output_dir, '{0}-Answer-Key.docx'.format(quiz))
    if os.path.isfile(filename):
        return

    res = requests.get('http://www.ck12.org/flx/show/answer%20key/' + quiz + '-Answer-Key')
    # Any non-200 answer is treated as "no answer key available"; there is
    # no need for raise_for_status() after this check.
    if res.status_code != 200:
        return

    # Compare the media type only: a missing header must not raise KeyError,
    # and parameters such as "; charset=..." or letter case must not matter.
    media_type = res.headers.get('Content-Type', '').split(';')[0].strip().lower()
    if media_type != docx_type:
        raise ValueError(
            'unexpected Content-Type %r for quiz %r' % (res.headers.get('Content-Type'), quiz)
        )
    with open(filename, 'wb') as f:
        f.write(res.content)

In [5]:
def download_ck12_quizzes(ctx, topic, directory):
    """Download quizzes from http://www.ck12.org/

    Args:
        ctx: not used by this function; kept so the signature stays the same.
        topic: subject slug forwarded to ``download_quizzes``.
        directory: output directory forwarded to ``download_quizzes``.
    """
    # No `logger` is defined in the visible cells, so the final log call
    # raised NameError after all downloads had finished. Obtain a logger
    # locally instead.
    import logging

    download_quizzes(topic, directory)
    logging.getLogger(__name__).info('Downloading quizzes completed')

In [6]:
# Subject slugs; each one is a valid argument for get_concepts().
topics = [
    'earth-science',
    'life-science',
    'physical-science',
]

In [8]:
# Concept slugs for the first subject in `topics`.
es_topics = get_concepts(topics[0])

In [9]:
es_topics[0]

'Scientific-Explanations-and-Interpretations'

In [10]:
get_quizzes(es_topics[0])

['Scientific-Explanations-and-Interpretations-Quiz']

In [18]:
def get_reading(concept):
    """Return the lesson slugs that the modalities API lists for ``concept``.

    Queries the ``api-prod.ck12.org`` "minimal modalities" endpoint (raising
    on an HTTP error status) and keeps, for every modality whose
    ``artifactType`` is ``'lesson'``, the last ``/``-separated piece of its
    ``perma`` value.
    """
    endpoint = 'http://api-prod.ck12.org/flx/get/minimal/modalities/at%20grade/'
    query = '?pageSize=13&pageNum=0&ownedBy=ck12&modalities=concept%2Clessonplan%2Clessonplanans%2Csimulationint%2Clessonplanx%2Crubric%2Cactivityans%2Clesson%2Cpostreadans%2Cprepostread%2Cweb%2Ccthink%2Crwaans%2Csection%2Cplix%2Cwhileread%2Cquiz%2Clessonplanxans%2Cpreread%2Cattachment%2Clecture%2Cpresentation%2Cimage%2Cquizdemo%2Crwa%2Cwhilereadans%2Cprereadans%2Cpostread%2Cexerciseint%2Clab%2Cflashcard%2Cstudyguide%2Cquizans%2Casmtpractice%2Cprepostreadans%2Clabans%2Casmtquiz%2Cworksheet%2Chandout%2Csimulation%2Cexercise%2Cactivity%2Cworksheetans%2Caudio%2Cconceptmap%2Cenrichment%2Cinteractive&level=at%2Bgrade&expirationAge=daily'
    reply = requests.get(endpoint + concept + query)
    reply.raise_for_status()
    modalities = reply.json()['response']['domain']['modalities']
    return [
        entry['perma'].split('/')[-1]
        for entry in modalities
        if entry['artifactType'] == 'lesson'
    ]

In [20]:
# Lesson slugs for the first concept in `es_topics`.
es_readings = get_reading(es_topics[0])

In [22]:
def download_reading(output_dir, quiz):
    """Fetch a lesson page and return the HTTP response.

    Args:
        output_dir: currently unused, because nothing is written to disk.
            Kept so the signature stays the same.
        quiz: lesson slug appended to the lesson URL (the parameter name is
            kept for compatibility).

    Returns:
        The ``requests`` response when the status code is 200, otherwise
        ``None``.
    """
    # The former "already downloaded" guard tested `filename`, whose
    # assignment had been commented out, so the call raised NameError.
    # Nothing is saved here, so the guard is dropped.
    # NOTE(review): the subject and concept are hard-coded in the URL, so
    # only lessons of this one concept can be fetched.
    res = requests.get('http://www.ck12.org/earth-science/Scientific-Explanations-and-Interpretations/lesson/' + quiz)
    if res.status_code != 200:
        return None
    return res

In [24]:
# Fetch the first reading; `tr1` is either a response object or None.
tr1 = download_reading('test_out', es_readings[0])

In [35]:
from IPython.core.display import HTML

data-artifactID="7845"

data-artifactrevisionid="4479090"

In [71]:
# tr1.text

In [49]:
# HTML(tr1.text)

http://www.ck12.org/earth-science/Scientific-Explanations-and-Interpretations/lesson/Scientific-Explanations-and-Interpretations-HS-ES/

https://dr282zn36sxxg.cloudfront.net/datastreams/f-d%3A6c10752ab34313f4f7108cf6cc8a016dc3e0adb7083d0fec7d74efdd%2BPDF%2BPDF.1

Scientific-Explanations-and-Interpretations-HS-ES_l_v50_qri_s1.pdf

http://www.ck12.org/flx/show/answer%20key/Scientific-Explanations-and-Interpretations-Quiz-Answer-Key

http://www.ck12.org/flx/show/answer%20key/Scientific-Explanations-and-Interpretations-Quiz-MS-ES-Answer-Key

http://www.ck12.org/flx/show/pdf/Bathymetric-Evidence-for-Seafloor-Spreading-HS-ES_l_v48_fyo_s1.pdf

In [None]:
http://www.ck12.org/flx/show/pdf/Scientific-Explanations-and-Interpretations-HS-ES_l_v50_qri_s1.pdf

# Using BS4

In [None]:
import requests
from bs4 import BeautifulSoup

In [77]:
def get_concepts(topic):
    """Return the concept slugs listed on the ck12.org page for ``topic``.

    Fetches ``http://www.ck12.org/<topic>/`` (raising on an HTTP error
    status) and, for every ``<a>`` directly inside an
    ``<li class="concepts">``, keeps the second-to-last ``/``-separated
    piece of its ``href``.
    """
    response = requests.get('http://www.ck12.org/%s/' % topic)
    response.raise_for_status()
    anchors = html.fromstring(response.content).xpath('//li[@class="concepts"]/a')
    return [anchor.attrib['href'].split('/')[-2] for anchor in anchors]

In [150]:
def get_readings(concept):
    """Return the lesson slugs that the modalities API lists for ``concept``.

    Queries the ``api-prod.ck12.org`` "minimal modalities" endpoint (raising
    on an HTTP error status) and keeps, for every modality whose
    ``artifactType`` is ``'lesson'``, the last ``/``-separated piece of its
    ``perma`` value.
    """
    endpoint = 'http://api-prod.ck12.org/flx/get/minimal/modalities/at%20grade/'
    query = '?pageSize=13&pageNum=0&ownedBy=ck12&modalities=concept%2Clessonplan%2Clessonplanans%2Csimulationint%2Clessonplanx%2Crubric%2Cactivityans%2Clesson%2Cpostreadans%2Cprepostread%2Cweb%2Ccthink%2Crwaans%2Csection%2Cplix%2Cwhileread%2Cquiz%2Clessonplanxans%2Cpreread%2Cattachment%2Clecture%2Cpresentation%2Cimage%2Cquizdemo%2Crwa%2Cwhilereadans%2Cprereadans%2Cpostread%2Cexerciseint%2Clab%2Cflashcard%2Cstudyguide%2Cquizans%2Casmtpractice%2Cprepostreadans%2Clabans%2Casmtquiz%2Cworksheet%2Chandout%2Csimulation%2Cexercise%2Cactivity%2Cworksheetans%2Caudio%2Cconceptmap%2Cenrichment%2Cinteractive&level=at%2Bgrade&expirationAge=daily'
    api_reply = requests.get(endpoint + concept + query)
    api_reply.raise_for_status()
    modalities = api_reply.json()['response']['domain']['modalities']
    return [
        item['perma'].split('/')[-1]
        for item in modalities
        if item['artifactType'] == 'lesson'
    ]

In [79]:
# Subject slugs; each one is a valid argument for get_concepts().
topics = [
    'earth-science',
    'life-science',
    'physical-science',
]

In [80]:
# Concept slugs for the first subject in `topics`.
es_topics = get_concepts(topics[0])

In [151]:
# Collect the lesson slugs of five concepts (indices 5 to 9 of `es_topics`),
# keyed by concept slug.
topic_readings = defaultdict(list)
for topic in es_topics[5:10]:
    topic_readings[topic] += get_readings(topic)

In [153]:
topic_readings

defaultdict(list,
            {'Development-of-Theories': ['Development-of-Theories-HS-ES',
              'user:chubbard',
              'user:amvmzmvyev9mzwf0agvyc0boym9llm9yzw..'],
             'Evolution-Plate-Tectonics-and-Climate-Change': ['Evolution-Plate-Tectonics-and-Climate-Change-HS-ES',
              'user:chubbard'],
             'Observations-and-Experiments': ['Observations-and-Experiments-HS-ES',
              'user:chubbard',
              'user:amvmzmvyev9mzwf0agvyc0boym9llm9yzw..'],
             'Scientific-Community': ['Scientific-Community-HS-ES',
              'user:chubbard',
              'user:cgftzwxhlnjlbmvllmfzaebnbwfpbc5jb20.'],
             'Scientific-Models': ['Scientific-Models-HS-ES',
              'user:ekb5ywhvby5jb20.',
              'user:tlake25']})

In [147]:
def make_pdf_download_requests(topic_readings, cookies=None):
    """Query the PDF render status of every lesson in ``topic_readings``.

    For each lesson the lesson page is fetched, the first
    ``<a class="js_signinrequired pdf">`` element is located, and its
    ``data-artifactid`` / ``data-artifactrevisionid`` attributes are used to
    request ``/render/pdf/status/<artifact id>/<revision id>`` with the
    session cookies.

    Args:
        topic_readings: mapping of concept slug -> iterable of lesson slugs.
        cookies: optional cookie dict sent with the render-status request.
            When omitted, the session cookies embedded below are used.

    Returns:
        dict mapping lesson slug -> decoded JSON body of the render-status
        response. Lessons whose page contains no PDF link are left out.
    """
    if cookies is None:
        # NOTE(review): these look like session credentials of a personal
        # account committed to the source. Prefer passing `cookies`
        # explicitly and loading the values from outside the repository.
        cookies = dict(
            dxtr='9e95ada69c9997e9ecdddfdcdff1e3eee2f3e9ef',
            auth='2f04b86dd1d095a9fe9ed75bc15fcee7fd857ace00764530d5424291834ed5af925c58a9',
            walkthrough='walkthrough',
            mceVersion='4',
            mceSeenNewEditor='true',
            ck12olympics='visited',
            flxweb='886838535f5f9dd052853d36d7bdfad9eeeb83d0c8e99c1e4f684ec6b95d86869ea18847',
            cdnAPIver='20160809',
            assessment='403a5b7cae2f9cb667fc62cb75dd6c1108d1da60ae2f3ec365e64b04b48f74207b35a016',
            flxDashboardState='selfStudy',
            flxweb_role='teacher',
            browseview='listview',
            dexterjsVisitorID='j5OhVsPELviphLX6FnDk9RInN',
        )

    render_responses = {}
    lesson_base_url = 'http://www.ck12.org/earth-science/{}/lesson/{}'
    render_req_base_url = 'http://www.ck12.org/render/pdf/status/{}/{}'

    for topic, lessons in topic_readings.items():
        for lesson in lessons:
            lesson_url = lesson_base_url.format(topic, lesson)
            lesson_r = requests.get(lesson_url)
            soup = BeautifulSoup(lesson_r.content, 'html.parser')
            pdf_links = soup.find_all("a", {"class": "js_signinrequired pdf"})
            if not pdf_links:
                # A page without a PDF link used to raise IndexError and
                # abort the whole batch; skip just that lesson instead.
                continue
            link_attr = pdf_links[0].attrs
            da_id = link_attr['data-artifactid']
            dar_id = link_attr['data-artifactrevisionid']
            render_req_url = render_req_base_url.format(da_id, dar_id)
            render_responses[lesson] = requests.get(render_req_url, cookies=cookies).json()
    return render_responses

In [145]:
# Map each key of `render_responses` to the 'result' field of its response.
# NOTE(review): `render_responses` is not assigned at module level in the
# visible cells; presumably it is the dict returned by
# make_pdf_download_requests() -- confirm.
{topic: resp['result'] for topic, resp in render_responses.items()}

{'Development-of-Theories-HS-ES': 'http://www.ck12.org/flx/show/pdf/Development-of-Theories-HS-ES_l_v54_acs_s1.pdf',
 'Evolution-Plate-Tectonics-and-Climate-Change-HS-ES': 'http://www.ck12.org/flx/show/pdf/Evolution-Plate-Tectonics-and-Climate-Change-HS-ES_l_v51_pjg_s1.pdf',
 'Observations-and-Experiments-HS-ES': 'http://www.ck12.org/flx/show/pdf/Observations-and-Experiments-HS-ES_l_v53_zcs_s1.pdf'}

In [None]:
# Fetch one lesson page and parse it with BeautifulSoup.
# (A stray trailing "_" after the BeautifulSoup(...) call made this cell a
# syntax error; it has been removed.)
url = 'http://www.ck12.org/earth-science/Scientific-Explanations-and-Interpretations/lesson/' + 'Scientific-Explanations-and-Interpretations-HS-ES'
page_doc = requests.get(url)
soup = BeautifulSoup(page_doc.content, 'html.parser')

In [23]:
# First <a> element matching the "js_signinrequired pdf" class filter;
# the [0] raises IndexError when the page has no such element.
results = soup.find_all("a", {"class":"js_signinrequired pdf"})[0]

In [28]:
results.attrs

{'class': ['js_signinrequired', 'pdf'],
 'data-artifactid': '7845',
 'data-artifactrevisionid': '4479090',
 'data-rendertemplatetype': 'onecolumn',
 'data-rendertype': 'pdf',
 'href': '#'}

In [31]:
# Artifact id and artifact revision id, read from the link's data-* attributes.
da_id = results.attrs['data-artifactid']
dar_id = results.attrs['data-artifactrevisionid']

In [33]:
# Render-status URL: <base>/<artifact id>/<artifact revision id>.
render_req_url = ''.join(['http://www.ck12.org/render/pdf/status/', da_id, '/', dar_id])
render_req_url

'http://www.ck12.org/render/pdf/status/7845/4479090'

In [62]:
# Query the render-status URL, sending the session cookies.
# NOTE(review): `acc_cookie` is assigned in a later cell of this file; that
# cell has to be run before this one.
render_resp = requests.get(render_req_url, cookies= acc_cookie)

In [None]:
def download_lesson_pdf(output_dir, pdf_uri):
    """Download the PDF at ``pdf_uri`` into ``output_dir``.

    The file name is the last path component of ``pdf_uri``. Nothing happens
    when that file already exists in ``output_dir`` or when the server
    answers with a status code other than 200.

    Args:
        output_dir: directory the PDF is written to (created when missing).
        pdf_uri: URL of the PDF to download.

    Raises:
        ValueError: if no file name can be derived from ``pdf_uri``, or if a
            200 response is not labelled as a PDF.
    """
    from urllib.parse import urlsplit

    # The previous version built the name from an undefined `quiz` variable
    # (NameError) and ignored `output_dir`; derive it from the URL instead.
    pdf_name = os.path.basename(urlsplit(pdf_uri).path)
    if not pdf_name:
        raise ValueError('cannot derive a file name from %r' % (pdf_uri,))
    filename = os.path.join(output_dir, pdf_name)
    if os.path.isfile(filename):
        return

    res = requests.get(pdf_uri)
    # Any non-200 answer is treated as "not available"; there is no need
    # for raise_for_status() after this check.
    if res.status_code != 200:
        return

    # The old check expected the .docx media type (copied from the quiz
    # downloader), which a PDF response cannot satisfy.
    # NOTE(review): assumes the server labels PDFs as application/pdf -- confirm.
    media_type = res.headers.get('Content-Type', '').split(';')[0].strip().lower()
    if media_type != 'application/pdf':
        raise ValueError(
            'unexpected Content-Type %r for %r' % (res.headers.get('Content-Type'), pdf_uri)
        )
    os.makedirs(output_dir, exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(res.content)

In [66]:
render_resp.json()

{'hostname': 'cevag2.px12.bet',
 'id': 15537603,
 'name': 'pdf',
 'owner': {'authID': 1442441,
  'defaultLogin': 'zhvzdgluc0bhbgxlbmfplm9yzw..',
  'email': 'dustins@allenai.org',
  'id': 1442441,
  'login': 'zhvzdgluc0bhbgxlbmfplm9yzw..',
  'name': 'Dustin Schwenk',
  'timezone': 'US/Pacific'},
 'result': 'http://www.ck12.org/flx/show/pdf/Scientific-Explanations-and-Interpretations-HS-ES_l_v50_qri_s1.pdf',
 'started': '2016-08-24 10:36:45',
 'status': 'SUCCESS',
 'taskID': '4653b34c-b25c-40a1-9b51-dd3254bf37b6',
 'updated': '2016-08-24 10:37:27',
 'userdata': {'artifactUrl': '',
  'downloadUri': 'http://www.ck12.org/flx/show/pdf/Scientific-Explanations-and-Interpretations-HS-ES_l_v50_qri_s1.pdf',
  'template': 'onecolumn'}}

In [67]:
# Cookies sent with the render-status request.
# NOTE(review): these look like session credentials of a personal account;
# consider loading them from outside the source file -- confirm with the owner.
acc_cookie = {
    'dxtr': '9e95ada69c9997e9ecdddfdcdff1e3eee2f3e9ef',
    'auth': '2f04b86dd1d095a9fe9ed75bc15fcee7fd857ace00764530d5424291834ed5af925c58a9',
    'walkthrough': 'walkthrough',
    'mceVersion': '4',
    'mceSeenNewEditor': 'true',
    'ck12olympics': 'visited',
    'flxweb': '886838535f5f9dd052853d36d7bdfad9eeeb83d0c8e99c1e4f684ec6b95d86869ea18847',
    'cdnAPIver': '20160809',
    'assessment': '403a5b7cae2f9cb667fc62cb75dd6c1108d1da60ae2f3ec365e64b04b48f74207b35a016',
    'flxDashboardState': 'selfStudy',
    'flxweb_role': 'teacher',
    'browseview': 'listview',
    'dexterjsVisitorID': 'j5OhVsPELviphLX6FnDk9RInN',
}