In [1]:
from glob import glob

In [2]:
#A list of paths, one per lab10.ipynb file found anywhere under notebooks/.
#glob() returns its matches in arbitrary order, so the result is sorted to keep
#the positional student indices used later in this notebook stable across runs.
lab10_folders = sorted(glob('notebooks/**/lab10.ipynb', recursive=True))

In [3]:
#Number of lab10.ipynb paths collected in lab10_folders
len(lab10_folders)

160

In [4]:
import json

In [5]:
#Create a list of dictionaries. Each dictionary is the parsed JSON of one
#notebook file (e.g. its 'cells' list); lab10s[i] comes from lab10_folders[i].
lab10s = []
for file in lab10_folders:
    # Be explicit about the encoding so that parsing does not depend on the
    # platform's default text encoding.
    with open(file, 'r', encoding='utf-8') as f:
        lab10s.append(json.load(f))


In [6]:
#Number of notebooks loaded into lab10s
len(lab10s)

160

In [7]:
#Example: the list of cell dictionaries of the 4th notebook (index 3)
lab10s[3]['cells']

[{'cell_type': 'markdown',
  'metadata': {},
  'source': ['## Conditional Probability\n',
   'This lab is an introduction to visualizing conditional probabilities.  We will cover *icon arrays*.  These do not appear in the textbook and will not appear on any exam, but they will help you gain intuition about conditional probability.\n',
   '\n',
   '#### Administrative details\n',
   "This lab will not be collected.  Conditional probability will appear on the final exam, and this is an opportunity to understand it better.  **We recommend going through at least part 2.**  You can complete the rest later as an exercise when you're studying."]},
 {'cell_type': 'code',
  'execution_count': 1,
  'metadata': {'collapsed': False,
     'response': {'content': {'data': {},
       'name': 'stdout',
      'metadata': {},
      'msg_type': 'stream',
      'version': '5.0'},
     'timestamp': '2016-12-01T21:33:31.692Z'}]},
  'outputs': [{'name': 'stdout',
    'output_type': 'stream',
     'Assignment

In [8]:
#Number of cells in the 4th notebook (index 3)
len(lab10s[3]['cells'])

63

In [9]:
def getCodeInCell(cell):
    """
    Collect the code associated with one notebook cell.

    CELL - The dictionary for one cell (an element of a notebook's
            'cells' list).
    return - A list of strings.
            If CELL is not collapsed and its metadata holds a non-empty
            'history' list, the result contains the 'code' string of
            every history entry, in order (presumably one entry per time
            CELL was run). A multi-line snapshot is a single string with
            embedded newlines.

            Otherwise the result contains the elements of CELL's
            'source', i.e. one string per line of the cell's current
            code.
    """
    metadata = cell.get('metadata', {})
    history = metadata.get('history')

    # 'collapsed' is not guaranteed to be present in a cell's metadata; a
    # missing key is treated like False instead of raising KeyError.
    if not metadata.get('collapsed', False) and history:
        return [past_code['code'] for past_code in history]

    # No usable history: fall back to the code currently stored in CELL.
    source = cell.get('source', [])
    if isinstance(source, str):
        # A source stored as one string is split into lines (line endings
        # kept) so it has the same shape as the list-of-lines form rather
        # than being expanded character by character.
        source = source.splitlines(keepends=True)
    return list(source)
        

In [10]:
#Example cell dictionary of a code cell that was run more than once with output
#(its metadata holds a 'history' list).
lab10s[3]['cells'][36]

{'cell_type': 'code',
 'execution_count': 27,
 'metadata': {'collapsed': False,
  'history': [{'code': 'cancer = people.pivot("cancer status", "test status")\ncancer',
    'response': {'content': {'data': {'text/plain': 'test status | healthy | sick\nnegative    | 1       | 1\npositive    | 1       | 1'},
      'execution_count': 23,
      'metadata': {}},
     'metadata': {},
     'msg_type': 'execute_result',
     'version': '5.0'},
    'timestamp': '2016-12-01T21:51:32.777Z'},
   {'code': 'cancer = people.pivot("test status", "cancer status")\ncancer',
    'response': {'content': {'data': {'text/plain': 'cancer status | negative | positive\nhealthy       | 1        | 1\nsick          | 1        | 1'},
      'execution_count': 24,
      'metadata': {}},
     'metadata': {},
     'msg_type': 'execute_result',
     'version': '5.0'},
    'timestamp': '2016-12-01T21:51:50.813Z'},
   {'code': 'cancer = people.pivot("test status", "cancer status", "count")\ncancer',
    'response': {'cont

In [11]:
#Output from getCodeInCell for the example cell shown above.
getCodeInCell(lab10s[3]['cells'][36])

['cancer = people.pivot("cancer status", "test status")\ncancer',
 'cancer = people.pivot("test status", "cancer status")\ncancer',
 'cancer = people.pivot("test status", "cancer status", "count")\ncancer',
 'cancer = people.pivot("test status", "cancer status", "count", sum)\ncancer']

In [12]:
#Example of a markdown cell (here, one that states a question).
lab10s[0]['cells'][4]

{'cell_type': 'markdown',
 'metadata': {},
 'source': ["**Question 2.1.** Knowing only what we've told you so far, what's the probability that you're a large green marble?"]}

In [13]:
def isCodeCell(cell):
    """Return True when CELL's 'cell_type' is 'code', False otherwise."""
    kind = cell['cell_type']
    return kind == 'code'

def isMarkDownCell(cell):
    """Return True when CELL's 'cell_type' is 'markdown', False otherwise."""
    kind = cell['cell_type']
    return kind == 'markdown'

In [14]:
import re


In [15]:
#The regex pattern that matches a line that denotes the start of a
#question, e.g. "**Question 2.1.** ...". Group 1 captures the question
#number between "Question " and the closing "**" (a run of digits, '.'
#and '+' characters, e.g. "2.1.").
#Written as a raw string: "\*" and "\d" are not valid string escapes, and
#non-raw literals containing them raise warnings on recent Python versions.
Q_PATTERN = re.compile(r"\*\*Question ([\d+.]+)\*\*.*")

In [16]:
def extract_code_from_student(student_num, trailing_cells=3):
    """
    Group the code of one student's notebook by the question it follows.

    NOTE: Assumes that all code that pertains to a particular question
    is contained in the code cells that immediately follow the markdown
    cell that states the question.

    STUDENT_NUM - Index of the student's notebook in lab10s.
    TRAILING_CELLS - How many cells at the end of the notebook to ignore.
            The default of 3 matches what the original slice
            (cells[:last_ind - 2], with last_ind = len(cells) - 1)
            skipped.
            NOTE(review): the original comment said only the last two
            cells (there to help run all AG tests) should be skipped --
            confirm which count is intended.
    return - A dictionary that maps each question number captured by
            Q_PATTERN (e.g. '2.1.') to a list holding the getCodeInCell
            result of every code cell that directly follows the
            question's markdown cell. A question that is stated more
            than once keeps only the cells after its last occurrence.
    """
    cells = lab10s[student_num]['cells']

    # Clamp at 0: with fewer cells than TRAILING_CELLS a negative stop index
    # would wrap around and wrongly include cells from the front of the list.
    stop = max(len(cells) - trailing_cells, 0)

    qs = {}

    # CURRENT is None when not recording any question. Otherwise it is the
    # string that denotes the question number being recorded (i.e. 2.1.1.).
    current = None

    for cell in cells[:stop]:

        # While recording, every code cell belongs to CURRENT; the first
        # non-code cell ends the run.
        if current is not None:
            if isCodeCell(cell):
                qs[current].append(getCodeInCell(cell))
            else:
                current = None

        # A markdown cell may start a new question: check each line of its
        # source and start recording at the first line that matches.
        if isMarkDownCell(cell):
            for line in cell['source']:
                found = Q_PATTERN.match(line)
                if found:
                    current = found.group(1)
                    qs[current] = []
                    break

    return qs

In [17]:
#Example output from extract_code_from_student for the first notebook (index 0)
extract_code_from_student(0)

{'2.1.': [['probability_large_green = 4/13'], ['_ = tests.grade("q21")']],
 '2.1.1.': [['probability_green_given_large = 4/5'],
  ['_ = tests.grade("q211")']],
 '2.1.2.': [["# Make an icon array to help you compute the answer.\ndisplayed_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = ...",
   "# Make an icon array to help you compute the answer.\ndisplay_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = ...",
   "# Make an icon array to help you compute the answer.\ndisplay_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = 4/10"],
  ['_ = tests.grade("q212")']],
 '2.1.3.': [['# Just run this cell.  The next cell is where you should write your answer.\ndisplay_grouped_icon_array(marbles.groups(make_array

In [20]:
#Run extract_code_from_student for every loaded notebook and put results in a dictionary
#where the keys are the integer student indices 0 .. len(lab10s) - 1 and the values
#are the output from extract_code_from_student.
#The range is taken from lab10s because that is the list extract_code_from_student indexes.
lab10s_full_data = {i: extract_code_from_student(i) for i in range(len(lab10s))}

In [21]:
#Display the full mapping of student index -> extracted code per question.
#NOTE(review): this prints one entry per notebook, so the output is very large.
lab10s_full_data

{0: {'2.1.': [['probability_large_green = 4/13'], ['_ = tests.grade("q21")']],
  '2.1.1.': [['probability_green_given_large = 4/5'],
   ['_ = tests.grade("q211")']],
  '2.1.2.': [["# Make an icon array to help you compute the answer.\ndisplayed_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = ...",
    "# Make an icon array to help you compute the answer.\ndisplay_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = ...",
    "# Make an icon array to help you compute the answer.\ndisplay_grouped_icon_array(marbles.where('color', 'green').group('size'), 'green marbles')\n\n# Now compute the answer.\nprobability_large_given_green = 4/10"],
   ['_ = tests.grade("q212")']],
  '2.1.3.': [['# Just run this cell.  The next cell is where you should write your answer.\ndisplay_grouped_icon_array(marbles.groups