# CPython compilation & bytecode

So, you wrote some Python code. What needs to happen before it starts running? How is it represented before and during execution? Find answers here.

Note: Everything here is specific to CPython – the reference implementation. Other Python interpreters, like PyPy, Jython or Batavia, do things very differently.

In [2]:
import os
import sys
import time
import inspect
import datetime

---
# Lexical analysis
(tokenization)

In [3]:
import tokenize

In [4]:
!cat -n module.py

     1	a = 3
     2	b = 'Hello '
     3	print(a * b)
     4	
     5	def func(a=1, *b, **c):
     6	    return 7 + 3


In [5]:
!python3 -m tokenize module.py

0,0-0,0:            ENCODING       'utf-8'        
1,0-1,1:            NAME           'a'            
1,2-1,3:            OP             '='            
1,4-1,5:            NUMBER         '3'            
1,5-1,6:            NEWLINE        '\n'           
2,0-2,1:            NAME           'b'            
2,2-2,3:            OP             '='            
2,4-2,12:           STRING         "'Hello '"     
2,12-2,13:          NEWLINE        '\n'           
3,0-3,5:            NAME           'print'        
3,5-3,6:            OP             '('            
3,6-3,7:            NAME           'a'            
3,8-3,9:            OP             '*'            
3,10-3,11:          NAME           'b'            
3,11-3,12:          OP             ')'            
3,12-3,13:          NEWLINE        '\n'           
4,0-4,1:            NL             '\n'           
5,0-5,3:            NAME           'def'          
5,4-5,8:            NAME           'func'         
5,8-5,9:    

In [6]:
 with open('module.py' ,'rb') as f:
    for token in tokenize.tokenize(f.readline):
        print(token)

TokenInfo(type=59 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')
TokenInfo(type=1 (NAME), string='a', start=(1, 0), end=(1, 1), line='a = 3\n')
TokenInfo(type=53 (OP), string='=', start=(1, 2), end=(1, 3), line='a = 3\n')
TokenInfo(type=2 (NUMBER), string='3', start=(1, 4), end=(1, 5), line='a = 3\n')
TokenInfo(type=4 (NEWLINE), string='\n', start=(1, 5), end=(1, 6), line='a = 3\n')
TokenInfo(type=1 (NAME), string='b', start=(2, 0), end=(2, 1), line="b = 'Hello '\n")
TokenInfo(type=53 (OP), string='=', start=(2, 2), end=(2, 3), line="b = 'Hello '\n")
TokenInfo(type=3 (STRING), string="'Hello '", start=(2, 4), end=(2, 12), line="b = 'Hello '\n")
TokenInfo(type=4 (NEWLINE), string='\n', start=(2, 12), end=(2, 13), line="b = 'Hello '\n")
TokenInfo(type=1 (NAME), string='print', start=(3, 0), end=(3, 5), line='print(a * b)\n')
TokenInfo(type=53 (OP), string='(', start=(3, 5), end=(3, 6), line='print(a * b)\n')
TokenInfo(type=1 (NAME), string='a', start=(3, 6), end=(3, 7), l

### Summary

When Python reads source code, it first converts it to a stream of *tokens* – word-like units of a language.

Two of Python's tokens are fairly unique among programming languages: `INDENT` and `DEDENT`.

---

# Parsing
(Abstract Syntax Tree)

In [7]:
import ast

In [8]:
!cat -n module.py

     1	a = 3
     2	b = 'Hello '
     3	print(a * b)
     4	
     5	def func(a=1, *b, **c):
     6	    return 7 + 3


In [9]:
with open('module.py' ,'rb') as f:
    tree = ast.parse(f.read())

print(tree)

<_ast.Module object at 0x7f4a05b46f28>


In [10]:
print(tree._fields)

('body',)


In [11]:
for node in tree.body:
    print(node.lineno, node)

1 <_ast.Assign object at 0x7f4a05b46e10>
2 <_ast.Assign object at 0x7f4a05b46ef0>
3 <_ast.Expr object at 0x7f4a05b46b38>
5 <_ast.FunctionDef object at 0x7f4a05b469b0>


In [12]:
def dump_ast(node, indent=''):
    """Print an indented outline of an AST node and everything below it.

    Each node is printed as its class name, followed by its fields: child
    nodes and lists are expanded recursively, any other value is shown with
    ``repr()``.

    Args:
        node: The ``ast.AST`` node to print. Plain values are accepted too,
            because some list fields hold them directly (for example the
            names of a ``global`` statement).
        indent: Prefix prepended to every printed line; it grows by four
            spaces per nesting level.
    """
    if not isinstance(node, ast.AST):
        # ast.iter_fields() only works on nodes, so list items that are plain
        # values (strings, None, ...) are printed as-is instead of recursed into.
        print('{}{!r}'.format(indent, node))
        return

    print('{}{}'.format(indent, type(node).__name__))
    for name, value in ast.iter_fields(node):
        if isinstance(value, ast.AST):
            # Single child node: label the field, then recurse one level deeper.
            print('{}  .{}:'.format(indent, name))
            dump_ast(value, indent + '    ')
        elif isinstance(value, list):
            if not value:
                print('{}  .{}: []'.format(indent, name))
            else:
                print('{}  .{}: ['.format(indent, name))
                for child in value:
                    dump_ast(child, indent + '    ')
                print('{}  ]'.format(indent))
        else:
            # Leaf value (identifier, number, None, ...).
            print('{}  .{} = {}'.format(indent, name, repr(value)))

dump_ast(tree)

Module
  .body: [
    Assign
      .targets: [
        Name
          .id = 'a'
          .ctx:
            Store
      ]
      .value:
        Num
          .n = 3
    Assign
      .targets: [
        Name
          .id = 'b'
          .ctx:
            Store
      ]
      .value:
        Str
          .s = 'Hello '
    Expr
      .value:
        Call
          .func:
            Name
              .id = 'print'
              .ctx:
                Load
          .args: [
            BinOp
              .left:
                Name
                  .id = 'a'
                  .ctx:
                    Load
              .op:
                Mult
              .right:
                Name
                  .id = 'b'
                  .ctx:
                    Load
          ]
          .keywords: []
    FunctionDef
      .name = 'func'
      .args:
        arguments
          .args: [
            arg
              .arg = 'a'
              .annotation = None
          ]
          .vararg

See also: "Full Grammar Specification" in the Python docs

### Summary

The token stream is parsed, using Python's grammar, to arrive at the Abstract Syntax Tree – a graph-like representation of the program.

It is possible to modify the AST, or generate it programmatically (even though the tools in the standard library aren't too usable). People have used this to add Lisp-like macros to Python, for example.

---
# Code Objects

In [13]:
!cat module.py

a = 3
b = 'Hello '
print(a * b)

def func(a=1, *b, **c):
    return 7 + 3


In [14]:
filename = os.path.abspath('module.py')
print(filename)

/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py


In [15]:
code = compile(tree, filename=filename, mode='exec')

print(code)

<code object <module> at 0x7f4a05b2bf60, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>


In [18]:
exec(code)

Hello Hello Hello 


In [19]:
def dump_code_attrs(code):
    """Print a code object followed by all of its non-dunder attributes.

    The first line is the object itself; each following line is an indented
    ``name: value`` pair, in the order given by ``dir()``.
    """
    print(code)
    public_names = (name for name in dir(code) if not name.startswith('__'))
    for attr_name in public_names:
        value = getattr(code, attr_name)
        print('  {}: {}'.format(attr_name, value))

dump_code_attrs(code)

<code object <module> at 0x7f4a05b2bf60, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>
  co_argcount: 0
  co_cellvars: ()
  co_code: b'd\x00\x00Z\x00\x00d\x01\x00Z\x01\x00e\x02\x00e\x00\x00e\x01\x00\x14\x83\x01\x00\x01d\x02\x00d\x03\x00d\x04\x00\x84\x01\x00Z\x03\x00d\x05\x00S'
  co_consts: (3, 'Hello ', 1, <code object func at 0x7f4a05b2bb70, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 5>, 'func', None)
  co_filename: /home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py
  co_firstlineno: 1
  co_flags: 64
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x06\x01\x06\x01\x0e\x02'
  co_name: <module>
  co_names: ('a', 'b', 'print', 'func')
  co_nlocals: 0
  co_stacksize: 3
  co_varnames: ()


In [20]:
func_code = code.co_consts[3]
dump_code_attrs(func_code)

<code object func at 0x7f4a05b2bb70, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 5>
  co_argcount: 1
  co_cellvars: ()
  co_code: b'd\x03\x00S'
  co_consts: (None, 7, 3, 10)
  co_filename: /home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py
  co_firstlineno: 5
  co_flags: 79
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x00\x01'
  co_name: func
  co_names: ()
  co_nlocals: 3
  co_stacksize: 2
  co_varnames: ('a', 'b', 'c')


In [21]:
def a_function(*args):  # accepts any number of positional arguments
    print(args)  # show the collected arguments
    return len(args)  # number of positional arguments received

dump_code_attrs(a_function.__code__)

<code object a_function at 0x7f4a046bdae0, file "<ipython-input-21-4c1d72ac596c>", line 1>
  co_argcount: 0
  co_cellvars: ()
  co_code: b't\x00\x00|\x00\x00\x83\x01\x00\x01t\x01\x00|\x00\x00\x83\x01\x00S'
  co_consts: (None,)
  co_filename: <ipython-input-21-4c1d72ac596c>
  co_firstlineno: 1
  co_flags: 71
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x00\x01\n\x01'
  co_name: a_function
  co_names: ('print', 'len')
  co_nlocals: 1
  co_stacksize: 2
  co_varnames: ('args',)


### Summary
The Abstract Syntax Tree can be compiled down to a *code object*, a structure that holds all details of the program in a compact form – ready to be executed or saved.

A code object contains, among other things, the variable names and constants used in the code. Functions defined in the code are also represented as constants: other code objects!

---
# Bytecode

In [22]:
import dis

In [23]:
!cat module.py

a = 3
b = 'Hello '
print(a * b)

def func(a=1, *b, **c):
    return 7 + 3


In [24]:
# Iterating over a bytes object yields integers, so list() alone is enough.
print(list(code.co_code))

[100, 0, 0, 90, 0, 0, 100, 1, 0, 90, 1, 0, 101, 2, 0, 101, 0, 0, 101, 1, 0, 20, 131, 1, 0, 1, 100, 2, 0, 100, 3, 0, 100, 4, 0, 132, 1, 0, 90, 3, 0, 100, 5, 0, 83]


In [25]:
def print_dis(code):
    """Print a compact disassembly of a code object, one instruction per line.

    Every line shows the instruction's offset, numeric opcode, raw argument
    ('-' when there is none), opcode name and the human-readable argument.
    A blank line is printed before each instruction that starts a new
    source line.
    """
    first_offsets = {offset for offset, _lineno in dis.findlinestarts(code)}
    template = '{i.offset:2}:  {i.opcode:3} {arg:>6}: {i.opname}({i.argrepr})'
    for instruction in dis.get_instructions(code):
        if instruction.offset in first_offsets:
            print()
        shown_arg = instruction.arg
        if shown_arg is None:
            shown_arg = '-'
        print(template.format(i=instruction, arg=shown_arg))

print_dis(code)



 0:  100      0: LOAD_CONST(3)
 3:   90      0: STORE_NAME(a)

 6:  100      1: LOAD_CONST('Hello ')
 9:   90      1: STORE_NAME(b)

12:  101      2: LOAD_NAME(print)
15:  101      0: LOAD_NAME(a)
18:  101      1: LOAD_NAME(b)
21:   20      -: BINARY_MULTIPLY()
22:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
25:    1      -: POP_TOP()

26:  100      2: LOAD_CONST(1)
29:  100      3: LOAD_CONST(<code object func at 0x7f4a05b2bb70, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 5>)
32:  100      4: LOAD_CONST('func')
35:  132      1: MAKE_FUNCTION()
38:   90      3: STORE_NAME(func)
41:  100      5: LOAD_CONST(None)
44:   83      -: RETURN_VALUE()


In [26]:
dis.HAVE_ARGUMENT

90

In [27]:
print(dis.opmap['LOAD_CONST'])
print(dis.opname[100])

100
LOAD_CONST


In [28]:
len(dis.opmap)

114

In [29]:
!python3 -m dis module.py

  1           0 LOAD_CONST               0 (3)
              3 STORE_NAME               0 (a)

  2           6 LOAD_CONST               1 ('Hello ')
              9 STORE_NAME               1 (b)

  3          12 LOAD_NAME                2 (print)
             15 LOAD_NAME                0 (a)
             18 LOAD_NAME                1 (b)
             21 BINARY_MULTIPLY
             22 CALL_FUNCTION            1 (1 positional, 0 keyword pair)
             25 POP_TOP

  5          26 LOAD_CONST               2 (1)
             29 LOAD_CONST               3 (<code object func at 0x7faf5a868540, file "module.py", line 5>)
             32 LOAD_CONST               4 ('func')
             35 MAKE_FUNCTION            1
             38 STORE_NAME               3 (func)
             41 LOAD_CONST               5 (None)
             44 RETURN_VALUE


In [30]:
!cat module.py

a = 3
b = 'Hello '
print(a * b)

def func(a=1, *b, **c):
    return 7 + 3


In [31]:
# Minimal `for` loop, used to show what a loop looks like in bytecode.
def loop_example():
    for i in range(100):  # i takes the values 0..99
        print(i)

print_dis(loop_example.__code__)


 0:  120     30: SETUP_LOOP(to 33)
 3:  116      0: LOAD_GLOBAL(range)
 6:  100      1: LOAD_CONST(100)
 9:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
12:   68      -: GET_ITER()
13:   93     16: FOR_ITER(to 32)
16:  125      0: STORE_FAST(i)

19:  116      1: LOAD_GLOBAL(print)
22:  124      0: LOAD_FAST(i)
25:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
28:    1      -: POP_TOP()
29:  113     13: JUMP_ABSOLUTE()
32:   87      -: POP_BLOCK()
33:  100      0: LOAD_CONST(None)
36:   83      -: RETURN_VALUE()


### Summary
The *bytecode* is a set of instructions for the Python interpreter – a stack-based virtual machine. This is the most important part of a code object – it tells Python what to do.

Each instruction is represented either as a single byte (for opcodes below `dis.HAVE_ARGUMENT`, which take no argument), or as three bytes: a one-byte opcode followed by a two-byte argument.

## Classes
Let's see some fried brains!

In [32]:
def make_class():  # defines a class as a local variable; returns nothing
    class ClassExample:  # the class body is compiled to its own code object
        def __init__(self, name):
            self.name = name  # keep the constructor argument on the instance

dump_code_attrs(make_class.__code__)
print_dis(make_class.__code__)

<code object make_class at 0x7f4a046801e0, file "<ipython-input-32-30532d0627d4>", line 1>
  co_argcount: 0
  co_cellvars: ()
  co_code: b'Gd\x01\x00d\x02\x00\x84\x00\x00d\x02\x00\x83\x02\x00}\x00\x00d\x00\x00S'
  co_consts: (None, <code object ClassExample at 0x7f4a04680300, file "<ipython-input-32-30532d0627d4>", line 2>, 'ClassExample')
  co_filename: <ipython-input-32-30532d0627d4>
  co_firstlineno: 1
  co_flags: 67
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x00\x01'
  co_name: make_class
  co_names: ()
  co_nlocals: 1
  co_stacksize: 3
  co_varnames: ('ClassExample',)

 0:   71      -: LOAD_BUILD_CLASS()
 1:  100      1: LOAD_CONST(<code object ClassExample at 0x7f4a04680300, file "<ipython-input-32-30532d0627d4>", line 2>)
 4:  100      2: LOAD_CONST('ClassExample')
 7:  132      0: MAKE_FUNCTION()
10:  100      2: LOAD_CONST('ClassExample')
13:  131      2: CALL_FUNCTION(2 positional, 0 keyword pair)
16:  125      0: STORE_FAST(ClassExample)
19:  100      0: LOAD_C

In [33]:
help(__build_class__)

Help on built-in function __build_class__ in module builtins:

__build_class__(...)
    __build_class__(func, name, *bases, metaclass=None, **kwds) -> class
    
    Internal helper function used by the class statement.



In [34]:
class_code = make_class.__code__.co_consts[1]
dump_code_attrs(class_code)
print_dis(class_code)

<code object ClassExample at 0x7f4a04680300, file "<ipython-input-32-30532d0627d4>", line 2>
  co_argcount: 0
  co_cellvars: ()
  co_code: b'e\x00\x00Z\x01\x00d\x00\x00Z\x02\x00d\x01\x00d\x02\x00\x84\x00\x00Z\x03\x00d\x03\x00S'
  co_consts: ('make_class.<locals>.ClassExample', <code object __init__ at 0x7f4a046bddb0, file "<ipython-input-32-30532d0627d4>", line 3>, 'make_class.<locals>.ClassExample.__init__', None)
  co_filename: <ipython-input-32-30532d0627d4>
  co_firstlineno: 2
  co_flags: 64
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x0c\x01'
  co_name: ClassExample
  co_names: ('__name__', '__module__', '__qualname__', '__init__')
  co_nlocals: 0
  co_stacksize: 2
  co_varnames: ()

 0:  101      0: LOAD_NAME(__name__)
 3:   90      1: STORE_NAME(__module__)
 6:  100      0: LOAD_CONST('make_class.<locals>.ClassExample')
 9:   90      2: STORE_NAME(__qualname__)

12:  100      1: LOAD_CONST(<code object __init__ at 0x7f4a046bddb0, file "<ipython-input-32-30532d0627d4

---
# Serialization

In [35]:
with open('__pycache__/module.cpython-35.pyc', 'rb') as f:
    module_bytes = f.read()

print(list(module_bytes))

[22, 13, 13, 10, 225, 117, 106, 87, 74, 0, 0, 0, 227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 64, 0, 0, 0, 115, 45, 0, 0, 0, 100, 0, 0, 90, 0, 0, 100, 1, 0, 90, 1, 0, 101, 2, 0, 101, 0, 0, 101, 1, 0, 20, 131, 1, 0, 1, 100, 2, 0, 100, 3, 0, 100, 4, 0, 132, 1, 0, 90, 3, 0, 100, 5, 0, 83, 41, 6, 233, 3, 0, 0, 0, 122, 6, 72, 101, 108, 108, 111, 32, 233, 1, 0, 0, 0, 99, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 79, 0, 0, 0, 115, 4, 0, 0, 0, 100, 3, 0, 83, 41, 4, 78, 233, 7, 0, 0, 0, 114, 1, 0, 0, 0, 233, 10, 0, 0, 0, 169, 0, 41, 3, 218, 1, 97, 218, 1, 98, 218, 1, 99, 114, 5, 0, 0, 0, 114, 5, 0, 0, 0, 250, 65, 47, 104, 111, 109, 101, 47, 112, 118, 105, 107, 116, 111, 114, 105, 47, 100, 101, 118, 47, 115, 108, 105, 100, 101, 115, 47, 50, 48, 49, 54, 45, 48, 54, 45, 50, 50, 45, 112, 121, 115, 105, 108, 101, 115, 105, 97, 45, 98, 121, 116, 101, 99, 111, 100, 101, 47, 109, 111, 100, 117, 108, 101, 46, 112, 121, 218, 4, 102, 117, 110, 99, 5, 0, 0, 0, 115, 2, 0, 0, 0, 0, 1, 114, 10, 

In [36]:
import json
serialized = json.dumps({1: 2, 3: "abc", 4: [1, 2, 3]})
print(repr(serialized))
print(repr(json.loads(serialized)))

'{"1": 2, "3": "abc", "4": [1, 2, 3]}'
{'1': 2, '3': 'abc', '4': [1, 2, 3]}


In [37]:
import pickle

print(code)
pickled = pickle.dumps(code)
print(pickle.loads(pickled))


<code object <module> at 0x7f4a05b2bf60, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>
<code object <module> at 0x7f4a04680420, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>


In [38]:
import marshal

marshalled = marshal.dumps(code)
print(marshal.loads(marshalled))


<code object <module> at 0x7f4a04680390, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>


In [39]:
print(list(marshalled))


[227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 64, 0, 0, 0, 115, 45, 0, 0, 0, 100, 0, 0, 90, 0, 0, 100, 1, 0, 90, 1, 0, 101, 2, 0, 101, 0, 0, 101, 1, 0, 20, 131, 1, 0, 1, 100, 2, 0, 100, 3, 0, 100, 4, 0, 132, 1, 0, 90, 3, 0, 100, 5, 0, 83, 41, 6, 233, 3, 0, 0, 0, 250, 6, 72, 101, 108, 108, 111, 32, 233, 1, 0, 0, 0, 227, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 79, 0, 0, 0, 115, 4, 0, 0, 0, 100, 3, 0, 83, 41, 4, 78, 233, 7, 0, 0, 0, 114, 1, 0, 0, 0, 233, 10, 0, 0, 0, 169, 0, 41, 3, 218, 1, 97, 218, 1, 98, 218, 1, 99, 114, 7, 0, 0, 0, 114, 7, 0, 0, 0, 250, 65, 47, 104, 111, 109, 101, 47, 112, 118, 105, 107, 116, 111, 114, 105, 47, 100, 101, 118, 47, 115, 108, 105, 100, 101, 115, 47, 50, 48, 49, 54, 45, 48, 54, 45, 50, 50, 45, 112, 121, 115, 105, 108, 101, 115, 105, 97, 45, 98, 121, 116, 101, 99, 111, 100, 101, 47, 109, 111, 100, 117, 108, 101, 46, 112, 121, 218, 4, 102, 117, 110, 99, 5, 0, 0, 0, 115, 2, 0, 0, 0, 0, 1, 114, 12, 0, 0, 0, 78, 41, 4, 114, 8, 0, 0, 0, 114, 9, 0,

In [40]:
print(list(module_bytes))

[22, 13, 13, 10, 225, 117, 106, 87, 74, 0, 0, 0, 227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 64, 0, 0, 0, 115, 45, 0, 0, 0, 100, 0, 0, 90, 0, 0, 100, 1, 0, 90, 1, 0, 101, 2, 0, 101, 0, 0, 101, 1, 0, 20, 131, 1, 0, 1, 100, 2, 0, 100, 3, 0, 100, 4, 0, 132, 1, 0, 90, 3, 0, 100, 5, 0, 83, 41, 6, 233, 3, 0, 0, 0, 122, 6, 72, 101, 108, 108, 111, 32, 233, 1, 0, 0, 0, 99, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 79, 0, 0, 0, 115, 4, 0, 0, 0, 100, 3, 0, 83, 41, 4, 78, 233, 7, 0, 0, 0, 114, 1, 0, 0, 0, 233, 10, 0, 0, 0, 169, 0, 41, 3, 218, 1, 97, 218, 1, 98, 218, 1, 99, 114, 5, 0, 0, 0, 114, 5, 0, 0, 0, 250, 65, 47, 104, 111, 109, 101, 47, 112, 118, 105, 107, 116, 111, 114, 105, 47, 100, 101, 118, 47, 115, 108, 105, 100, 101, 115, 47, 50, 48, 49, 54, 45, 48, 54, 45, 50, 50, 45, 112, 121, 115, 105, 108, 101, 115, 105, 97, 45, 98, 121, 116, 101, 99, 111, 100, 101, 47, 109, 111, 100, 117, 108, 101, 46, 112, 121, 218, 4, 102, 117, 110, 99, 5, 0, 0, 0, 115, 2, 0, 0, 0, 0, 1, 114, 10, 

In [41]:
marshal.loads(module_bytes[12:]) == code

True

---

In [42]:
print(module_bytes[0:4])
list(module_bytes[0:4])

b'\x16\r\r\n'


[22, 13, 13, 10]

In [43]:
import importlib.util
print(importlib.util.MAGIC_NUMBER)
list(importlib.util.MAGIC_NUMBER)

b'\x16\r\r\n'


[22, 13, 13, 10]

---

In [44]:
print(module_bytes[4:8])
list(module_bytes[4:8])

b'\xe1ujW'


[225, 117, 106, 87]

In [45]:
stamp_value = int.from_bytes(module_bytes[4:8], 'little')
stamp_value

1466594785

In [46]:
datetime.datetime.fromtimestamp(stamp_value).isoformat(' ')

'2016-06-22 13:26:25'

---

In [47]:
print(module_bytes[8:12])
list(module_bytes[8:12])

b'J\x00\x00\x00'


[74, 0, 0, 0]

In [48]:
int.from_bytes(module_bytes[8:12], 'little')

74

In [49]:
with open('module.py', 'rb') as f:
    print(len(f.read()))

74


In [50]:
os.stat('module.py')

os.stat_result(st_mode=33204, st_ino=2513529, st_dev=64768, st_nlink=1, st_uid=1000, st_gid=1000, st_size=74, st_atime=1472134086, st_mtime=1466594785, st_ctime=1466594785)


### Summary
The `.pyc` file stores the result of the compiler's hard work, so it doesn't have to compile again if the source file is not changed. It is generated (and read) as part of Python's import machinery – i.e. from "import" statements, not when running a module directly using `python module.py` or `python -m module`.

The “meat” of a `.pyc` file is a code object serialized using the `marshal` module – an efficient format that only supports values that can appear as constants in Python files.
The `.pyc` file adds a header to that, leading to this overall structure:

* Bytes 0-3: Magic number (identifies a bytecode version)
* Bytes 4-7: Modification time of source file
* Bytes 8-11: Length of source file (in bytes)
* Bytes 12 and on: `marshal`-ed code object

---
# Functions, Code Objects and Signatures

In [55]:
def make_adder(a):  # returns a function that remembers `a` (a closure)
    def adder(b:int=2, *args, c: "(unused)"=3, **kwargs) -> int:  # one parameter of each kind except positional-only
        "A silly function"
        result = a + b  # `a` comes from the enclosing make_adder() call
        print('debug:', result)
        return result
    return adder

adder = make_adder(1)

print(adder(2))
print(adder(3))

debug: 3
3
debug: 4
4


In [56]:
print('__code__: ', adder.__code__)
print('__defaults__: ', adder.__defaults__)
print('__kwdefaults__: ', adder.__kwdefaults__)
print('__qualname__: ', adder.__qualname__)
print('__doc__: ', adder.__doc__)
print('__annotations__: ', adder.__annotations__)
print('__globals__: a', type(adder.__globals__).__name__)
print('__closure__: ', adder.__closure__)

__code__:  <code object adder at 0x7f4a04680780, file "<ipython-input-55-8ecfda3da029>", line 2>
__defaults__:  (2,)
__kwdefaults__:  {'c': 3}
__qualname__:  make_adder.<locals>.adder
__doc__:  A silly function
__annotations__:  {'b': <class 'int'>, 'c': '(unused)', 'return': <class 'int'>}
__globals__: a dict
__closure__:  (<cell at 0x7f4a0858aa08: int object at 0x7f4a19c6d400>,)


In [60]:
dump_code_attrs(adder.__code__)

<code object adder at 0x7f4a04680780, file "<ipython-input-55-8ecfda3da029>", line 2>
  co_argcount: 1
  co_cellvars: ()
  co_code: b'\x88\x00\x00|\x00\x00\x17}\x04\x00t\x00\x00d\x01\x00|\x04\x00\x83\x02\x00\x01|\x04\x00S'
  co_consts: ('A silly function', 'debug:')
  co_filename: <ipython-input-55-8ecfda3da029>
  co_firstlineno: 2
  co_flags: 31
  co_freevars: ('a',)
  co_kwonlyargcount: 1
  co_lnotab: b'\x00\x02\n\x01\r\x01'
  co_name: adder
  co_names: ('print',)
  co_nlocals: 5
  co_stacksize: 3
  co_varnames: ('b', 'c', 'args', 'kwargs', 'result')


In [61]:
dump_code_attrs(make_adder.__code__)

<code object make_adder at 0x7f4a04680660, file "<ipython-input-55-8ecfda3da029>", line 1>
  co_argcount: 1
  co_cellvars: ('a',)
  co_code: b'd\x01\x00d\x02\x00d\x03\x00t\x00\x00d\x04\x00t\x00\x00d\x05\x00\x87\x00\x00f\x01\x00d\x06\x00d\x07\x00\x90\x04\x00\x86\x01\x01}\x01\x00|\x01\x00S'
  co_consts: (None, 2, 'c', 3, '(unused)', ('b', 'c', 'return'), <code object adder at 0x7f4a04680780, file "<ipython-input-55-8ecfda3da029>", line 2>, 'make_adder.<locals>.adder')
  co_filename: <ipython-input-55-8ecfda3da029>
  co_firstlineno: 1
  co_flags: 3
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x00\x01*\x05'
  co_name: make_adder
  co_names: ('int',)
  co_nlocals: 2
  co_stacksize: 10
  co_varnames: ('a', 'adder')


In [62]:
sig = inspect.signature(adder)
print(sig)

(b:int=2, *args, c:'(unused)'=3, **kwargs) -> int


In [63]:
sig.parameters

mappingproxy(OrderedDict([('b', <Parameter "b:int=2">), ('args', <Parameter "*args">), ('c', <Parameter "c:'(unused)'=3">), ('kwargs', <Parameter "**kwargs">)]))

In [64]:
for name, arg in sig.parameters.items():
    print('{a.name}: kind={a.kind}, default={a.default!r}, annotation={a.annotation!r}'.format(a=arg))

b: kind=1, default=2, annotation=<class 'int'>
args: kind=2, default=<class 'inspect._empty'>, annotation=<class 'inspect._empty'>
c: kind=3, default=3, annotation='(unused)'
kwargs: kind=4, default=<class 'inspect._empty'>, annotation=<class 'inspect._empty'>


In [65]:
sig.return_annotation

int

In [68]:
bound_args = sig.bind(1, c=2, d=3)
bound_args

<BoundArguments (b=1, c=2, kwargs={'d': 3})>

In [69]:
# Function that accepts no arguments at all; its body does nothing.
def foo():
    ...

sig2 = inspect.signature(foo)
sig2.bind(a=2)

TypeError: got an unexpected keyword argument 'a'

In [71]:
adder(*bound_args.args, **bound_args.kwargs)

debug: 2


2

In [None]:
# Example callback: prints the two values it is called with.
def on_click(x, y):
    print(x, y)

myframework.set_click_handler(on_click)


In [79]:
# Assignment examples, used to show evaluation and store order in bytecode.
def demo():
    a = 1
    b = 2
    c = 3
    d = 4
    
    # The right-hand side is evaluated first, so `b` receives the old `a` (1).
    a, b, c  =1, a, 2
    
    # All right-hand values (including both print() calls) are computed before
    # any target is assigned; targets are then assigned left to right, so `a`
    # is already the new list when `a[0]` is stored.
    a, a[0], c, d = [1, 2, 3], b, print(c), print(a)

print_dis(demo.__code__)


 0:  100      1: LOAD_CONST(1)
 3:  125      0: STORE_FAST(a)

 6:  100      2: LOAD_CONST(2)
 9:  125      1: STORE_FAST(b)

12:  100      3: LOAD_CONST(3)
15:  125      2: STORE_FAST(c)

18:  100      4: LOAD_CONST(4)
21:  125      3: STORE_FAST(d)

24:  100      1: LOAD_CONST(1)
27:  124      0: LOAD_FAST(a)
30:  100      2: LOAD_CONST(2)
33:    3      -: ROT_THREE()
34:    2      -: ROT_TWO()
35:  125      0: STORE_FAST(a)
38:  125      1: STORE_FAST(b)
41:  125      2: STORE_FAST(c)

44:  100      1: LOAD_CONST(1)
47:  100      2: LOAD_CONST(2)
50:  100      3: LOAD_CONST(3)
53:  103      3: BUILD_LIST()
56:  124      1: LOAD_FAST(b)
59:  116      0: LOAD_GLOBAL(print)
62:  124      2: LOAD_FAST(c)
65:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
68:  116      0: LOAD_GLOBAL(print)
71:  124      0: LOAD_FAST(a)
74:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
77:  102      4: BUILD_TUPLE()
80:   92      4: UNPACK_SEQUENCE()
83:  125      0: STORE_FAST(a)
8

### Summary
Python's *functions* are mutable objects, each of which holds an immutable code object. Data related to code execution, like variable names and constant values, are stored in the code object. Extra data that's only needed when calling the function, such as default argument values, is stored on the function.

Data related to arguments – their names, values, and annotations – are stored in a format that's convenient for calling and running the function. When you need to inspect them, you can use a helper called `inspect.signature`, which exposes them in a more usable way.

---
# Line Numbers

In [80]:
# Deliberately failing function: evaluating 1/0 raises ZeroDivisionError.
def oops():
    print(1/0)

print_dis(oops.__code__)


 0:  116      0: LOAD_GLOBAL(print)
 3:  100      1: LOAD_CONST(1)
 6:  100      2: LOAD_CONST(0)
 9:   27      -: BINARY_TRUE_DIVIDE()
10:  131      1: CALL_FUNCTION(1 positional, 0 keyword pair)
13:    1      -: POP_TOP()
14:  100      0: LOAD_CONST(None)
17:   83      -: RETURN_VALUE()


In [81]:
oops()

ZeroDivisionError: division by zero

In [82]:
!cat module.py

a = 3
b = 'Hello '
print(a * b)

def func(a=1, *b, **c):
    return 7 + 3


In [83]:
dump_code_attrs(code)

<code object <module> at 0x7f4a05b2bf60, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 1>
  co_argcount: 0
  co_cellvars: ()
  co_code: b'd\x00\x00Z\x00\x00d\x01\x00Z\x01\x00e\x02\x00e\x00\x00e\x01\x00\x14\x83\x01\x00\x01d\x02\x00d\x03\x00d\x04\x00\x84\x01\x00Z\x03\x00d\x05\x00S'
  co_consts: (3, 'Hello ', 1, <code object func at 0x7f4a05b2bb70, file "/home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py", line 5>, 'func', None)
  co_filename: /home/pviktori/dev/slides/2016-06-22-pysilesia-bytecode/module.py
  co_firstlineno: 1
  co_flags: 64
  co_freevars: ()
  co_kwonlyargcount: 0
  co_lnotab: b'\x06\x01\x06\x01\x0e\x02'
  co_name: <module>
  co_names: ('a', 'b', 'print', 'func')
  co_nlocals: 0
  co_stacksize: 3
  co_varnames: ()


In [84]:
!python -m dis module.py

  1           0 LOAD_CONST               0 (3)
              3 STORE_NAME               0 (a)

  2           6 LOAD_CONST               1 ('Hello ')
              9 STORE_NAME               1 (b)

  3          12 LOAD_NAME                0 (a)
             15 LOAD_NAME                1 (b)
             18 BINARY_MULTIPLY     
             19 PRINT_ITEM          
             20 PRINT_NEWLINE       

  5          21 LOAD_CONST               2 (1)
             24 LOAD_CONST               3 (<code object func at 0x7feaad75c830, file "module.py", line 5>)
             27 MAKE_FUNCTION            1
             30 STORE_NAME               2 (func)
             33 LOAD_CONST               4 (None)
             36 RETURN_VALUE        


In [85]:
list(code.co_lnotab)

[6, 1, 6, 1, 14, 2]

In [86]:
list(zip(code.co_lnotab[::2], code.co_lnotab[1::2]))

[(6, 1), (6, 1), (14, 2)]

In [None]:
lst = [1, 2, 3]
        
print(lst)

### Summary
When Python needs to determine which line number a particular bytecode offset belongs to, it can use the line number table, or `lnotab`. The lnotab holds pairs of bytes saying, effectively, how many bytes correspond to how many lines.

The numbers can't be negative – an instruction can never refer to an earlier line than the instruction before it. The bytecode generator never goes "backwards".

A full explanation is in the file `Objects/lnotab_notes.txt` in Python sources.

---
# Frames
Code objects, functions, and frames

In [87]:
def inner():
    a = 1  # a local variable, so the frame has something in f_locals
    frame = inspect.currentframe()  # the frame object of this very call
    return frame
def outer():  # only calls inner(), giving inner's frame a parent frame
    return inner()

frame = outer()
frame

<frame at 0x7f4a0468f630>

In [88]:
print('f_code:', frame.f_code)
print('f_lasti:', frame.f_lasti)
print('f_lineno:', frame.f_lineno)
print('f_locals:', frame.f_locals)
print('f_back:', frame.f_back)

f_code: <code object inner at 0x7f4a046804b0, file "<ipython-input-87-4b25b15e8d9e>", line 1>
f_lasti: 15
f_lineno: 3
f_locals: {'a': 1}
f_back: <frame object at 0x55dcd21e6ed8>


In [89]:
print('f_code:', frame.f_back.f_code)
print('f_lasti:', frame.f_back.f_lasti)
print('f_lineno:', frame.f_back.f_lineno)
print('f_locals:', frame.f_back.f_locals)
print('f_back:', frame.f_back.f_back)

f_code: <code object outer at 0x7f4a04680c90, file "<ipython-input-87-4b25b15e8d9e>", line 4>
f_lasti: 6
f_lineno: 5
f_locals: {}
f_back: <frame object at 0x55dcd22129e8>


In [90]:
def get_current_line(frame):
    """Return ``'<code name>: <source line>'`` for the line a frame is on.

    The source of the frame's code object is fetched with ``inspect`` and
    indexed by the frame's current line number relative to the first line
    of that code object. The source line is returned unstripped.
    """
    code = frame.f_code
    source_lines, _ = inspect.getsourcelines(code)
    offset = frame.f_lineno - code.co_firstlineno
    return '{}: {}'.format(code.co_name, source_lines[offset])

print(get_current_line(frame))
print(get_current_line(frame.f_back))


inner:     return inspect.currentframe()

outer:     return inner()



In [91]:
import traceback
# Call itself with a decreasing counter until `wait` drops below zero, then
# return the result of traceback.format_stack() taken at that depth.
def recurse(wait=10):
    if wait < 0:
        return traceback.format_stack()
    else:
        return recurse(wait-1)

print(''.join(recurse()))

  File "/usr/lib64/python3.5/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib64/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3.5/site-packages/IPython/kernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3.5/site-packages/IPython/config/application.py", line 574, in launch_instance
    app.start()
  File "/usr/lib/python3.5/site-packages/IPython/kernel/zmq/kernelapp.py", line 373, in start
    ioloop.IOLoop.instance().start()
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/ioloop.py", line 151, in start
    super(ZMQIOLoop, self).start()
  File "/usr/lib64/python3.5/site-packages/tornado/ioloop.py", line 882, in start
    handler_func(fd_obj, events)
  File "/usr/lib64/python3.5/site-packages/tornado/stack_context.py", line 274, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib64/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 

In [92]:
def fmt(string):
    """Format ``string`` using the caller's local variables as fields.

    The calling frame is found through ``inspect.currentframe().f_back`` and
    its ``f_locals`` are passed to ``str.format`` as keyword arguments.
    """
    caller = inspect.currentframe().f_back
    caller_locals = caller.f_locals
    return string.format(**caller_locals)

a = 1
b = 2
c = 3

print(fmt("{a}, {b}, {c}"))


1, 2, 3


### Summary
When code is executed, there exists a `frame` object for each running function. This object contains the current instruction, values of local variables, and a pointer to the "parent" frame, from which the current function was called.

A traceback is generated by walking from the current frame to its parent, its parent's parent, and so on. Each time, the current line is looked up based on the current instruction. (This lookup only takes place when the traceback is actually printed.)

---
# Questions?

<br>

> Petr Viktorin
>
> encukou@gmail.com
>
> @encukou