# Coding Best Practices with Python

In [1]:
import pandas as pd
import numpy as np
import sys
import pytest

# 1. Writing Efficient Python Code

Defining "efficient":      

Efficient code satisfy 2 key concepts:      
1) minimal completion time (fast runtime)       
2) minimal resource consumption (small memory footprint)        
i.e. reduce latency and memory overhead.      

Defining "Pythonic":       

Pythonic code tend to be less verbose and easier to interpret. (e.g. use list comprehension rather than for loop + append). Pythonic code is usually efficient code.        


Suppose you wanted to collect the names in the above list that have six letters or more. In other programming languages, the typical approach is to create an index variable (i), use i to iterate over the list, and use an if statement to collect the names with six letters or more

In [2]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

In [3]:
# Print the list created using the Non-Pythonic approach
i = 0
new_list= []
while i < len(names):
    if len(names[i]) >= 6:
        new_list.append(names[i])
    i += 1
print(new_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [4]:
# more pythonic
# Print the list created by looping over the contents of names
better_list = []
for name in names:
    if len(name) >= 6:
        better_list.append(name)
print(better_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [5]:
# best pythonic
# Print the list created by using list comprehension
best_list = [name for name in names if len(name) >= 6]
print(best_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [6]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


## 1.1 Building with built-ins

Built-in components are referred to as the Python Standard Library.        
Built-in types: list, tuple, set, dict and others.       
Built-in func: print(), len(), range(), round(), enumerate(), map(), zip() etc.       
Built-in module: os, sys, itertools, collections, math etc.      


In [7]:
# range()
# Explicitly typing a list of numbers:
# nums = [0,1,2,3,4,5,6,7,8,9,10]

# using range(start,stop) and list
nums = range(0,11)
nums_list = list(nums)
print(nums_list)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [8]:
# range(stop)
nums = range(11)
nums_list = list(nums)
print(nums_list)

# note that range func returns a range object, which we can convert into a list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [9]:
# range() with a step value
even_nums = range(2,11,2)
even_nums_list = list(even_nums)
print(even_nums_list)

[2, 4, 6, 8, 10]


In [10]:
# enumerate()
# enumerate() creates an index item pair for each item in the object provided.
letters = ["a", "b", "c", "d"]
indexed_letters = enumerate(letters)

indexed_letters_list = list(indexed_letters)
print(indexed_letters_list)

# enumerate will return an enumerate object, then can be converted into a list.

[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')]


In [11]:
# enumerate with starting index
letters = ["a", "b", "c", "d"]
indexed_letters2 = enumerate(letters, start=5)

indexed_letters2_list = list(indexed_letters2)
print(indexed_letters2_list)

[(5, 'a'), (6, 'b'), (7, 'c'), (8, 'd')]


In [12]:
# map()
# map applies a function to each element in an object

nums = [1.5, 2.3, 3.4, 4.6, 5.0]

rnd_nums = map(round, nums)
print(list(rnd_nums))

[2, 2, 3, 5, 5]


In [13]:
# map() with lambda func.
nums = [1,2,3,4,5]
sqrd_nums = map(lambda x: x**2, nums)

print(list(sqrd_nums))

[1, 4, 9, 16, 25]


In [14]:
# Create a range object that goes from 0 to 5
nums = range(0,6)
print(type(nums))

# Convert nums to a list
nums_list = list(nums)
print(nums_list)

# Create a new list of odd numbers from 1 to 11 by unpacking a range object
nums_list2 = [*range(1,12,2)]
print(nums_list2)

<class 'range'>
[0, 1, 2, 3, 4, 5]
[1, 3, 5, 7, 9, 11]


suppose you had a list of people that arrived at a party you are hosting. The list is ordered by arrival (Jerry was the first to arrive, followed by Kramer, etc.

In [15]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']
# non-pythonic way
indexed_names = []
for i in range(len(names)):
    index_name = (i, names[i])
    indexed_names.append(index_name)
    
print(indexed_names)

[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]


In [16]:
# more pythonic
# Rewrite the for loop to use enumerate
indexed_names = []
for i,name in enumerate(names):
    index_name = (i,name)
    indexed_names.append(index_name) 
print(indexed_names)

# even more pythonic
# Rewrite the above for loop using list comprehension
indexed_names_comp = [(i,name) for i,name in enumerate(names)]
print(indexed_names_comp)

# very pythonic
# Unpack an enumerate object with a starting index of one
indexed_names_unpack = [*enumerate(names, start=1)]
print(indexed_names_unpack)

[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(1, 'Jerry'), (2, 'Kramer'), (3, 'Elaine'), (4, 'George'), (5, 'Newman')]


Suppose you wanted to create a new list (called names_uppercase) that converted all the letters in each name to uppercase. you could accomplish this with the below for loop:

In [17]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

# not so pythonic
names_uppercase = []

for name in names:
  names_uppercase.append(name.upper())

print(names_uppercase)

['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']


In [18]:
# Use map to apply str.upper to each element in names
names_map  = map(str.upper, names)

# Print the type of the names_map
print(type(names_map))

# Unpack names_map into a list
names_uppercase = [*names_map]

# Print the list created above
print(names_uppercase)

<class 'map'>
['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']


## 1.2 The power of NumPy arrays

NumPy arrays provide a fast and memory efficient alternative to Python list.         
numpy arrays are homogeneous, means they must contain elements of the same type.      


In [19]:
nums_list = list(range(5))
print(nums_list)

[0, 1, 2, 3, 4]


In [20]:
nums_np = np.array(range(5))
print(nums_np)

[0 1 2 3 4]


In [21]:
# np array homogeneity
nums_np_ints = np.array([1,2,3])
print(nums_np_ints)

print(nums_np_ints.dtype)

nums_np_floats = np.array([1,2.5,3])
print(nums_np_floats)

print(nums_np_floats.dtype)

[1 2 3]
int64
[1.  2.5 3. ]
float64


In [22]:
# np array broadcasting
nums_np = np.array([-2,-1,0,1,2])
nums_np**2

array([4, 1, 0, 1, 4])

In [23]:
# 2-D list/array comparison

# list
nums2 = [[1,2,3],
       [4,5,6]]

# array 
nums_np = np.array(nums2)

# list slicing
print(nums2[0][1])
# array slicing
print(nums_np[0,1])

#return first col of list
print([row[0] for row in nums2])

# return first col of array
print(nums_np[:,0])

2
2
[1, 4]
[1 4]


In [24]:
# np boolean indexing
nums = [-2, -1, 0, 1, 2]
nums_np = np.array(nums)

# bool mask
print(nums_np > 0)

print(nums_np[nums_np > 0])


[False False False  True  True]
[1 2]


You have a list of guests (the names list). Each guest, for whatever reason, has decided to show up to the party in 10-minute increments. For example, Jerry shows up to Festivus 10 minutes into the party's start time, Kramer shows up 20 minutes into the party, and so on and so forth.        

We want to write a few simple lines of code, using the built-ins we have covered, to welcome each of your guests and let them know how many minutes late they are to your party

In [25]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

In [26]:
# Create a list of arrival times
arrival_times = [*range(10,60,10)]

print(arrival_times)

# clock is 3 min faster
# Convert arrival_times to an array and update the times
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3
print(new_times)

# Use list comprehension and enumerate to pair guests to new times
guest_arrivals = [(names[i],time) for i,time in enumerate(new_times)]
print(guest_arrivals)



[10, 20, 30, 40, 50]
[ 7 17 27 37 47]
[('Jerry', 7), ('Kramer', 17), ('Elaine', 27), ('George', 37), ('Newman', 47)]


## 1.3 Runtime and profiling code 

magic command: enhancement on top of normal python syntax. They are prefix "%"         

IPython has magic command %timeit

In [27]:
%timeit rand_nums = np.random.rand(1000)
# no. of run: how many iteration we want to estimate runtime (-r)
# no. of loop: how many times the code is executed per run (-n)

9.58 µs ± 272 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [28]:
%timeit -r2 -n10 rand_nums = np.random.rand(1000)
# 2 runs each with 10 execution, i.e. 20 times is run

The slowest run took 5.14 times longer than the fastest. This could mean that an intermediate result is being cached.
29.7 µs ± 20 µs per loop (mean ± std. dev. of 2 runs, 10 loops each)


In [29]:
# single line of code
%timeit nums = [x for x in range(10)]

639 ns ± 12.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [30]:
# multiple lines of code
# %%timeit
# nums=[]
# for x in range(10):
#     nums.append(x)

In [31]:
# saving the timeit output
times = %timeit -o rand_nums = np.random.rand(1000)
print(times)

9.11 µs ± 115 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
9.11 µs ± 115 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [32]:
times.timings

[9.049733690001176e-06,
 9.209603429999334e-06,
 9.222254799999519e-06,
 9.297384989999955e-06,
 9.01004766999904e-06,
 9.014614130001063e-06,
 8.998615220000374e-06]

In [33]:
times.best

8.998615220000374e-06

In [34]:
times.worst

9.297384989999955e-06

In [35]:
# python data structure creation

# using formal name
formal_list = list()
formal_dict = dict()
formal_tuple = tuple()

# using (shorthand) literal syntax
literal_list = []
literal_dict = {}
literal_tuple = ()

# compare timing of creation
# formal
f_time = %timeit -o formal_dict = dict()

# literal
l_time = %timeit -o literal_dict = {}

101 ns ± 1.07 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
33.3 ns ± 0.873 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [36]:
# Create a list of integers (0-50) using list comprehension
%timeit nums_list_comp = [num for num in range(51)]

# Create a list of integers (0-50) by unpacking range
%timeit nums_unpack = [*range(51)]

1.68 µs ± 12.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
534 ns ± 5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## 1.4 Code profiling for runtime

What if we want to time a large code base or see line-by-line within a function ? -> code profiling      

Code Profiling is a technique used to describe how ling, and how often, various parts of a program are executed.    
It can be used for line-by-line analysis, and provide stats on individual pieces of our code w/o magic command %timeit.       

package used: line_profiler

In [37]:
heroes = ["Batman", "Superman", " Wonder Woman"]

hts = np.array([188.0, 191.0, 183.0])
wts = np.array([95.0, 101.0, 74.0])

In [38]:
def convert_units(heroes, heights, weights):
    new_hts = [ht*0.39370 for ht in heights]
    new_wts = [wt*0.39370 for wt in weights]
    
    hero_data = {}
    
    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])
        
    return hero_data

In [39]:
convert_units(heroes, hts, wts)

{'Batman': (74.01559999999999, 37.4015),
 'Superman': (75.19669999999999, 39.7637),
 ' Wonder Woman': (72.0471, 29.1338)}

In [40]:
# use %timeit
%timeit convert_units(heroes, hts, wts)
# this only give us total execution time
# we could technically use %timeit on every line in the function

3.73 µs ± 25.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [41]:
# use line_profiler extension
%load_ext line_profiler

In [42]:
# magic command %lprun is from line_profiler
# -f: we want to profile a func
# follow by the name of the func, w/o ().
# then the func with arg
%lprun -f convert_units convert_units(heroes, hts, wts)
# hits: how many time that line is executed
# time: uses time unit
# per hit: avg amt of time spent executing a single line: Time/Hits
# % Time: percentage of time spent on a line wrt total time in the func

Timer unit: 1e-06 s

Total time: 2.9e-05 s
File: <ipython-input-38-57a50ec7a699>
Function: convert_units at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def convert_units(heroes, heights, weights):
     2         1         16.0     16.0     55.2      new_hts = [ht*0.39370 for ht in heights]
     3         1          5.0      5.0     17.2      new_wts = [wt*0.39370 for wt in weights]
     4                                               
     5         1          1.0      1.0      3.4      hero_data = {}
     6                                               
     7         4          4.0      1.0     13.8      for i,hero in enumerate(heroes):
     8         3          3.0      1.0     10.3          hero_data[hero] = (new_hts[i], new_wts[i])
     9                                                   
    10         1          0.0      0.0      0.0      return hero_data

In [43]:
def convert_units_broadcast(heroes, heights, weights):

    # Array broadcasting instead of list comprehension
    new_hts = heights * 0.39370
    new_wts = weights * 2.20462

    hero_data = {}

    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])

    return hero_data

In [44]:
%lprun -f convert_units_broadcast convert_units_broadcast(heroes, hts, wts)

Timer unit: 1e-06 s

Total time: 2.4e-05 s
File: <ipython-input-43-097b3089decf>
Function: convert_units_broadcast at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def convert_units_broadcast(heroes, heights, weights):
     2                                           
     3                                               # Array broadcasting instead of list comprehension
     4         1         16.0     16.0     66.7      new_hts = heights * 0.39370
     5         1          4.0      4.0     16.7      new_wts = weights * 2.20462
     6                                           
     7         1          0.0      0.0      0.0      hero_data = {}
     8                                           
     9         4          1.0      0.2      4.2      for i,hero in enumerate(heroes):
    10         3          3.0      1.0     12.5          hero_data[hero] = (new_hts[i], new_wts[i])
    11                               

## 1.5 Code profiling for memory usage

can use built-in module: sys.     
This module contains system specific func and contains a nice method: sys.getsizeof(nums_list), which return the size of the object in bytes. 

In [45]:
# single obj size
nums_np = np.array(range(1000))
sys.getsizeof(nums_np)

8096

In [46]:
# line-by-line memory footprint
%load_ext memory_profiler

%mprun -f convert_units convert_units(heroes, hts, wts)
# any func profiled for memory must be defined in a file and imported. so here wont work
# so have to save in .py file

ERROR: Could not find file <ipython-input-38-57a50ec7a699>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.





In [47]:
from hero_funcs import convert_units

%load_ext memory_profiler

%mprun -f convert_units convert_units(heroes, hts, wts)

# results will be differeent on different platform and runs

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler



Filename: /Users/XavierTang/Documents/Data Science/Python/python_basics/hero_funcs.py

Line #    Mem usage    Increment   Line Contents
     1     96.6 MiB     96.6 MiB   def convert_units(heroes, heights, weights):
     2     96.6 MiB      0.0 MiB       new_hts = [ht*0.39370 for ht in heights]
     3     96.6 MiB      0.0 MiB       new_wts = [wt*0.39370 for wt in weights]
     4                                 
     5     96.6 MiB      0.0 MiB       hero_data = {}
     6                                 
     7     96.6 MiB      0.0 MiB       for i,hero in enumerate(heroes):
     8     96.6 MiB      0.0 MiB           hero_data[hero] = (new_hts[i], new_wts[i])
     9                                     
    10     96.6 MiB      0.0 MiB       return hero_data

## 1.6 Combining objects

zip can be used to combine object.     

Python also has built-in specialised container datatypes as alternatives to general purpose dict, list, set and turple.    
Notable:     
namedtuple: tuple subclasses with named fields     
deque: list-like container with fast appends and pops        
Counter: dict for counting hashable objects       
OrderedDict: dict that retains order of entries            
defaultdict: dict that calls a factory function to supply missing values.          

The itertools are functional tools for creating and using iterators.        
Notable:     
Infinite iterators: count, cycle, repeat      
Finite iterators: accumulate, chain, zip_longest      
Combination generators: product, permutations, combinations       

In [48]:
names = ["Bulbasaur", "Charmander", " Squirtle"]
hps = [45, 39, 44]

# inefficient
combined = []

for i, pokemon in enumerate(names):
    combined.append((pokemon, hps[i]))
    
print(combined)

[('Bulbasaur', 45), ('Charmander', 39), (' Squirtle', 44)]


In [49]:
# using zip
# uneven zip: will stop once the shortest list is exhausted
combined_zip = zip(names, hps)
# return a "zip" obj
print(type(combined_zip))

<class 'zip'>


In [50]:
# the obj "zip" has to be unpacked into a list to see the contents
combined_zip_list = [*combined_zip]
print(combined_zip_list)

[('Bulbasaur', 45), ('Charmander', 39), (' Squirtle', 44)]


In [51]:
# frequency counting with loop
poke_types = ["Grass", "Dark", "Fire", "Water", "Steel", "Dragon", "Dark", "Water", "Steel", "Fire"]

# normal way
type_count = {}

for poke_type in poke_types:
    if poke_type not in type_count:
        type_count[poke_type] = 1
    else: type_count[poke_type] += 1
        
print(type_count)

{'Grass': 1, 'Dark': 2, 'Fire': 2, 'Water': 2, 'Steel': 2, 'Dragon': 1}


In [52]:
# using collections.Counter()
from collections import Counter

type_counts_counter = Counter(poke_types)
print(type_counts_counter)

Counter({'Dark': 2, 'Fire': 2, 'Water': 2, 'Steel': 2, 'Grass': 1, 'Dragon': 1})


In [53]:
# Suppose we want all combinations pairs of type possible
combos = []

for x in poke_types:
    for y in poke_types:
        if x==y:
            continue
        if ((x,y) not in combos) & ((y,x) not in combos):
            combos.append((x,y))
            
print(combos)

[('Grass', 'Dark'), ('Grass', 'Fire'), ('Grass', 'Water'), ('Grass', 'Steel'), ('Grass', 'Dragon'), ('Dark', 'Fire'), ('Dark', 'Water'), ('Dark', 'Steel'), ('Dark', 'Dragon'), ('Fire', 'Water'), ('Fire', 'Steel'), ('Fire', 'Dragon'), ('Water', 'Steel'), ('Water', 'Dragon'), ('Steel', 'Dragon')]


In [54]:
# using itertools
from itertools import combinations
combos_obj = combinations(poke_types, 2)
print(type(combos_obj))
combos = [*combos_obj]
print(combos)

<class 'itertools.combinations'>
[('Grass', 'Dark'), ('Grass', 'Fire'), ('Grass', 'Water'), ('Grass', 'Steel'), ('Grass', 'Dragon'), ('Grass', 'Dark'), ('Grass', 'Water'), ('Grass', 'Steel'), ('Grass', 'Fire'), ('Dark', 'Fire'), ('Dark', 'Water'), ('Dark', 'Steel'), ('Dark', 'Dragon'), ('Dark', 'Dark'), ('Dark', 'Water'), ('Dark', 'Steel'), ('Dark', 'Fire'), ('Fire', 'Water'), ('Fire', 'Steel'), ('Fire', 'Dragon'), ('Fire', 'Dark'), ('Fire', 'Water'), ('Fire', 'Steel'), ('Fire', 'Fire'), ('Water', 'Steel'), ('Water', 'Dragon'), ('Water', 'Dark'), ('Water', 'Water'), ('Water', 'Steel'), ('Water', 'Fire'), ('Steel', 'Dragon'), ('Steel', 'Dark'), ('Steel', 'Water'), ('Steel', 'Steel'), ('Steel', 'Fire'), ('Dragon', 'Dark'), ('Dragon', 'Water'), ('Dragon', 'Steel'), ('Dragon', 'Fire'), ('Dark', 'Water'), ('Dark', 'Steel'), ('Dark', 'Fire'), ('Water', 'Steel'), ('Water', 'Fire'), ('Steel', 'Fire')]


## 1.7 Set Theory

If we want to compare similiarity and differences between the contents of 2 objects, we can use set theory.      

Built-in "set" datatype with accompanying methods:          
intersection(): all elements that are in both sets       
difference(): all elements in one set but not the other        
symmetric_difference(): all elements in exactly one set     
union(): all elements that are in either set       

we can consider to store data in set datatype if we wish to compare them.       
they also have the ability to quickly check if a value exist within its memeber using "in" operator.     

A set is defined as a collection of distinct elements. We can use a set to collect unique items from an existing object.

In [55]:
list_a = ["Bulbasaur", "Charmander", "Squirtle"]
list_b = ["Caterpie", "Pidgey", "Squirtle"]

# find intersec
# inefficent
in_common = []

for pokemon_a in list_a:
    for pokemon_b in list_b:
        if pokemon_a == pokemon_b:
            in_common.append(pokemon_a)
            
print(in_common)

['Squirtle']


In [56]:
# using the set datatype
set_a = set(list_a)
print(set_a)

{'Bulbasaur', 'Squirtle', 'Charmander'}


In [57]:
set_b = set(list_b)
print(set_b)

{'Pidgey', 'Squirtle', 'Caterpie'}


In [58]:
set_a.intersection(set_b)

{'Squirtle'}

In [59]:
# in set_a but not in set_b
print(set_a.difference(set_b))
# in set_b but not in set_a
print(set_b.difference(set_a))

{'Bulbasaur', 'Charmander'}
{'Pidgey', 'Caterpie'}


In [60]:
set_a.symmetric_difference(set_b)

{'Bulbasaur', 'Caterpie', 'Charmander', 'Pidgey'}

In [61]:
set_a.union(set_b)

{'Bulbasaur', 'Caterpie', 'Charmander', 'Pidgey', 'Squirtle'}

In [62]:
# collect unique items

#inefficient
unique_types = []

for prim_type in poke_types:
    if prim_type not in unique_types:
        unique_types.append(prim_type)
        
print(unique_types)

['Grass', 'Dark', 'Fire', 'Water', 'Steel', 'Dragon']


In [63]:
# using set
unique_type_set = set(poke_types)
print(unique_type_set)

{'Steel', 'Grass', 'Dragon', 'Fire', 'Water', 'Dark'}


## 1.9 Eliminating loops

In [64]:
poke_stats = [
    [90, 92, 75, 60],
    [25, 20, 15, 90],
    [55, 130, 60, 75]
]

# sum of each row

# for loop: inefficient
totals = []
for row in poke_stats:
    totals.append(sum(row))
    
print(totals)

[317, 150, 320]


In [65]:
# list comprehension
# faster
totals_comp = [sum(row) for row in poke_stats]
print(totals_comp)

# built-in map()
# even faster
totals_comp = [*map(sum, poke_stats)]
print(totals_comp)

[317, 150, 320]
[317, 150, 320]


In [66]:
# use numpy
poke_stats = np.array([
    [90, 92, 75, 60],
    [25, 20, 15, 90],
    [55, 130, 60, 75]
])

# avg of row
# inefficent
avgs = []
for row in poke_stats:
    avg = np.mean(row)
    avgs.append(avg)
print(avgs)

[79.25, 37.5, 80.0]


In [67]:
# vectorisation
avgs_np = poke_stats.mean(axis=1)
print(avgs_np)

[79.25 37.5  80.  ]


## 1.10 Writing better loops

Note, some loops here can be eliminated, but are here for demonstration purposes.      

1) Understand what is being done with each loop iteration      
2) Move 1-time calcuations outside (above) the loop     
3) Use holistic conversion outside (below) loop (use a map())      
4) anything that is done once should be outside the loop.      



In [68]:
names = ["Absol", "Aron", "Jynx", "Natu", "Onix"]
attacks = np.array([130, 70, 50, 50, 45])

# want name of pokemon with attack > avg of all attack

# loop
for pokemon, attack in zip(names, attacks):
    total_attack_avg = attacks.mean() # bad, should move out)
    if attack > total_attack_avg:
        print(
        "{}'s attack: {} > average: {}!"
        .format(pokemon, attack, total_attack_avg)
        )

Absol's attack: 130 > average: 69.0!
Aron's attack: 70 > average: 69.0!


In [69]:
# not-so-holistic conversions
names = ["Pikachu", "Squirtle", "Articuno"]
legend_status = [False, False, True]
generations = [1,1,1]

poke_data = []
for poke_tuple in zip(names, legend_status, generations):
    poke_list = list(poke_tuple)
    poke_data.append(poke_list)
    
print(poke_data)

# holistic
poke_data_tuples = []
for poke_tuple in zip(names, legend_status, generations):
    poke_data_tuples.append(poke_tuple)
poke_data = [*map(list, poke_data_tuples)]
print(poke_data)

[['Pikachu', False, 1], ['Squirtle', False, 1], ['Articuno', True, 1]]
[['Pikachu', False, 1], ['Squirtle', False, 1], ['Articuno', True, 1]]


## 1.11 pandas iteration

1) .iterrows()      
Use test_df.iterrows(): create list of tuple (index, Series_row_data)     

2) .itertuples()     
often more efficient than .iterrows().     


In [70]:
baseball_df = pd.read_csv("../python_basics/data/baseball_stats.csv")
baseball_df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424


In [71]:
baseball_df_copy = baseball_df.copy()

In [72]:
# calculate win percent
def calc_win_perc(wins, games_played):
    win_perc = wins/games_played
    return np.round(win_perc,2)

In [73]:
# adding win perc to DataFrame

#inefficient
win_perc_list = []

for i in range(len(baseball_df_copy)):
    row = baseball_df_copy.iloc[i]
    wins = row["W"]
    games_played = row["G"]
    win_perc = calc_win_perc(wins, games_played)
    win_perc_list.append(win_perc)
    
baseball_df_copy["WP"]=win_perc_list
baseball_df_copy.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,0.58
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,0.57
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,0.43
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,0.38


In [74]:
baseball_df_copy1 = baseball_df.copy()

# using iterrows() to iterate DataFrame
win_perc_list = []
# iterrows return each DF row as a tuple of (index, Series)
for i, row in baseball_df_copy1.iterrows():
    wins = row["W"]
    games_played = row["G"]
    win_perc = calc_win_perc(wins, games_played)
    win_perc_list.append(win_perc)
    
baseball_df_copy1["WP"] = win_perc_list
baseball_df_copy1.head()    

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,0.5
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,0.58
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,0.57
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,0.43
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,0.38


In [None]:
# using .itertuples
baseball_df_copy2 = baseball_df.copy()

# behaves like tuple but have fields accessible using attribute lookup
for row_namedtuple in baseball_df_copy2.itertuples():
    print(row_namedtuple)

In [76]:
print(row_namedtuple.Index)
print(row_namedtuple.Team)

1231
WSA


## 1.12 pandas alternative to looping

use the .apply() method.      
This func acts like .map(), but apply to entire dataframe. (axis = 0 for col, 1 for row)     
Can be used with lambda func.

In [77]:
baseball_df_copy3 = baseball_df.copy()

In [78]:
def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

In [79]:
run_diffs_apply = baseball_df_copy3.apply(lambda row: calc_run_diff(row["RS"], row["RA"]), axis=1)
baseball_df_copy3["RD"] = run_diffs_apply
baseball_df_copy3

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RD
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,46
1,ATL,NL,2012,700,600,94,0.320,0.389,0.247,1,4.0,5.0,162,0.306,0.378,100
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,7
3,BOS,AL,2012,734,806,69,0.315,0.415,0.260,0,,,162,0.331,0.428,-72
4,CHC,NL,2012,613,759,61,0.302,0.378,0.240,0,,,162,0.335,0.424,-146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227,PHI,NL,1962,705,759,81,0.330,0.390,0.260,0,,,161,,,-54
1228,PIT,NL,1962,706,626,93,0.321,0.394,0.268,0,,,161,,,80
1229,SFG,NL,1962,878,690,103,0.341,0.441,0.278,1,1.0,2.0,165,,,188
1230,STL,NL,1962,774,664,84,0.335,0.394,0.271,0,,,163,,,110


In [80]:
baseball_df_copy4 = baseball_df.copy()

In [81]:
# grab any col as np array
wins_np = baseball_df_copy4["W"].values
print(type(wins_np))

<class 'numpy.ndarray'>


In [82]:
print(wins_np)

[ 81  94  93 ... 103  84  60]


In [83]:
# vectorisation
baseball_df_copy4["RS"].values - baseball_df_copy4["RA"].values

array([  46,  100,    7, ...,  188,  110, -117])

In [84]:
run_diffs_np = baseball_df_copy4["RS"].values - baseball_df_copy4["RA"].values
baseball_df_copy4["RD"] = run_diffs_np
print(baseball_df_copy4.head())

  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG   RD  
0           NaN  162  0.317  0.415   46  
1           5.0  162  0.306  0.378  100  
2           4.0  162  0.315  0.403    7  
3           NaN  162  0.331  0.428  -72  
4           NaN  162  0.335  0.424 -146  


# 2. Unit Testing

Suppose we wrote a function in python, how do we know the implementation is correct ?    
Easiest way is to open an interpreter, test the function on a few arguments and check whether the return value is correct.     

This is actually not very efficient.       

One example below to test the function

In [2]:
# sample function for a two-col data
# func return None if has missing area or tab separator
def row_to_list(row):
    row = row.rstrip("\n")
    separated_entries = row.split("\t")
    if len(separated_entries) == 2 and "" not in separated_entries:
        return separated_entries
    return None

In [3]:
row_to_list("2,081\t314,942\n")

['2,081', '314,942']

In [5]:
row_to_list("\t293,410\n")

There are many python library for writing unit tests such as pytest, unittest, nosetests and doctest.    
We use pytest here becasue it is easy to use, most popular and has all essential features.     

## 2.0 Unit test and mini-project outline/pipeline

In this section, we will have raw (tabular) data on housing area and price, and have to create a cleaned data using some func. These cleaned data is then used compute features. Then they are used to create predictive module.      

i.e.      
raw data -> clean-up (func) -> clean data -> create feature (func) -> feature -> modeling (func) -> predictive model ---> housing area/price.    

What is a unit?     
Small, independent piece of code, can be func or class.     

Integration test checks multiple units at the same time, and not independently. (e.g. input raw data, check feature)      
End to end test checks the whole pipeline at once. 

## 2.1 Unit testing basics

(source code for this part is in ../python_basics/DataCamp_UnitTest/src/data/test_row_to_list.py)

To start unit test with pytest (for the above func), we will first create a file called test_row_to_list.py     
When pythest sees  a filename start with "test_", it understands that this is not an usual python file, for a speical one containing unit test.     

Files holding unit tests are also called test modules, and we just created our first test module.    
In the test module "test_row_to_list.py", we first import pytest, then import func undertest.    

A unit test is written as a python func, whose name starts with a "test_", just like the test module. So pythest can tell this is a unit test and not a ordinary python func. The unit test usually corresponds to exactly one entry in the argument and return value table for row_to_list(). The unit test checks whether row_to_list() has the expected return value when called on this particular argument.    

To run the test module, run "pytest test_row_to_list.py" (under conda env due to package availablity).      

The unit tests script also serve as documentation. If a collaborator didn't know this func purpose, they can recreate the argument <--> return value table, by looking at the bool expression used in the assert statements. The table will give then a good hint about what the func does.     

Unit test also increase trust in a package, as users can run the unit tests and verify that the func work. We can setup a Continuous Integration or IC to run all unit test when any code is pushed to source repository. If any unit test failed, it reject the changes and informs the devs.

assert statement can take a second argument called message:     

assert boolean_expression, message      

The message is only printed when the assert statement raises an assertion error.    

It is recommended to include a message with assertions because of readibility. 

In [3]:
assert 1 == 2, "One is not equal to two!"

AssertionError: One is not equal to two!

Beware of float return values:      

Due to memory and rounding, we should not use the usual way to compare floats in the assertion statement.    

We should use the pytest.approx() to wrapp expected return values.

In [4]:
0.1 + 0.1 +0.1 == 0.3

False

In [5]:
#usual way
assert 0.1 + 0.1 +0.1 == 0.3 , "Usual way"

AssertionError: Usual way

In [8]:
# do this for float
assert 0.1 + 0.1 + 0.1 == pytest.approx(0.3)
# empty output

In [9]:
# also works on np array
assert np.array([0.1+0.1, 0.1+0.1+0.1]) == pytest.approx(np.array([0.2,0.3]))

So far we have used assert to check if a func returns the expected value. Some func may not return anythingm but rather raise an exception when called on certain arg.     

We can also test whether this func raises ValueError when 1d array is used as input.

In [2]:
def split_into_training_and_testing_sets(data_array):
    dim = data_array.ndim
    if dim != 2:
        raise ValueError("Argument data_array must be two dimensional. Got {0} dimensional array instead!".format(dim))
    num_rows = data_array.shape[0]
    if num_rows < 2:
        raise ValueError("Argument data_array must have at least 2 rows, it actually has just {0}".format(num_rows))
    num_training = int(0.75 * data_array.shape[0])
    permuted_indices = np.random.permutation(data_array.shape[0])
    return data_array[permuted_indices[:num_training], :], data_array[permuted_indices[num_training:], :]

In [3]:
# input must be 2d, if not raise a ValueError
example_argument = np.array([[2081, 314942],
                            [1059, 186606],
                            [1148, 206186],
                            ]
                           )

split_into_training_and_testing_sets(example_argument)

(array([[  2081, 314942],
        [  1059, 186606]]), array([[  1148, 206186]]))

In [4]:
# input must be 2d, if not raise a ValueError
example_argument_1d = np.array([2081,314942, 1059, 186606, 1148, 206186])

split_into_training_and_testing_sets(example_argument_1d)

ValueError: Argument data_array must be two dimensional. Got 1 dimensional array instead!

In [None]:
# context manager
# Do not run

# context manager runs some code before entering and exiting the context

with context_manager:
    # <-- run some code on entering context
    print("This is part of the context")
    #<--- run some code on exiting context

with pytest.raises(ValueError): #context manager
    # <- Does nothing on entering the context
    print("This is part of the context")
    # <- If context raised ValueError, silence it
    # <- if the context did not raise ValueError, raise an exception

In [6]:
with pytest.raises(ValueError):
    raise ValueError # context exits with ValueError
    # <--- pytest.raises(ValueError) silences it

In [7]:
with pytest.raises(ValueError):
    pass # context exits without raising a ValueError
    # <-- pytest.raises(ValueError) raises Failed

Failed: DID NOT RAISE <class 'ValueError'>

How many tests shoud we write for a func ?     
Best practice is to pick a few from each of the categories:     
1) Bad argument
2) Special argument
3) Normal argument

If a func is tested in all these category, it can be considered as well-tested func.    

For the example of split_into_training_and_testing_sets() function, examples:      
1) Bad argument     
Bad arguments are arguments for which the func raises an exception instead of returning a value.     
1 dimension array is a bad argument.      

2) Special argument     
2 types:    
Boundary values    
For some argument values, func uses speical logic       

3) Normal argument    
Anything that is not Bad or Speical argument.       

Caveat:    
Not all func have bad or speical arguments, in this case, just ignore those category. 

**Test Driven Development (TDD):**     

step 1: Write unit tests and fix requirements     
As we write unit test, we think more about the requirement of this func.    

step 2: run tests and watch it fail      
since func has not exist yet.

step 3: implement func and run test again

## 2.2 Organize set of tests

My each corresponding my_module.py, there should be a corresponding test_my_module.py     
So while we have a folder called src (for source code), there will be a folder called test that mirror its structure.    

We can use a construc called test class to organise the different test class.     

We can run all test by go to the folder and type "pytest", it will recurse into directory subtree and run all tests:   
Filenames starting with test_    
within it identifies classname starting with Test     
within it identifies func name start with test_      

pytest -x: stop after first failure.    

pytest file_path_to_test_module: also works     

During testing, pytest assign a node ID to every test class and unit test that it encounters:    
Node ID of a test class: (path to test module)::(test class name)     
Node ID of an unit test: (path to test module)::(test class name)::(unit test name)        

pytest NodeID: only run the specific tests       

keyword expression     
pytest -k "pattern": run all tests whose node ID matches the pattern     
Python logical operator also works in keyword expression e.g. not in etc.      



## 2.3 Expected to fail

Sometimes we know certain test is gonna fail (e.g. actual func not yet implemented, but we wrote some test cases). we can use "xfail" decorator. The decorator goes on top of a test, and it starts with @:    
@pytest.mark.xfail    

This will make the test suite still green.    

we can also use     
@pytest.mark.skipif(bool_exp)    
if true, this test is skipped     

When integrating with github, use Travis CI, and create a travis.ymml file, and also codecov can show how much percent the code (no. of lines) are tested.     



In [None]:
# for illustration, do not run
class TestTrainModel(object):
    @pytest.mark.xfail(reason="Using TDD, func is not implemented")
    def test_on_linear_data(self):
        ...
        
        
    

In [None]:
# for illustration, do not run
class TestTrainModel(object):
    @pytest.mark.skipif(sys.version_info > (2,7), reason="requires python 2.7")
    def test_on_linear_data(self):
        """Only runs on python 2.7 or lower"""
        ...
        
    

## 2.3 Advanced unit testing

There are some functions whose tests require more than assert statement. One example is shown below. The func first apply row_to_list to rows then convert to int. These will filter out bad data and wrote the result to the clean file.    
preprocess() needs a raw datafile in the enviroment to work properly. When we call the func, it modifies the environment by creating a clean data file.     

We can create a test for this func called test_on_raw_data().    

Need to setup an env for the assert, then teardown to restore to a clean state env for the next test    
So the new work flow goes like:      
setup -> assert -> teardown      

In pytest, the setup and teardown is placed outside the test, in a function called a fixture.     
A fixture is a function which has the pytest.fixure decorator.     
The first section is the setup. Then the func return the data that the test needs. The test can access this data by calling the fixure passed as an argument.

In [None]:
def preprocess(raw_data_file_path, clean_data_file_path):
    with open(raw_data_file_path, "r") as input_file:
        rows = input_file.readlines()
    with open(clean_data_file_path, "w") as output_file:
        for row in rows:
            row_as_list = row_to_list(row)
            if row_as_list is None:
                continue
            area = convert_to_int(row_as_list[0])
            price = convert_to_int(row_as_list[1])
            if area is None or price is None:
                continue
            output_file.write("{0}\t{1}\n".format(area, price))

In [None]:
def test_on_raw_data():
    #setup: create the raw data file
    # bring the env to a state where testing can begin
    preprocess(raw_data_path, clean_data_file_path)
    with open(clean_data_file_path) as f:
        lines = f.readlines()
        first_line = lines[0]
        assert first_line == "1801\t201411\n"
        second_line = lines[1]
        assert second_line == "2002\t333209\n"
        # Teardown: remove row and clean data file

In [None]:
# pytest.fixure structure

@pytest.fixure

def my_fixure():
    # Do setup here
    yield data 
    # Do teardown here
    
    
def test_something(my_fixure):
    ...
    data = my_fixure
    ...

In [None]:
# e.g.

#Fixure
@pytest.fixture
def raw_and_clean_data_file():
    raw_data_file_path = "raw.txt"
    clean_data_file_path = "clean.txt"
    with open("raw.txt", "w") as f:
        f.write("1,801\t201,411\n"
               "1,767565,112\n"
               "2,002\t333,209\n"
               "1990\t7822,911\n"
               "1,285\t389129\n")
    yield raw_data_file_path, clean_data_file_path
    #teardown
    os.remove(raw_data_file_path)
    os.remove(clean_data_file_path)
    
    
#test
def test_on_raw_data(row_and_clean_data_file):
    raw_path, clean_path = raw_and_clean_data_file
    preprocess(raw_path, clean_path)
    with open(clean_data_file_path) as f:
        lines = f.readlines()
        first_line = lines[0]
        assert first_line == "1801\t201411\n"
        second_line = lines[1]
        assert second_line == "2002\t333209\n"

In [None]:
# we can also use tmpdir
@pytest.fixture
def raw_and_clean_data_file(tmpdir):
    raw_data_file_path = tempdir.join("raw.txt")
    clean_data_file_path = tmpdir.join("clean.txt")
    with open("raw.txt", "w") as f:
        f.write("1,801\t201,411\n"
               "1,767565,112\n"
               "2,002\t333,209\n"
               "1990\t7822,911\n"
               "1,285\t389129\n")
    yield raw_data_file_path, clean_data_file_path
    #no teardown needed
