# Unit testing for data science

Python unit testing libraries:
- pytest
- unittest
- nose (run with `nosetests`)
- doctest

Step 1:
- create test_row_to_list.py
- the `test_` prefix indicates that the file contains unit tests
Step 2:
- import pytest
- import row_to_list
Step 3:
- def test_bla(): assert

In [None]:
def test_for_missing_area():
    """Check that row_to_list() returns None for a row without a tab."""
    result = row_to_list("ksfjsjf\n")
    assert result is None

In [4]:
import pytest

def example():
    """Return the constant 2; a trivial target for a first unit test."""
    value = 2
    return value

def test_example():
    """Verify that example() returns 2."""
    result = example()
    assert result == 2

platform win32 -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
rootdir: c:\Users\Lenovo\Documents\DataCamp
collected 0 items



ERROR: file or directory not found: test_example()



In [6]:
!pytest test_example.py

platform win32 -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
rootdir: c:\Users\Lenovo\Documents\DataCamp
collected 1 item

test_example.py .                                                        [100%]



## Mastering assert statements

In [1]:
assert 1==2, "1 different from 2"

AssertionError: 1 different from 2

In [3]:
# Don't use exact equality for floats: the sum below is not exactly equal
# to .3, so this assertion fails.

assert .1 + .1 + .1 == .3, "c'est pas égal !"

AssertionError: c'est pas égal !

In [5]:
# Use pytest.approx() instead: it compares floats within a tolerance
import pytest

assert .1 + .1 + .1 == pytest.approx(.3), "approximation"

In [6]:
# pytest.approx() also works with NumPy arrays
import numpy as np

assert np.array([.1 + .1, .1 + .1]) == pytest.approx(np.array([.2, .2]))

### Multiple assertions in one unit test

In [9]:
def convert_to_int(num):
    """Convert a comma-grouped number string such as "2,081" to an int.

    Every "," in ``num`` is removed and the remainder is passed to ``int()``.
    """
    without_commas = "".join(num.split(","))
    return int(without_commas)

In [10]:
def test_on_string_with_one_comma():
    """Check that convert_to_int("2,081") returns the int 2081.

    Two assertions run in order: the return type is checked first, then the
    value. The test passes only if both hold.
    """
    test_argument = "2,081"
    expected = 2081
    actual = convert_to_int(test_argument)
    message = "convert_to_int('2,081') should return the int 2081, but it actually returned {0}".format(actual)
    # Check the value that was actually returned (`actual`); referring to any
    # other name here would raise NameError before the assertion is evaluated.
    assert isinstance(actual, int)
    assert actual == expected, message

## Testing for exceptions

In [2]:
import pytest

In [3]:
with pytest.raises(ValueError):
    # <--- Entering the context does nothing
    print("This is part of the context")
    # <--- On exit: if the block raised ValueError, pytest.raises silences it
    # <--- On exit: if the block did not raise ValueError, pytest.raises
    #      raises an exception itself (shown as "Failed: DID NOT RAISE")

This is part of the context


Failed: DID NOT RAISE <class 'ValueError'>

In [4]:
with pytest.raises(ValueError):
    raise ValueError  # context exits with a ValueError
    # <--- pytest.raises(ValueError) silences it

In [5]:
with pytest.raises(ValueError):
    pass   # context exits without raising a ValueError
# <--- pytest.raises(ValueError) raised Failed

Failed: DID NOT RAISE <class 'ValueError'>

In [17]:
import numpy as np

def test_on_numpy():
    """Capture and print the TypeError raised by np.zeros() for a float dimension."""
    shape = (1.4, 2)
    with pytest.raises(TypeError) as exc_info:
        np.zeros(shape)
    # Printing the captured object shows the exception type and its message
    print(exc_info)

In [19]:
test_on_numpy()

<ExceptionInfo TypeError("'float' object cannot be interpreted as an integer") tblen=1>


In [22]:
def test_on_numpy2():
    """Match the message of the TypeError raised by np.zeros() against a pattern."""
    shape = (1.4, 2)
    with pytest.raises(TypeError) as exc_info:
        np.zeros(shape)
    # This pattern does not occur in the real error message, so the match
    # fails and demonstrates what a message mismatch looks like.
    pattern = "hehe boi"
    assert exc_info.match(pattern)

In [23]:
test_on_numpy2()

AssertionError: Regex pattern 'hehe boi' does not match "'float' object cannot be interpreted as an integer".

### The well tested function

Test for these argument types
- Bad arguments  -> raises an exception
- Special arguments  -> boundary values, special logic
- Normal arguments (at least 2 or 3)

In [None]:
def test_on_one_tab_with_missing_value():    # (1, 1) boundary value
    """Check that a row with one tab but an empty first field gives None."""
    result = row_to_list("\t4,567\n")
    # The failure message reports what was returned instead of None
    assert result is None, "Expected: None, Actual: {0}".format(result)

In [None]:
# Normal arguments example

import pytest
from preprocessing_helpers import row_to_list

def test_on_normal_argument_1():
    """Check row_to_list() on a well-formed row with a single tab separator."""
    row = "123\t4,567\n"
    # Expected return value for the argument "123\t4,567\n"
    expected = ["123", "4,567"]
    actual = row_to_list(row)
    assert actual == expected, "Expected: {0}, Actual: {1}".format(expected, actual)
    
def test_on_normal_argument_2():
    """Check row_to_list() on a second well-formed row with comma-grouped fields."""
    row = "1,059\t186,606\n"
    expected = ["1,059", "186,606"]
    actual = row_to_list(row)
    # On failure, report both the expected and the actual value
    assert actual == expected, "Expected: {0}, Actual: {1}".format(expected, actual)

### Test Driven Development (TDD)

- Write unit tests before implementation
    - Unit tests cannot be deprioritized
    - Time for writing unit tests factored in implementation time
    - Requirements are clearer and implementation easier

## How to organize a growing set of tests ?

In [None]:
# Test class
import pytest
from data.preprocessing_helpers import row_to_list, convert_to_int

class TestRowToList:
    """Test class grouping the row_to_list() tests; bodies are placeholders."""

    def test_on_no_tab_no_missing_value(self):
        """Placeholder: no assertions yet."""

    def test_on_two_tabs_no_missing_value(self):
        """Placeholder: no assertions yet."""

class TestConvertToInt:
    """Test class grouping the convert_to_int() tests; bodies are placeholders."""

    def test_with_no_comma(self):
        """Placeholder: no assertions yet."""

    def test_with_one_comma(self):
        """Placeholder: no assertions yet."""

### Final test directory structure

![final test directory](tets_dir.png)

### Running all tests

In [1]:
# cd tests
# pytest

- recurses into directory subtree of tests/
    - filename starting with test_ -> test module
        - classname starting with Test -> test class
            - function names starting with test_ -> unit test

In [2]:
# pytest -x  -> stops after the first failing test

### Node ID

In [None]:
# if I want to run specific tests

# Node ID of a test class
# <path to test module>::<test class name>

# Node ID of a unit test
# <path to test module>::<test class name>::<unit test name>

#  ~~~~~~~~~~~~~ example ~~~~~~~~~~~~~~~~~~
# ~$ pytest tests/data/test_preprocessing_helpers.py::TestRowToList::test_on_one_tab_with_missing_value

### pytest -k

In [None]:
# pytest -k "TestSplit and not test_on_one_row"

# "TestSplit" does not have to be the full class name: as long as it is unique, pytest matches it against the full name


## Expected failures and conditional skipping

### xfail: marking tests as expecting to fail

In [None]:
import pytest

class TestTrainModel:
    """Test class whose only test is marked as an expected failure."""

    @pytest.mark.xfail(reason="Using TDD, train_model() is not implemented")
    def test_on_linear_data(self):
        """Placeholder body; the xfail marker carries the reason."""

### skipif: skipping tests conditionally

In [None]:
# Skip the test on Python versions newer than 2.7

import sys

class TestConvertToInt(object):
    """Test class demonstrating a conditional skip based on the Python version."""

    # Compare only (major, minor). The full version_info of a 2.7.x release,
    # e.g. (2, 7, 18, 'final', 0), is a longer tuple and therefore already
    # compares greater than (2, 7), which would skip the test on 2.7 itself.
    @pytest.mark.skipif(sys.version_info[:2] > (2, 7), reason="Works only on Python 2.7 or lower")
    def test_with_no_comma(self):
        """Only runs on python 2.7 or lower"""
        pass

### Showing reason for both skipped and xfail

pytest -rsx

### Continuous integration and code coverage

For continuous integration, create a configuration file

In [None]:
# repository root
#|-- src
#|-- tests
#|-- .travis.yml

In [None]:
# Contents of .travis.yml

"""
language: python
python:
    - "3.6"
install:
    - pip install -e .
    - pip install pytest-cov codecov  # Install packages for code coverage report
script:
    - pytest tests         # Replace with line below if using code coverage
    - pytest --cov=src tests  # Point to the source directory.
after_success:
    - codecov              # uploads report to codecov.io
"""

In [None]:
# Push the file to github

# git add .travis.yml
# git push origin master

In [None]:
# install the travis CI app (free for public repos) from github
# install codecov.io in github marketplace

With all this new knowledge, let's try to create a dummy project with libraries, unit tests and continuous integration!

## Beyond assertion: setup and teardown

### Fixture

In [None]:
import pytest

@pytest.fixture
def my_fixture():
    """Fixture skeleton: set up, hand a value to the test, then tear down."""
    # Do setup here
    data = None  # placeholder so the skeleton runs; replace with the real setup result
    yield data   # a test that takes `my_fixture` as an argument receives this value
    # Do teardown here

def test_something(my_fixture):
    """Skeleton test: bind the value passed in through the `my_fixture` argument."""
    fixture_value = my_fixture

In [None]:
# Example
import os

@pytest.fixture
def raw_and_clean_data_file():
    """Write ``raw.txt`` before the test and delete both data files after it.

    Yields:
        A ``(raw path, clean path)`` tuple of file names. Only the raw file
        is written here; teardown removes both, so the clean file must
        exist by the time the test finishes.
    """
    raw_path, clean_path = "raw.txt", "clean.txt"
    # Setup: three rows, the middle one has no tab separator
    with open(raw_path, "w") as raw_file:
        raw_file.write(
            "1,801\t201,411\n"
            "1,767565,112\n"
            "2,002\t333,209\n"
        )
    yield raw_path, clean_path
    # Teardown: remove the files in the same order they were named
    for path in (raw_path, clean_path):
        os.remove(path)

def test_on_raw_data(raw_and_clean_data_file):
    """Run preprocess() on the fixture's raw file and check the clean output.

    The fixture yields the raw and clean file paths. After preprocess() has
    run, the first two lines of the clean file are compared with the two
    expected rows.
    """
    raw_path, clean_path = raw_and_clean_data_file
    preprocess(raw_path, clean_path)
    # Read back from the path unpacked above; the fixture's own local
    # variable names are not visible inside this function.
    with open(clean_path) as f:
        lines = f.readlines()
    first_line = lines[0]
    assert first_line == "1,801\t201,411\n"
    second_line = lines[1]
    assert second_line == "2,002\t333,209\n"

### tmpdir and fixture chaining

In [None]:
@pytest.fixture
def raw_and_clean_data_file(tmpdir):
    """Write ``raw.txt`` inside ``tmpdir`` and yield the raw and clean paths.

    This fixture is chained onto the ``tmpdir`` fixture: both paths are
    built with ``tmpdir.join()``, and only the raw file is written here.
    """
    raw_path = tmpdir.join("raw.txt")
    clean_path = tmpdir.join("clean.txt")
    # Setup: three rows, the middle one has no tab separator
    with open(raw_path, "w") as raw_file:
        raw_file.write(
            "1,801\t201,411\n"
            "1,767565,112\n"
            "2,002\t333,209\n"
        )
    yield raw_path, clean_path
    # No teardown code necessary

### Mocking

In [None]:
# Add the correct argument to use the mocking fixture in this test
def test_on_raw_data(self, raw_and_clean_data_file, mocker):
    """Test preprocess() with its convert_to_int dependency replaced by a mock.

    The ``mocker`` argument is the mocking fixture. The mock delegates to
    ``convert_to_int_bug_free`` through ``side_effect``, so preprocess() is
    tested independently of bugs in the real dependency.
    """
    # Local import so the block does not depend on a file-level import of `call`
    from unittest.mock import call

    raw_path, clean_path = raw_and_clean_data_file
    # Replace the dependency with the bug-free mock
    convert_to_int_mock = mocker.patch("data.preprocessing_helpers.convert_to_int",
                                       side_effect=convert_to_int_bug_free)
    preprocess(raw_path, clean_path)
    # Check if preprocess() called the dependency correctly
    # NOTE(review): this list implies four well-formed rows in the raw file;
    # confirm it matches the fixture in use.
    expected_calls = [
        call("1,801"), call("201,411"),
        call("2,002"), call("333,209"),
        call("1990"), call("782,911"),
        call("1,285"), call("389129"),
    ]
    assert convert_to_int_mock.call_args_list == expected_calls
    with open(clean_path, "r") as f:
        lines = f.readlines()
    # Lines read from the file contain a real tab and newline, so the expected
    # strings must use "\t" and "\n", not escaped backslashes.
    first_line = lines[0]
    assert first_line == "1801\t201411\n"
    second_line = lines[1]
    assert second_line == "2002\t333209\n"

### Testing plots