In [1]:
#Crafting a docstring

#You've decided to write the world's greatest open-source natural language processing Python package. It will revolutionize
#working with free-form text, the way numpy did for arrays, pandas did for tabular data, and scikit-learn did for machine
#learning.

#The first function you write is count_letter(). It takes a string and a single letter and returns the number of times the
#letter appears in the string. You want the users of your open-source package to be able to understand how this function works
#easily, so you will need to give it a docstring. Build up a Google Style docstring for this function by following these steps.

def count_letter(content, letter):
    """Count the number of times `letter` appears in `content`.
    
    Args:
        content (str): The string to search.
        letter (str): The letter to search for.
    
    Returns:
        int
    
    # Add a section detailing what errors might be raised
    Raises:
        ValueError: If `letter` is not a one-character string.
    """
    if (not isinstance(letter, str)) or len(letter) != 1:
        raise ValueError('`letter` must be a single character string.')
    return len([char for char in content if char == letter])

In [2]:
count_letter('holaa', 'a')

2

In [3]:
#Retrieving docstrings

#You and a group of friends are working on building an amazing new Python IDE (integrated development environment -- like
#PyCharm, Spyder, Eclipse, Visual Studio, etc.). The team wants to add a feature that displays a tooltip with a function's
#docstring whenever the user starts typing the function name. That way, the user doesn't have to go elsewhere to look up the
#documentation for the function they are trying to use. You've been asked to complete the build_tooltip() function that
#retrieves a docstring from an arbitrary function.

#You will be reusing the count_letter() function that you developed in the last exercise to show that we can properly extract
#its docstring.

# Get the "count_letter" docstring by using an attribute of the function
docstring = count_letter.__doc__

border = '#' * 28
print('{}\n{}\n{}'.format(border, docstring, border))

############################
Count the number of times `letter` appears in `content`.
    
    Args:
        content (str): The string to search.
        letter (str): The letter to search for.
    
    Returns:
        int
    
    # Add a section detailing what errors might be raised
    Raises:
        ValueError: If `letter` is not a one-character string.
    
############################


In [4]:
import inspect

# Inspect the count_letter() function to get its docstring
docstring = inspect.getdoc(count_letter)

border = '#' * 28
print('{}\n{}\n{}'.format(border, docstring, border))

############################
Count the number of times `letter` appears in `content`.

Args:
    content (str): The string to search.
    letter (str): The letter to search for.

Returns:
    int

# Add a section detailing what errors might be raised
Raises:
    ValueError: If `letter` is not a one-character string.
############################


In [5]:
import inspect

def build_tooltip(function):
    """Create a tooltip for any function that shows the
    function's docstring.

    Args:
        function (callable): The function we want a tooltip for.

    Returns:
        str
    """
    # Get the docstring for the "function" argument by using inspect
    docstring = inspect.getdoc(function)
    border = '#' * 28
    return '{}\n{}\n{}'.format(border, docstring, border)

print(build_tooltip(count_letter))
print(build_tooltip(range))
print(build_tooltip(print))

############################
Count the number of times `letter` appears in `content`.

Args:
    content (str): The string to search.
    letter (str): The letter to search for.

Returns:
    int

# Add a section detailing what errors might be raised
Raises:
    ValueError: If `letter` is not a one-character string.
############################
############################
range(stop) -> range object
range(start, stop[, step]) -> range object

Return an object that produces a sequence of integers from start (inclusive)
to stop (exclusive) by step.  range(i, j) produces i, i+1, i+2, ..., j-1.
start defaults to 0, and stop is omitted!  range(4) produces 0, 1, 2, 3.
These are exactly the valid indices for a list of 4 elements.
When step is given, it specifies the increment (or decrement).
############################
############################
print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)

Prints the values to a stream, or to sys.stdout by default.
Optional keyword 

In [6]:
#Docstrings to the rescue!

#Some maniac has corrupted your installation of numpy! All of the functions still exist, but they've been given random names.
#You desperately need to call the numpy.histogram() function and you don't have time to reinstall the package. Fortunately for
#you, the maniac didn't think to alter the docstrings, and you know how to access them. numpy has a lot of functions in it, so
#we've narrowed it down to four possible functions that could be numpy.histogram() in disguise: numpy.leyud(), numpy.uqka(),
#numpy.fywdkxa() or numpy.jinzyxq().

#Examine each of these functions' docstrings in the IPython shell to determine which of them is actually numpy.histogram().

#numpy.leyud.__doc__
#"\n    Gives a new shape to an array without changing its data.\n\n    Parameters\n    ----------\n    a : array_like\n        Array to be reshaped.\n    newshape : int or tuple of ints\n        The new shape should be compatible with the original shape. If\n        an integer, then the result will be a 1-D array of that length.\n        One shape dimension can be -1. In this case, the value is\n        inferred from the length of the array and remaining dimensions.\n    order : {'C', 'F', 'A'}, optional\n        Read the elements of `a` using this index order, and place the\n        elements into the reshaped array using this index order.  'C'\n        means to read / write the elements using C-like index order,\n        with the last axis index changing fastest, back to the first\n        axis index changing slowest. 'F' means to read / write the\n        elements using Fortran-like index order, with the first index\n        changing fastest, and the last index changing slowest. Note that\n        the 'C' and 'F' options take no account of the memory layout of\n        the underlying array, and only refer to the order of indexing.\n        'A' means to read / write the elements in Fortran-like index\n        order if `a` is Fortran *contiguous* in memory, C-like order\n        otherwise.\n\n    Returns\n    -------\n    reshaped_array : ndarray\n        This will be a new view object if possible; otherwise, it will\n        be a copy.  Note there is no guarantee of the *memory layout* (C- or\n        Fortran- contiguous) of the returned array.\n\n    See Also\n    --------\n    ndarray.reshape : Equivalent method.\n\n    Notes\n    -----\n    It is not always possible to change the shape of an array without\n    copying the data. If you want an error to be raised when the data is copied,\n    you should assign the new shape to the shape attribute of the array::\n\n     >>> a = np.zeros((10, 2))\n     # A transpose makes the array non-contiguous\n     >>> b = a.T\n     # Taking a view makes it possible to modify the shape without modifying\n     # the initial object.\n     >>> c = b.view()\n     >>> c.shape = (20)\n     AttributeError: incompatible shape for a non-contiguous array\n\n    The `order` keyword gives the index ordering both for *fetching* the values\n    from `a`, and then *placing* the values into the output array.\n    For example, let's say you have an array:\n\n    >>> a = np.arange(6).reshape((3, 2))\n    >>> a\n    array([[0, 1],\n           [2, 3],\n           [4, 5]])\n\n    You can think of reshaping as first raveling the array (using the given\n    index order), then inserting the elements from the raveled array into the\n    new array using the same kind of index ordering as was used for the\n    raveling.\n\n    >>> np.reshape(a, (2, 3)) # C-like index ordering\n    array([[0, 1, 2],\n           [3, 4, 5]])\n    >>> np.reshape(np.ravel(a), (2, 3)) # equivalent to C ravel then C reshape\n    array([[0, 1, 2],\n           [3, 4, 5]])\n    >>> np.reshape(a, (2, 3), order='F') # Fortran-like index ordering\n    array([[0, 4, 3],\n           [2, 1, 5]])\n    >>> np.reshape(np.ravel(a, order='F'), (2, 3), order='F')\n    array([[0, 4, 3],\n           [2, 1, 5]])\n\n    Examples\n    --------\n    >>> a = np.array([[1,2,3], [4,5,6]])\n    >>> np.reshape(a, 6)\n    array([1, 2, 3, 4, 5, 6])\n    >>> np.reshape(a, 6, order='F')\n    array([1, 4, 2, 5, 3, 6])\n\n    >>> np.reshape(a, (3,-1))       # the unspecified value is inferred to be 2\n    array([[1, 2],\n           [3, 4],\n           [5, 6]])\n    "

#numpy.uqka.__doc__
#"\n    Returns the indices that would sort an array.\n\n    Perform an indirect sort along the given axis using the algorithm specified\n    by the `kind` keyword. It returns an array of indices of the same shape as\n    `a` that index data along the given axis in sorted order.\n\n    Parameters\n    ----------\n    a : array_like\n        Array to sort.\n    axis : int or None, optional\n        Axis along which to sort.  The default is -1 (the last axis). If None,\n        the flattened array is used.\n    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional\n        Sorting algorithm.\n    order : str or list of str, optional\n        When `a` is an array with fields defined, this argument specifies\n        which fields to compare first, second, etc.  A single field can\n        be specified as a string, and not all fields need be specified,\n        but unspecified fields will still be used, in the order in which\n        they come up in the dtype, to break ties.\n\n    Returns\n    -------\n    index_array : ndarray, int\n        Array of indices that sort `a` along the specified axis.\n        If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`.\n        More generally, ``np.take_along_axis(a, index_array, axis=a)`` always\n        yields the sorted `a`, irrespective of dimensionality.\n\n    See Also\n    --------\n    sort : Describes sorting algorithms used.\n    lexsort : Indirect stable sort with multiple keys.\n    ndarray.sort : Inplace sort.\n    argpartition : Indirect partial sort.\n\n    Notes\n    -----\n    See `sort` for notes on the different sorting algorithms.\n\n    As of NumPy 1.4.0 `argsort` works with real/complex arrays containing\n    nan values. The enhanced sort order is documented in `sort`.\n\n    Examples\n    --------\n    One dimensional array:\n\n    >>> x = np.array([3, 1, 2])\n    >>> np.argsort(x)\n    array([1, 2, 0])\n\n    Two-dimensional array:\n\n    >>> x = np.array([[0, 3], [2, 2]])\n    >>> x\n    array([[0, 3],\n           [2, 2]])\n\n    >>> np.argsort(x, axis=0)  # sorts along first axis (down)\n    array([[0, 1],\n           [1, 0]])\n\n    >>> np.argsort(x, axis=1)  # sorts along last axis (across)\n    array([[0, 1],\n           [0, 1]])\n\n    Indices of the sorted elements of a N-dimensional array:\n\n    >>> ind = np.unravel_index(np.argsort(x, axis=None), x.shape)\n    >>> ind\n    (array([0, 1, 1, 0]), array([0, 0, 1, 1]))\n    >>> x[ind]  # same as np.sort(x, axis=None)\n    array([0, 2, 2, 3])\n\n    Sorting with keys:\n\n    >>> x = np.array([(1, 0), (0, 1)], dtype=[('x', '<i4'), ('y', '<i4')])\n    >>> x\n    array([(1, 0), (0, 1)],\n          dtype=[('x', '<i4'), ('y', '<i4')])\n\n    >>> np.argsort(x, order=('x','y'))\n    array([1, 0])\n\n    >>> np.argsort(x, order=('y','x'))\n    array([0, 1])\n\n    "

#numpy.fywdkxa.__doc__
#'\n    Compute the histogram of a set of data.\n\n    Parameters\n    ----------\n    a : array_like\n        Input data. The histogram is computed over the flattened array.\n    bins : int or sequence of scalars or str, optional\n        If `bins` is an int, it defines the number of equal-width\n        bins in the given range (10, by default). If `bins` is a\n        sequence, it defines the bin edges, including the rightmost\n        edge, allowing for non-uniform bin widths.\n\n        .. versionadded:: 1.11.0\n\n        If `bins` is a string, it defines the method used to calculate the\n        optimal bin width, as defined by `histogram_bin_edges`.\n\n    range : (float, float), optional\n        The lower and upper range of the bins.  If not provided, range\n        is simply ``(a.min(), a.max())``.  Values outside the range are\n        ignored. The first element of the range must be less than or\n        equal to the second. `range` affects the automatic bin\n        computation as well. While bin width is computed to be optimal\n        based on the actual data within `range`, the bin count will fill\n        the entire range including portions containing no data.\n    normed : bool, optional\n\n        .. deprecated:: 1.6.0\n\n        This is equivalent to the `density` argument, but produces incorrect\n        results for unequal bin widths. It should not be used.\n\n        .. versionchanged:: 1.15.0\n            DeprecationWarnings are actually emitted.\n\n    weights : array_like, optional\n        An array of weights, of the same shape as `a`.  Each value in\n        `a` only contributes its associated weight towards the bin count\n        (instead of 1). If `density` is True, the weights are\n        normalized, so that the integral of the density over the range\n        remains 1.\n    density : bool, optional\n        If ``False``, the result will contain the number of samples in\n        each bin. If ``True``, the result is the value of the\n        probability *density* function at the bin, normalized such that\n        the *integral* over the range is 1. Note that the sum of the\n        histogram values will not be equal to 1 unless bins of unity\n        width are chosen; it is not a probability *mass* function.\n\n        Overrides the ``normed`` keyword if given.\n\n    Returns\n    -------\n    hist : array\n        The values of the histogram. See `density` and `weights` for a\n        description of the possible semantics.\n    bin_edges : array of dtype float\n        Return the bin edges ``(length(hist)+1)``.\n\n\n    See Also\n    --------\n    histogramdd, bincount, searchsorted, digitize, histogram_bin_edges\n\n    Notes\n    -----\n    All but the last (righthand-most) bin is half-open.  In other words,\n    if `bins` is::\n\n      [1, 2, 3, 4]\n\n    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and\n    the second ``[2, 3)``.  The last bin, however, is ``[3, 4]``, which\n    *includes* 4.\n\n\n    Examples\n    --------\n    >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])\n    (array([0, 2, 1]), array([0, 1, 2, 3]))\n    >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)\n    (array([ 0.25,  0.25,  0.25,  0.25]), array([0, 1, 2, 3, 4]))\n    >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])\n    (array([1, 4, 1]), array([0, 1, 2, 3]))\n\n    >>> a = np.arange(5)\n    >>> hist, bin_edges = np.histogram(a, density=True)\n    >>> hist\n    array([ 0.5,  0. ,  0.5,  0. ,  0. ,  0.5,  0. ,  0.5,  0. ,  0.5])\n    >>> hist.sum()\n    2.4999999999999996\n    >>> np.sum(hist * np.diff(bin_edges))\n    1.0\n\n    .. versionadded:: 1.11.0\n\n    Automated Bin Selection Methods example, using 2 peak random data\n    with 2000 points:\n\n    >>> import matplotlib.pyplot as plt\n    >>> rng = np.random.RandomState(10)  # deterministic random data\n    >>> a = np.hstack((rng.normal(size=1000),\n    ...                rng.normal(loc=5, scale=2, size=1000)))\n    >>> plt.hist(a, bins=\'auto\')  # arguments are passed to np.histogram\n    >>> plt.title("Histogram with \'auto\' bins")\n    >>> plt.show()\n\n    '

#numpy.jinzyxq.__doc__
#"\n    Return an array of zeros with the same shape and type as a given array.\n\n    Parameters\n    ----------\n    a : array_like\n        The shape and data-type of `a` define these same attributes of\n        the returned array.\n    dtype : data-type, optional\n        Overrides the data type of the result.\n\n        .. versionadded:: 1.6.0\n    order : {'C', 'F', 'A', or 'K'}, optional\n        Overrides the memory layout of the result. 'C' means C-order,\n        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,\n        'C' otherwise. 'K' means match the layout of `a` as closely\n        as possible.\n\n        .. versionadded:: 1.6.0\n    subok : bool, optional.\n        If True, then the newly created array will use the sub-class\n        type of 'a', otherwise it will be a base-class array. Defaults\n        to True.\n\n    Returns\n    -------\n    out : ndarray\n        Array of zeros with the same shape and type as `a`.\n\n    See Also\n    --------\n    empty_like : Return an empty array with shape and type of input.\n    ones_like : Return an array of ones with shape and type of input.\n    full_like : Return a new array with shape of input filled with value.\n    zeros : Return a new array setting values to zero.\n\n    Examples\n    --------\n    >>> x = np.arange(6)\n    >>> x = x.reshape((2, 3))\n    >>> x\n    array([[0, 1, 2],\n           [3, 4, 5]])\n    >>> np.zeros_like(x)\n    array([[0, 0, 0],\n           [0, 0, 0]])\n\n    >>> y = np.arange(3, dtype=float)\n    >>> y\n    array([ 0.,  1.,  2.])\n    >>> np.zeros_like(y)\n    array([ 0.,  0.,  0.])\n\n    "

#Possible Answers

#numpy.leyud()

#numpy.uqka()

#numpy.fywdkxa()*

#numpy.jinzyxq()

In [7]:
#Extract a function

import pandas as pd
df = pd.read_csv('datasets/GPAs.csv')

#While you were developing a model to predict the likelihood of a student graduating from college, you wrote this bit of code to
#get the z-scores of students' yearly GPAs. Now you're ready to turn it into a production-quality system, so you need to do
#something about the repetition. Writing a function to calculate the z-scores would improve this code.

# Standardize the GPAs for each year
#df['y1_z'] = (df.y1_gpa - df.y1_gpa.mean()) / df.y1_gpa.std()
#df['y2_z'] = (df.y2_gpa - df.y2_gpa.mean()) / df.y2_gpa.std()
#df['y3_z'] = (df.y3_gpa - df.y3_gpa.mean()) / df.y3_gpa.std()
#df['y4_z'] = (df.y4_gpa - df.y4_gpa.mean()) / df.y4_gpa.std()

#Note: df is a pandas DataFrame where each row is a student with 4 columns of yearly student GPAs: y1_gpa, y2_gpa, y3_gpa,
#y4_gpa

def standardize(column):
    """Standardize the values in a column.

    Args:
        column (pandas Series): The data to standardize.

    Returns:
        pandas Series: the values as z-scores
    """
    # Finish the function so that it returns the z-scores
    z_score = (column - column.mean()) / column.std()
    return z_score

# Use the standardize() function to calculate the z-scores
df['y1_z'] = standardize(df.y1_gpa)
df['y2_z'] = standardize(df.y2_gpa)
df['y3_z'] = standardize(df.y3_gpa)
df['y4_z'] = standardize(df.y4_gpa)

In [8]:
df

Unnamed: 0,y1_gpa,y2_gpa,y3_gpa,y4_gpa,y1_z,y2_z,y3_z,y4_z
0,2.785877,2.052513,2.170544,0.065570,0.790863,0.028021,0.172322,-1.711179
1,1.144557,2.666498,0.267098,2.884737,-0.872971,0.564636,-1.347122,0.824431
2,0.907406,0.423634,2.613459,0.030950,-1.113376,-1.395595,0.525883,-1.742317
3,2.205259,0.523580,3.984345,0.339289,0.202281,-1.308243,1.620206,-1.464991
4,2.877876,1.287922,3.077589,0.901994,0.884124,-0.640219,0.896379,-0.958885
...,...,...,...,...,...,...,...,...
95,2.766807,2.543601,3.760116,1.441695,0.771532,0.457226,1.441213,-0.473468
96,0.604510,0.128792,3.330865,0.842611,-1.420427,-1.653282,1.098559,-1.012295
97,1.595505,2.979123,3.384219,1.684800,-0.415837,0.837865,1.141150,-0.254814
98,0.963424,1.891652,0.495692,0.872142,-1.056589,-0.112568,-1.164644,-0.985734


In [9]:
#Split up a function

#Another engineer on your team has written this function to calculate the mean and median of a sorted list. You want to show
#them how to split it into two simpler functions: mean() and median()

def mean_and_median(values):
    """Get the mean and median of a sorted list of `values`

    Args:
        values (iterable of float): A list of numbers

    Returns:
        tuple (float, float): The mean and median
    """
    mean = sum(values) / len(values)
    midpoint = int(len(values) / 2)
    if len(values) % 2 == 0:
        median = (values[midpoint - 1] + values[midpoint]) / 2
    else:
        median = values[midpoint]

    return mean, median

In [10]:
import random
lista_ord = sorted([random.randint(0, 100) for num in range(6)])
lista_ord

[17, 43, 58, 74, 91, 94]

In [11]:
mean_and_median(lista_ord)

(62.833333333333336, 66.0)

In [12]:
def mean(values):
    """Get the mean of a sorted list of values

    Args:
        values (iterable of float): A list of numbers

    Returns:
        float
    """
    # Write the mean() function
    mean = sum(values) / len(values)
    return mean

In [13]:
mean(lista_ord)

62.833333333333336

In [14]:
def median(values):
    """Get the median of a sorted list of values

    Args:
        values (iterable of float): A list of numbers

    Returns:
        float
    """
    # Write the median() function
    midpoint = int(len(values) / 2)
    if len(values) % 2 == 0:
        median = (values[midpoint - 1] + values[midpoint]) / 2
    else:
        median = values[midpoint]
    return median

In [15]:
median(lista_ord)

66.0

In [16]:
#Mutable or immutable?

#The following function adds a mapping between a string and the lowercase version of that string to a dictionary. What do you
#expect the values of d and s to be after the function is called?

def store_lower(_dict, _string):
    """Add a mapping between `_string` and a lowercased version of `_string` to `_dict`

    Args:
        _dict (dict): The dictionary to update.
        _string (str): The string to add.
    """
    orig_string = _string
    _string = _string.lower()
    _dict[orig_string] = _string

d = {}
s = 'Hello'

store_lower(d, s)
d, s

({'Hello': 'hello'}, 'Hello')

In [17]:
#Possible Answers

#d = {}, s = 'Hello'

#d = {}, s = 'hello'

#d = {'Hello': 'hello'}, s = 'Hello'*

#d = {'Hello': 'hello'}, s = 'hello'

#d = {'hello': 'hello'}, s = 'hello'

In [18]:
#Best practice for default arguments

import pandas

#One of your co-workers (who obviously didn't take this course) has written this function for adding a column to a pandas
#DataFrame. Unfortunately, they used a mutable variable as a default argument value! Please show them a better way to do this so
#that they don't get unexpected behavior.

def add_column(values, df=pandas.DataFrame()):
    """Add a column of `values` to a DataFrame `df`.
    The column will be named "col_<n>" where "n" is
    the numerical index of the column.

    Args:
        values (iterable): The values of the new column
        df (DataFrame, optional): The DataFrame to update.
            If no DataFrame is passed, one is created by default.

    Returns:
        DataFrame
    """
    df['col_{}'.format(len(df.columns))] = values
    return df

In [22]:
add_column([1, 2, 3])

Unnamed: 0,col_0,col_1
0,1,1
1,2,2
2,3,3


In [20]:
# Use an immutable variable for the default argument
def better_add_column(values, df=None):
    """Add a column of `values` to a DataFrame `df`.
    The column will be named "col_<n>" where "n" is
    the numerical index of the column.

    Args:
        values (iterable): The values of the new column
        df (DataFrame, optional): The DataFrame to update.
            If no DataFrame is passed, one is created by default.

    Returns:
        DataFrame
    """
    # Update the function to create a default DataFrame
    if df is None:
        df = pandas.DataFrame()
    df['col_{}'.format(len(df.columns))] = values
    return df

In [23]:
better_add_column([1, 2, 3])

Unnamed: 0,col_0
0,1
1,2
2,3
