## Q1.

Add a `__setitem__` method to the Python linked list implementation from the lecture (this past Wednesday).

In [20]:
#code below
# added __setitem__ methods and lines on doctest

from doctest import run_docstring_examples as dtest
import numbers
import reprlib
class LL:
    """
    Singly linked list whose nodes are two-element lists ``[value, next_node]``.

    ``_headNode`` holds the first node, or ``None`` when the list is empty.
    Items are read and written by non-negative integer position; any other
    index type (including slices) raises ``TypeError``.

    >>> A = LL()
    >>> A[0]
    Traceback (most recent call last):
        ...
    IndexError: trying to index an empty LL
    >>> A.insert_front(1)
    >>> A[0]
    1
    >>> A.insert_back(2)
    >>> A[1]
    2
    >>> A
    LL([1,...])
    >>> myll = LL.from_components([1,2])
    >>> myll[1]
    1
    >>> len(myll)
    2
    >>> myll[2]
    Traceback (most recent call last):
        ...
    IndexError: LL index out of range
    >>> myll[0:1]
    Traceback (most recent call last):
        ...
    TypeError: LL indices must be integers
    >>> myll[1] = -100
    >>> myll[1]
    -100
    >>> myll['a']
    Traceback (most recent call last):
        ...
    TypeError: LL indices must be integers
    >>> B = LL()
    >>> B.insert_back(7)
    >>> B[0]
    7
    >>> len(LL.from_components([]))
    0

    """

    @classmethod
    def from_components(cls, components):
        """Build a list by pushing every component onto the front, in order.

        Pushing at the front is O(1) per element, whereas appending at the
        back has to walk the whole chain each time.  The price is that the
        components end up in reverse order: ``from_components([1, 2])[0]``
        is ``2``.

        ``components`` may be any iterable; an empty one gives an empty list.
        """
        # Start empty instead of seeding with components[0]: that indexing
        # failed on empty input and lost a leading None component.
        inst = cls()
        for c in components:
            inst.insert_front(c)
        return inst

    def __init__(self, head=None):
        """Create an empty list, or a one-element list holding ``head``.

        ``None`` is the "no head" sentinel, so ``LL(None)`` is an empty list.
        """
        self._headNode = None if head is None else [head, None]

    def insert_front(self, element):
        """Prepend ``element``; the previous head becomes its successor."""
        self._headNode = [element, self._headNode]

    def insert_back(self, element):
        """Append ``element`` after the last node (walks the whole chain)."""
        new_node = [element, None]
        if self._headNode is None:
            # Empty list: there is no tail to link from, so the new node
            # simply becomes the head.
            self._headNode = new_node
            return
        tail = self._headNode
        while tail[1] is not None:
            tail = tail[1]
        tail[1] = new_node

    def __repr__(self):
        """Return ``ClassName([first,...])``, showing only the first element."""
        class_name = type(self).__name__
        if self._headNode is None:
            components = ""
        else:
            components = reprlib.repr(self._headNode[0])
        return '{}([{},...])'.format(class_name, components)

    def __len__(self):
        """Count the nodes by walking the chain from the head."""
        return sum(1 for _ in self._iter_nodes())

    def __getitem__(self, index):
        """Return the value stored at integer position ``index``.

        Raises ``TypeError`` for a non-integer index and ``IndexError`` when
        the list is empty or ``index`` is outside ``0 .. len(self) - 1``.
        """
        return self._node_at(index)[0]

    def __setitem__(self, index, value):
        """Overwrite the value stored at integer position ``index``.

        Raises ``TypeError`` for a non-integer index and ``IndexError`` when
        the list is empty or ``index`` is outside ``0 .. len(self) - 1``.
        """
        self._node_at(index)[0] = value

    def _iter_nodes(self):
        """Yield each ``[value, next_node]`` node from head to tail."""
        node = self._headNode
        while node is not None:
            yield node
            node = node[1]

    def _node_at(self, index):
        """Return the node at ``index``; shared by item get and set."""
        class_name = type(self).__name__
        if not isinstance(index, numbers.Integral):
            msg = '{class_name} indices must be integers'
            raise TypeError(msg.format(class_name=class_name))
        if self._headNode is None:
            msg = 'trying to index an empty {class_name}'
            raise IndexError(msg.format(class_name=class_name))
        # Negative positions are not supported and can never match, so skip
        # the walk and fall through to the out-of-range error.
        if index >= 0:
            for position, node in enumerate(self._iter_nodes()):
                if position == index:
                    return node
        msg = '{class_name} index out of range'
        raise IndexError(msg.format(class_name=class_name))

In [21]:
from doctest import run_docstring_examples as dtest
# Run the doctests embedded in LL's docstring, printing every example as it is tried.
dtest(LL, globals(), verbose = True)

Finding tests in NoName
Trying:
    A = LL()  
Expecting nothing
ok
Trying:
    A[0]
Expecting:
    Traceback (most recent call last):
        ...
    IndexError: trying to index an empty LL
ok
Trying:
    A.insert_front(1)
Expecting nothing
ok
Trying:
    A[0]
Expecting:
    1
ok
Trying:
    A.insert_back(2)
Expecting nothing
ok
Trying:
    A[1]
Expecting:
    2
ok
Trying:
    A
Expecting:
    LL([1,...])
ok
Trying:
    myll = LL.from_components([1,2])
Expecting nothing
ok
Trying:
    myll[1]
Expecting:
    1
ok
Trying:
    len(myll)
Expecting:
    2
ok
Trying:
    myll[2]
Expecting:
    Traceback (most recent call last):
        ...
    IndexError: LL index out of range
ok
Trying:
    myll[0:1]
Expecting:
    Traceback (most recent call last):
        ...
    TypeError: LL indices must be integers
ok
Trying:
    myll[1] = -100
Expecting nothing
ok
Trying:
    myll[1]
Expecting:
    -100
ok
Trying:
    myll['a']
Expecting:
    Traceback (most recent call last):
        ...
    TypeErr

## Q2.

An online mean and standard deviation algorithm.

Below is a function to generate a potentially infinite stream of 1-D data.

In [22]:
from random import normalvariate, random
from itertools import count

def make_data(m, stop=None):
    """Generate a potentially infinite stream of noisy values centred on 1.0e09.

    Each value is ``1.0e09`` plus zero-mean Gaussian noise whose standard
    deviation, ``m * random()``, is drawn afresh for every sample.

    Parameters:
        m: scale factor for the per-sample noise standard deviation.
        stop: last sample index to produce, inclusive, so ``stop + 1``
            values are yielded.  ``None`` (the default) never stops.

    Yields:
        float: the next noisy sample.
    """
    for index in count():
        # Test against None explicitly: a plain truthiness check treats
        # stop=0 as "no limit" and streams forever.
        if stop is not None and index > stop:
            break
        yield 1.0e09 + normalvariate(0, m * random())
        

Here is an implementation of an online mean algorithm. See <http://www.johndcook.com/blog/standard_deviation/> and the comparison it links to, <http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/>. (Convince yourselves of the formulas.)

In [30]:
def online_mean(iterator):
    """Yield the running mean after each value drawn from ``iterator``.

    The mean is updated incrementally, so no earlier values are kept.
    """
    running_mean = 0
    for seen, sample in enumerate(iterator, start=1):
        # Move the mean towards the new sample by 1/seen of the gap.
        gap = sample - running_mean
        running_mean = running_mean + gap / seen
        yield running_mean

We use our generator functions to implement iterators:

In [46]:
# Draw a short, finite stream so the raw values can be inspected.
g = make_data(5, 10)

# Materialize the generator into a list (this exhausts g).
list(g)

[1000000001.0943303,
 999999998.1787736,
 999999996.7504416,
 999999994.1324487,
 999999999.8396335,
 999999997.3379866,
 1000000001.8774632,
 1000000000.6626213,
 999999998.2246407,
 999999995.0323929,
 999999999.3981588]

We realize that the numbers materialized are very large with very small deltas between them.

In [76]:
# Wrap the data stream in the running-mean generator; nothing is computed until it is iterated.
g = online_mean(make_data(5, 100))
print(type(g))
#list(g)

<class 'generator'>


### 2.1

Implement the standard deviation algorithm as a generator function as

```python
def online_mean_dev(iterator):
    BLA BLA
    if n > 1:
        stddev = math.sqrt(dev_accum/(n-1))
        yield (n, value, mu, stddev)
```


<font color='blue'>
*Formula from the link provided.*
<br>

Initialize $M_1 = x_1$ and $S_1 = 0$.
<br>
For subsequent $x$'s, use the recurrence formulas

$$M_k = M_{k-1} + \frac{(x_k - M_{k-1})}{k}$$
$$S_k = S_{k-1} + (x_k - M_{k-1})(x_k - M_k)$$

<br>
For $2 \le k \le n$, the $k^{\text{th}}$ estimate of the variance is $s^2 = S_k/(k - 1)$.
</font>


In [74]:
# your code here
import math

def online_mean_dev(iterator):
    """Yield running statistics for a stream, one tuple per value after the first.

    Each yielded tuple is ``(n, value, mu, stddev)``: the number of values
    seen so far, the latest value, the running mean, and the running
    standard deviation computed with an ``n - 1`` denominator.  The first
    value updates the state but yields nothing, since ``n - 1`` would be 0.
    """
    mean = 0
    sq_dev_sum = 0
    for seen, sample in enumerate(iterator, start=1):
        # Deviation from the mean *before* this sample is folded in ...
        gap_before = sample - mean
        mean = mean + gap_before / seen
        # ... multiplied by the deviation from the mean *after* the update.
        sq_dev_sum = sq_dev_sum + gap_before * (sample - mean)

        if seen < 2:
            continue
        spread = math.sqrt(sq_dev_sum / (seen - 1))
        yield (seen, sample, mean, spread)

Here we generate a stream of roughly 100000 data points and run this iterator over it (imagine running this on a time series being slowly read from disk).

In [75]:
# Chain the two generators: raw samples flow into the running mean/stddev tracker.
# No data is produced until data_with_stats is iterated.
data_with_stats = online_mean_dev(make_data(5, 100000))

print(type(data_with_stats))

<class 'generator'>


In [57]:
# Pull the next 11 tuples from data_with_stats: 10 in the loop plus one more afterwards.
# NOTE(review): the recorded output below starts at n=13, so the generator had presumably
# already been advanced by earlier runs of this cell; on a fresh kernel it starts at n=2.

for i in range(10):
    print(next(data_with_stats))
    
print(next(data_with_stats))

(13, 999999996.6601509, 999999999.2192694, 3.298521443456659)
(14, 1000000000.6501689, 999999999.3214765, 3.192107453115774)
(15, 999999999.8697342, 999999999.358027, 3.079247325805359)
(16, 999999998.2976254, 999999999.2917519, 2.986624298819076)
(17, 1000000001.4424378, 999999999.4182628, 2.93845447286534)
(18, 999999996.4346521, 999999999.2525067, 2.9361801558358027)
(19, 999999999.589582, 999999999.2702476, 2.8545018649029505)
(20, 999999999.4223354, 999999999.2778519, 2.778576219197691)
(21, 1000000005.5097207, 999999999.5746076, 3.0304791366983506)
(22, 1000000000.1711993, 999999999.6017255, 2.9601787216427113)
(23, 999999999.4398057, 999999999.5946854, 2.8923166039150763)


In [58]:
    
# One more draw: the generator resumes exactly where the previous cell left it.
print(next(data_with_stats))

(24, 999999996.0381976, 999999999.4464984, 2.920411506726057)


Note the "stateful" property of generators: the iterator position keeps moving forward and is remembered from one `next()` call (and one cell) to the next.

## Q3.

Let's do Anomaly detection. Write a routine `is_ok`:

```python
def is_ok(level, t)
```

which takes a tuple like the one yielded by your code above and returns `True` if the value lies within `level` standard deviations of the mean, i.e.

$$\mu - \text{level}\cdot\sigma \;\le\; \text{value} \;\le\; \mu + \text{level}\cdot\sigma$$

In [61]:
#your code here
# (n, value, mu, stddev)
def is_ok(level, t):
    """Return True when a sample lies within ``level`` sigmas of the mean.

    ``t`` is a tuple laid out as ``(n, value, mu, stddev)``; only positions
    1 to 3 are read.  Both ends of the band are inclusive.
    """
    sample, centre, spread = t[1], t[2], t[3]
    half_width = level * spread
    lower = centre - half_width
    upper = centre + half_width
    return bool(lower <= sample <= upper)

We use this function to create a predicate passed through to `itertools.filterfalse` which is then used to obtain an iterator on the anomalies.

In [65]:
from itertools import filterfalse
# Predicate: True when a tuple's value lies within 5 standard deviations of the running mean.
pred = lambda t: is_ok(5, t)

# filterfalse keeps only the tuples for which pred is False, i.e. the anomalies; it is lazy.
anomalies = filterfalse(pred, data_with_stats)


We materialize the anomalies...

In [66]:
# Consume the rest of data_with_stats and collect every tuple the predicate rejected.
list(anomalies)#materialize

[(9970, 999999985.0494641, 1000000000.000902, 2.8782140518866837),
 (16124, 999999984.8157325, 999999999.9917138, 2.871940944438512),
 (16603, 1000000014.389227, 999999999.9857657, 2.878357886958591),
 (22593, 999999984.7691534, 999999999.9772627, 2.861437009038017),
 (25302, 1000000018.7748191, 999999999.9713694, 2.8713041910702546),
 (26786, 1000000018.2050887, 999999999.978237, 2.875478068302486),
 (27400, 1000000014.6187431, 999999999.9843495, 2.877189950308567),
 (27413, 1000000015.1162496, 999999999.9848051, 2.878943602430227),
 (31139, 999999984.9076481, 999999999.9758993, 2.8781281823186275),
 (31694, 999999985.4168285, 999999999.9735674, 2.881668330198163),
 (39794, 999999984.3441659, 999999999.9658993, 2.8854951244970724),
 (39856, 999999985.1713961, 999999999.9655629, 2.8861109229142943),
 (41763, 1000000015.0537992, 999999999.9704499, 2.8869445152673783),
 (45364, 999999984.7528864, 999999999.9707311, 2.8894704295977798),
 (46916, 999999985.3670477, 999999999.9752353, 2.893

## To think of, but not hand in

What kinds of anomalies will this algorithm pick up? What kinds would a shorter "window" of anomaly detection, like 100 points around the time in question, pick up? How might you create an algorithm which does window-based averaging? (Hint: the window size is small compared to the time series size.)

Finally, think a bit about how you might implement all of this in a production environment. Remember that data streaming in might get backed up while you handle an anomaly.

(Some inspiration might accrue if you look at the docs for `collections.deque`).