In [1]:
import math

def bucketize(point, bucket_size):
    return bucket_size * math.floor(point / bucket_size)

In [3]:
from collections import Counter
def make_histogram(points, bucket_size):
    return Counter(bucketize(point, bucket_size) for point in points)

In [5]:
import matplotlib.pyplot as plt
def plot_histogram(points, bucket_size, title=""):
    histogram = make_histogram(points, bucket_size)
    plt.bar(histogram.keys(), histogram.values(), width=bucket_size)
    plt.title(title)
    plt.show()

In [6]:
import random
random.seed(0)
uniform = [200 * random.random() - 100 for _ in range(10000)]

In [7]:
from probability import inverse_normal_cdf
normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

In [8]:
plot_histogram(uniform, 10, "Uniform Histogram")

In [9]:
plot_histogram(normal, 10, "Normal Histogram")

In [10]:
def random_normal():
    return inverse_normal_cdf(random.random())

In [11]:
xs = [random_normal() for _ in range(1000)]
ys1 = [ x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]

In [15]:
plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title("Very Difference Joint Distributions")

<matplotlib.text.Text at 0x7949170>

In [16]:
plt.show()

In [18]:
from stats import correlation

In [21]:
print (correlation(xs, ys1))

0.9010493686379609


In [22]:
print (correlation(xs, ys2))

-0.8920981526880033


In [24]:
from linear_algebra import get_column, shape, make_matrix

In [25]:
def correlation_matrix(data):
    _, num_columns = shape(data)
    
    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))
    
    return make_matrix(num_columns, num_columns, matrix_entry)

In [26]:
    num_points = 100

    def random_row():
        row = [None, None, None, None]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row
    random.seed(0)
    data = [random_row()
            for _ in range(num_points)]



In [27]:
_, num_columns = shape(data)

In [28]:
fig, ax = plt.subplots(num_columns, num_columns)

In [29]:
for i in range(num_columns):
    for j in range(num_columns):
        if i != j: ax[i][j].scatter(get_column(data, j), get_column(data, i))
        else: ax[i][j].annotate("series " + str(i), (0.5, 0.5), xycoords='axes fraction', ha="center", va="center")
        if i < num_columns - 1: ax[i][j].xaxis.set_visible(False)
        if j > 0: ax[i][j].yaxis.set_visible(False)

In [30]:
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())

(-4.0, 3.0)

In [31]:
plt.show()

In [2]:
def try_or_none(f):
    def f_or_none(x):
        try: return f(x)
        except: return None
    return f_or_none

In [3]:
def parse_row(input_row, parsers):
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]

def parse_rows_with(reader, parsers):
    for row in reader:
        yield parse_row(row, parsers)

In [4]:
import dateutil.parser
data = []

In [6]:
import csv
with open("comma_delimited_stock_prices.csv", "r") as f:
    reader = csv.reader(f)
    for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
        data.append(line)

In [7]:
for row in data:
    if any(x is None for x in row):
        print(row)

[datetime.datetime(2014, 6, 19, 0, 0), 'MSFT', None]


In [8]:
def try_parse_field(field_name, value, parser_dict):
    parser = parser_dict.get(field_name)
    if parser is not None:
        return try_or_none(parser)(value)
    else:
        return value
    
def parser_dict(input_dict, parser_dict):
    return { field_name: try_parse_field(field_name, value, parser_dict)
            for field_name, value in input_dict.iteritems()}

In [3]:
import datetime
data = [{'closing_price': 102.06,
         'date': datetime.datetime(2014, 8, 29, 0, 0),
         'symbol': 'AAPL'}]
max_aapl_price = max(row["closing_price"] for row in data if row["symbol"] == "AAPL")

In [5]:
from collections import defaultdict
by_symbol = defaultdict(list)
for row in data:
    by_symbol[row["symbol"]].append(row)
    
max_price_by_symbol = { symbol: max(row["closing_price"] for row in grouped_rows)
                       for symbol, grouped_rows in by_symbol.items()}

In [6]:
def picker(field_name):
    return lambda row: row[field_name]

In [7]:
def pluck(field_name, rows):
    return map(picker(field_name), rows)

In [8]:
def group_by(grouper, rows, value_transform=None):
    grouped = defaultdict(list)
    for row in rows:
        grouped[grouper(row)].append(row)
    if value_transform is None:
        return grouped
    else:
        return { key: value_transform(rows)
                for key, rows in grouped.items()}

In [9]:
max_price_by_symbol = group_by(picker("symbol"),
                               data,
                               lambda rows: max(pluck("closing_price", rows)))

In [10]:
def percent_price_change(yesterday, today):
    return today["closing_price"] / yesterday["closing_price"] - 1

def day_over_day_changes(grouped_rows):
    ordered = sorted(grouped_rows, key=picker("date"))
    
    return [{ "symbol" : today["symbol"],
              "date"   : today["date"],
             "change"  : percent_price_change(yesterday, today)}
            for yesterday, today in zip(ordered, ordered[1:])]

In [11]:
changes_by_symbol = group_by(picker("symbol"), data, day_over_day_changes)

all_changes = [change
               for changes in changes_by_symbol.values()
               for change in changes]

In [12]:
max(all_changes, key=picker("change"))

ValueError: max() arg is an empty sequence

In [13]:
min(all_changes, key=picker("change"))

ValueError: min() arg is an empty sequence

In [14]:
def combine_pct_changes(pct_change1, pct_change2):
    return (1 + pct_change1) * (1 + pct_change2) - 1

def overall_change(changes):
    return reduce(combine_pct_changes, pluck("change", changes))


In [15]:
overall_change_by_month = group_by(lambda row: row['date'].month, all_changes, overall_change)