# Workshop 7: Statistics (Optional)

In [None]:
# standard preamble
import numpy as np
import scipy as sp      
import matplotlib.pyplot as plt
%matplotlib inline

## 2D distributions

You can create two independent samples of events and plot their distribution as a *scatter* plot:

In [None]:
# Draw two independent standard-normal samples of equal length.
n_events = 1000
x = np.random.standard_normal(n_events)
y = np.random.standard_normal(n_events)

# Show the sample as a scatter plot in the (x, y) plane.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('x')
ax.set_ylabel('y')

You can compute the correlation matrix for two variables:

In [None]:
# Pearson correlation matrix of the two samples: a 2x2 array with ones on the
# diagonal and the x-y correlation coefficient off the diagonal.
# Use NumPy directly: the NumPy aliases that used to live in the top-level
# scipy namespace (sp.corrcoef, sp.cov, ...) were deprecated in SciPy 1.4 and
# are not available in current SciPy releases.
print(np.corrcoef(x, y))

It is perhaps more instructive to print the full covariance matrix:

In [None]:
# Covariance matrix of the two samples: a 2x2 array with the sample variances
# of x and y on the diagonal and their covariance off the diagonal.
# Use NumPy directly: the NumPy aliases that used to live in the top-level
# scipy namespace (sp.cov, sp.corrcoef, ...) were deprecated in SciPy 1.4 and
# are not available in current SciPy releases.
print(np.cov(x, y))

Here is a cute example of plotting the projection histograms together with the scatter plot
(taken from the [matplotlib `scatter_hist` example](http://matplotlib.org/examples/pylab_examples/scatter_hist.html)):

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter

# Two independent standard-normal samples, 1000 events each.
x = np.random.randn(1000)
y = np.random.randn(1000)

# Formatter used to blank out tick labels on the histogram panels.
nullfmt = NullFormatter()

# Panel geometry in figure fractions: the scatter plot sits in the lower-left
# corner, with one histogram strip above it and one to its right, each
# separated from the scatter panel by a 0.02 gap.
left = bottom = 0.1
width = height = 0.65
left_h = left + width + 0.02
bottom_h = left_h

rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.2, height]

# Square figure holding the three hand-placed axes.
fig = plt.figure(1, figsize=(8, 8))
axScatter = fig.add_axes(rect_scatter)
axHistx = fig.add_axes(rect_histx)
axHisty = fig.add_axes(rect_histy)

# The histograms share an axis with the scatter plot, so drop the tick labels
# along that shared direction.
axHistx.xaxis.set_major_formatter(nullfmt)
axHisty.yaxis.set_major_formatter(nullfmt)

# Central panel: the scatter plot itself.
axScatter.scatter(x, y)
axScatter.set_xlabel('x')
axScatter.set_ylabel('y')

# Symmetric plot range chosen by hand: take the largest |x| or |y| and move up
# to the next multiple of the bin width above it.
binwidth = 0.25
xymax = max(np.abs(x).max(), np.abs(y).max())
lim = (int(xymax / binwidth) + 1) * binwidth
axScatter.set_xlim(-lim, lim)
axScatter.set_ylim(-lim, lim)

# Both projections use the same bin edges, spanning [-lim, lim].
bins = np.arange(-lim, lim + binwidth, binwidth)
axHistx.hist(x, bins=bins)
axHisty.hist(y, bins=bins, orientation='horizontal')

# Keep each histogram aligned with the corresponding scatter-plot axis.
axHistx.set_xlim(axScatter.get_xlim())
axHisty.set_ylim(axScatter.get_ylim())

plt.show()

You can also create a correlated sample:

In [None]:
 # mean values of two variables
mean = [0, 0]

# Covariance matrix: unit variance for each variable and a covariance of 0.8
# between them. It must be symmetric and positive semidefinite (a.k.a.
# nonnegative-definite); otherwise the behavior of multivariate_normal is
# undefined and backwards compatibility is not guaranteed.
cov = [[1, 0.8],
       [0.8, 1]]

# Draw 1000 events; each row of the returned array is one (x, y) pair, so the
# two columns are the per-variable samples.
sample = np.random.multivariate_normal(mean, cov, size=1000)
x, y = sample[:, 0], sample[:, 1]

# The correlation shows up as a diagonal streak in the scatter plot.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('x')
ax.set_ylabel('y')