## Tutorial Setup

### Check your install

In [2]:
import numpy

In [3]:
import scipy

In [4]:
import matplotlib

In [5]:
import sklearn

In [6]:
import psutil

In [7]:
import pandas

In [8]:
# Install check for the parallel-computing package. IPython.parallel was
# split out into the standalone `ipyparallel` package (IPython 4.0 and later),
# so try the new name first and fall back to the legacy module; the check
# then passes on either kind of installation.
try:
    import ipyparallel
except ImportError:
    import IPython.parallel

Finding the location of an installed package and its version:

In [9]:
# `__path__` lists the directory the imported package was loaded from —
# handy for checking which environment's NumPy is actually in use.
numpy.__path__

['C:\\Users\\Ethan\\Anaconda3\\lib\\site-packages\\numpy']

In [10]:
# Version string of the NumPy package that was imported above.
numpy.__version__

'1.9.2'

### Check that you have the datasets

In [11]:
# Run the fetch script located one directory above this notebook; its log
# output reports creating the datasets folder and downloading/unpacking the
# 20 newsgroups archive.
%run ../fetch_data.py

Creating datasets folder: C:\Users\Ethan\Documents\GitHub\DataScience\Python Project\Pycon2015\parallel_ml_tutorial\datasets
Checking availability of the 20 newsgroups dataset
Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)
Decompressing C:\Users\Ethan\Documents\GitHub\DataScience\Python Project\Pycon2015\parallel_ml_tutorial\datasets\20news-bydate.tar.gz
Checking that the 20 newsgroups files exist...
=> Success!


In [12]:
import os

# os.listdir returns entries in arbitrary order, so sort them to get the
# same listing on every run and on every platform.
for fname in sorted(os.listdir('../datasets/')):
    print(fname)

20news-bydate-test
20news-bydate-train
20news-bydate.tar.gz


## A NumPy primer

### NumPy array dtypes and shapes

In [None]:
import numpy as np

In [13]:
# One-dimensional array built from a flat list of three Python ints.
a = np.array([1, 2, 3])

In [None]:
# A bare name as the last expression makes the notebook display its repr.
a

In [14]:
# Nested lists give a two-dimensional array: two rows of three values each.
b = np.array([[0, 2, 4], [1, 3, 5]])

In [None]:
# Display `b` to see its row/column layout.
b

In [None]:
# Shape tuple of `b`: one entry per axis (rows first, then columns).
b.shape

In [None]:
# Element type NumPy inferred from the integer literals used to build `b`.
b.dtype

In [None]:
# `a` has a single axis, so its shape is a one-element tuple.
a.shape

In [None]:
# Element type NumPy inferred from the integer literals used to build `a`.
a.dtype

In [None]:
# Length-5 array of zeros; no dtype is given, so NumPy's default
# (floating-point) dtype is used.
np.zeros(5)

In [None]:
# 3x4 array of ones with an explicitly requested 32-bit integer dtype.
np.ones(shape=(3, 4), dtype=np.int32)

### Common array operations

In [16]:
# Element-wise multiplication by a float scalar; the result is a new
# floating-point array and `b` is left unchanged.
c = b * 0.5

In [17]:
# Display the scaled values.
c

array([[ 0. ,  1. ,  2. ],
       [ 0.5,  1.5,  2.5]])

In [18]:
# Scaling is element-wise, so `c` keeps the 2x3 shape of `b`.
c.shape

(2, 3)

In [19]:
# Multiplying by 0.5 promoted the integer elements to a float dtype.
c.dtype

dtype('float64')

In [20]:
# Recall `a` (length 3) before combining it with the 2x3 array `c`.
a

array([1, 2, 3])

In [21]:
# Broadcasting: the length-3 `a` is added to each of the two rows of `c`.
d = a + c

In [22]:
# Display the broadcast sum.
d

array([[ 1. ,  3. ,  5. ],
       [ 1.5,  3.5,  5.5]])

In [23]:
# A single integer index selects the first row.
d[0]

array([ 1.,  3.,  5.])

In [24]:
# A row index plus a column index selects one scalar element.
d[0, 0]

1.0

In [25]:
# Full slice over rows combined with column 0 selects the first column.
d[:, 0]

array([ 1. ,  1.5])

In [None]:
# Sum over every element of `d` (no axis given).
d.sum()

In [None]:
# Mean over every element of `d` (no axis given).
d.mean()

In [None]:
# Reduce along axis 0 (down the rows): one sum per column.
d.sum(axis=0)

In [None]:
# Reduce along axis 1 (across the columns): one mean per row.
d.mean(axis=1)

### Reshaping and inplace update

In [None]:
# The integers 0 through 11 as a one-dimensional array.
e = np.arange(12)

In [None]:
# Display the freshly created range.
e

In [None]:
# Lay the same 12 values out as 3 rows of 4. NumPy returns a view when it
# can, so `f` is expected to share its data with `e` rather than copy it.
f = e.reshape(3, 4)

In [None]:
# Display the reshaped 3x4 layout.
f

In [None]:
# `reshape` returned a new array object; `e` itself is still one-dimensional.
e

In [None]:
# In-place update: overwrite every element from index 5 onward with zero.
e[5:] = 0

In [None]:
# Display `e` to confirm the tail was zeroed.
e

In [None]:
# Check whether the in-place change made through `e` is visible through `f`
# (it should be, if `f` is a view on the same data).
f

### Combining arrays

In [None]:
# Recall `a` (length 3) before combining arrays.
a

In [None]:
# Recall `b` (2 rows x 3 columns) before combining arrays.
b

In [None]:
# Recall `d` (2 rows x 3 columns, float) before combining arrays.
d

In [None]:
# Join three copies of `a` end to end along its only axis.
np.concatenate([a, a, a])

In [None]:
# Stack vertically: `a` contributes one row, `b` and `d` two rows each; this
# works because all three are 3 values wide.
np.vstack([a, b, d])

In [None]:
# Stack horizontally: `b` and `d` both have 2 rows, so their columns are
# placed side by side.
np.hstack([b, d])

## A Matplotlib primer

In [None]:
# Render figures inside the notebook, directly below the cell that draws them.
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 10 evenly spaced values starting at 0 and ending at 2.
x = np.linspace(0, 2, 10)

In [None]:
# Display the generated grid of input values.
x

In [None]:
# With a single data array, the values are plotted against their index.
# The trailing `;` keeps the returned object's repr out of the cell output.
plt.plot(x, 'o-');

In [None]:
# Draw both curves over the same inputs, each with its own marker style and
# legend entry.
for y_values, line_style, curve_label in (
    (x, 'o-', 'linear'),
    (x ** 2, 'x-', 'quadratic'),
):
    plt.plot(x, y_values, line_style, label=curve_label)

# Annotate the figure, then add the legend; the trailing `;` keeps the
# returned object's repr out of the cell output.
plt.title('Linear vs Quadratic progression')
plt.xlabel('Input')
plt.ylabel('Output')
plt.legend(loc='best');

In [None]:
# Seed NumPy's global generator first so the "random" draws, and every
# figure built from them, are identical on each Restart & Run All.
np.random.seed(0)

# 1000 draws from a normal distribution with mean 1.0 and std. dev. 0.5.
samples = np.random.normal(loc=1.0, scale=0.5, size=1000)

In [None]:
# One axis whose length matches the `size` requested when sampling.
samples.shape

In [None]:
# Element type of the generated draws.
samples.dtype

In [None]:
# Peek at the first 30 draws instead of dumping the whole array.
samples[:30]

In [None]:
# Histogram of the draws using 50 bins; the trailing `;` keeps the returned
# values out of the cell output.
plt.hist(samples, bins=50);

In [None]:
# Seed NumPy's global generator so both samples, and the comparison plots
# built from them, are reproducible on every re-run.
np.random.seed(1)

# 10000 draws each: a normal (mean 1, std. dev. 0.5) and a Student's t with
# 10 degrees of freedom.
samples_1 = np.random.normal(loc=1, scale=.5, size=10000)
samples_2 = np.random.standard_t(df=10, size=10000)

In [None]:
# Shared bin edges so the two histograms are directly comparable.
bins = np.linspace(-3, 3, 50)

# Overlay both samples semi-transparently; the returned values are bound to
# the throwaway name `_`.
for hist_label, draws in (('samples 1', samples_1), ('samples 2', samples_2)):
    _ = plt.hist(draws, bins=bins, alpha=0.5, label=hist_label)

plt.legend(loc='upper left');

In [None]:
# Each point pairs the i-th draw of one sample with the i-th draw of the
# other. Label the axes so it is clear which sample is on which axis; the
# trailing `;` keeps the returned object's repr out of the cell output.
plt.scatter(samples_1, samples_2, alpha=0.1)
plt.xlabel('samples 1')
plt.ylabel('samples 2');