<h1>Python libraries for data analysis</h1>


<li><b><span style="color:blue">Numpy</span></b>: supports numerical and array operations
<li><b><span style="color:blue">Scipy</span></b>: open source library for mathematics and scientific computing
<li><b><span style="color:blue">Pandas</span></b>: supports data manipulation and analysis
<li><b><span style="color:blue">Visualization libraries</span></b>: matplotlib, seaborn, bokeh, plotly, gmplot, and many others provide support for charts and graphs

<h1>numpy</h1>


<h2>Why numpy?</h2>
<li>Multi-dimensional arrays:
<li>Faster and more space efficient than lists 
<li>Can incorporate C/C++/Fortran code
<li>Linear algebra, Fourier transforms, Random number support



In [26]:
# Precedence check: `not` binds tighter than `or`, and `-` tighter than both,
# so this is (not 2.5) or (1-1) -> False or 0 -> 0
not 2.5 or 1-1

0

<h2>numpy array</h2>

In [27]:
import numpy as np
# Build a 1-D ndarray from a Python list and confirm its type
ax = np.array([1,2,3,4,5])
print(type(ax))


<class 'numpy.ndarray'>


In [28]:
# A bare expression at the end of a cell is shown via its repr: array([...])
ax

array([1, 2, 3, 4, 5])

In [29]:
# print() uses str(): no array(...) wrapper and no commas between elements
print (ax)

[1 2 3 4 5]


<h2>Specifying the type</h2>
<h3>Useful when reading a text stream directly into a numerical array</h3>

In [30]:
# Raw text tokens, e.g. as split from a line of a text file
x = ['1', '2', '3']
# The same tokens converted to three different element types
xi = np.array(x, dtype='int')
xf = np.array(x, dtype='float')
xs = np.array(x, dtype='str')
print(*(xi, xf, xs), sep='\n')

[1 2 3]
[ 1.  2.  3.]
['1' '2' '3']


<h2>Basic operations</h2>

In [31]:
# Five float observations
x = np.array([13, 24, 21.2, 17.6, 21.7], dtype='float')
# Total, mean and (population) standard deviation, one per line
print(np.sum(x), np.mean(x), np.std(x), sep='\n')

97.5
19.5
3.84291555983


In [32]:
# Two equal-length float vectors
x = np.array([13, 24, 21.2, 17.6, 21.7], dtype='float')
y = np.array([1, 3, 4, 7, 2], dtype='float')
# Subtraction pairs the elements up by position
print(np.subtract(x, y))

[ 12.   21.   17.2  10.6  19.7]


In [33]:
# Element-wise sum of the two arrays defined in the previous cell
x+y

array([ 14. ,  27. ,  25.2,  24.6,  23.7])

In [34]:
# Element-wise product (not a dot product)
x*y

array([  13. ,   72. ,   84.8,  123.2,   43.4])

In [35]:
# Element-wise division
x/y

array([ 13.        ,   8.        ,   5.3       ,   2.51428571,  10.85      ])

<h2>Multi-dimensional arrays</h2>

In [36]:
import numpy as np
# Three rows of six values: row r holds 10*r + 0 .. 10*r + 5
x = [[10 * row + col for col in range(6)] for row in range(3)]
# A nested list becomes a 2-D float array
ax = np.array(x, dtype=float)
print(ax)

[[  0.   1.   2.   3.   4.   5.]
 [ 10.  11.  12.  13.  14.  15.]
 [ 20.  21.  22.  23.  24.  25.]]


<h3>Indexing</h3>

In [37]:
ax[1,3] #indexing: row 1, column 3 (both zero-based)

13.0

<h3>Slicing</h3>

In [38]:
# Alternative to try: rows 1-2, columns 2-3
#ax[1:3,2:4]
# All rows, columns from index 2 onward
ax[:,2:]

array([[  2.,   3.,   4.,   5.],
       [ 12.,  13.,  14.,  15.],
       [ 22.,  23.,  24.,  25.]])

In [39]:
# (number of rows, number of columns)
ax.shape

(3, 6)

<h3>Reshaping</h3>


In [40]:
print(ax.shape)
# Same 18 values laid out as 9 rows of 2; the result is not assigned back to ax
ax.reshape(9,2)
# 10x3 would need 30 values but ax has 18, so this call raises ValueError
#ax.reshape(10,3)

(3, 6)


array([[  0.,   1.],
       [  2.,   3.],
       [  4.,   5.],
       [ 10.,  11.],
       [ 12.,  13.],
       [ 14.,  15.],
       [ 20.,  21.],
       [ 22.,  23.],
       [ 24.,  25.]])

<h3>Creating initialized arrays</h3>

In [41]:
# 0..9 as a 1-D array
ax = np.arange(0, 10)
print(ax)
# Two freshly generated copies of the same range stacked into a 2 x 10 array
ay = np.array([np.arange(0, 10) for _ in range(2)])
print(ay)

[0 1 2 3 4 5 6 7 8 9]
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [42]:
# Ten ones; the default dtype is float, hence the trailing dots in the output
ax = np.ones(10)
print(ax)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [43]:
# Ten zeros, again float by default
ax = np.zeros(10)
print(ax)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [44]:
# ** is applied to every element: squares of 0..9
ax = np.arange(10)**2
print(ax)

[ 0  1  4  9 16 25 36 49 64 81]


In [45]:
# On an ndarray, ** 2 squares each element
l=np.array([1,2,3])
print (l**2)

[1 4 9]


In [46]:
# On a plain list, * 2 repeats the list instead of doing arithmetic on its elements
l=[1,2,3]
l*2

[1, 2, 3, 1, 2, 3]

In [47]:
# 10 x 10 identity matrix (ones on the diagonal, zeros elsewhere)
np.identity(10)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

<h3>Matrix multiplication</h3>


In [48]:
# ay is a 2 x 10 array whose two rows are both 0..9
ax = np.arange(10)
ay = np.array([ax,ax])
#Scalar multiplication: every element is doubled
ay*2

array([[ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18],
       [ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18]])

In [49]:
np.dot(ay,ay.reshape(10,2)) #Dot product: (2x10).(10x2) -> 2x2; note reshape(10,2) re-flows the values row by row, it is not the transpose of ay

array([[220, 265],
       [220, 265]])

<h2>Lists vs numpy arrays</h2>
<li>Lists are heterogeneous. Elements of a list can be of multiple types
<li>Numpy arrays are homogeneous. Elements can be of only one type
<li>Both are mutable
<li>Homogeneity makes indexed access faster and more memory efficient
<li>numpy arrays are optimized for matrix operations
<li>numpy provides random number support

<h3>numpy arrays are homogeneous</h3>

In [50]:
import numpy as np
# A plain list happily mixes an int, a str and a nested list
x = [11,'s',[[2,4,5]]]
x
# Uncomment to see how np.array reacts to the same mixed-type input
#ax = np.array([11,'s',[[2,4,5]]])
#ax

[11, 's', [[2, 4, 5]]]

<h3>numpy arrays are faster</h3>

In [51]:
n = 10
# Row 0 holds the squares and row 1 the cubes of 0..n-1
ax = np.array([np.arange(n)**power for power in (2, 3)])
# Transposed view: n rows of [square, cube]
ay = ax.T
print(ax)
print(ay)
# (2 x n) . (n x 2) -> 2 x 2 matrix product
ax.dot(ay)

[[  0   1   4   9  16  25  36  49  64  81]
 [  0   1   8  27  64 125 216 343 512 729]]
[[  0   0]
 [  1   1]
 [  4   8]
 [  9  27]
 [ 16  64]
 [ 25 125]
 [ 36 216]
 [ 49 343]
 [ 64 512]
 [ 81 729]]


array([[ 15333, 120825],
       [120825, 978405]])

<h4>Functionalize this</h4>


In [52]:
def dotproduct(n):
    """Time a single numpy matrix product of a 2 x n array with its transpose.

    Row 0 of the operand holds the squares and row 1 the cubes of 0..n-1.

    Args:
        n: number of columns of the 2 x n operand.

    Returns:
        datetime.timedelta: elapsed time of the one np.dot call
        (rounded to microseconds by timedelta).
    """
    # Local imports, kept outside the timed region so they never count as work.
    import datetime
    import time

    ax = np.array([np.arange(n)**2,np.arange(n)**3])
    ay = ax.transpose()
    # perf_counter is monotonic and high-resolution; datetime.now() is a wall
    # clock that can jump and is too coarse for microsecond-scale measurements.
    start = time.perf_counter()
    # NOTE: the operands are integer arrays, so for large n the sums can wrap
    # around silently. Only the elapsed time is used; the product is discarded.
    np.dot(ax,ay)
    elapsed = time.perf_counter() - start
    return datetime.timedelta(seconds=elapsed)

dotproduct(10)

datetime.timedelta(0, 0, 12)

<h4>Do the same with python lists</h4>


In [53]:
def dot_product_lists(n):
    """Time the pure-Python equivalent of a (2 x n) by (n x 2) matrix product.

    The operand rows are the squares and the cubes of 0..n-1, held in plain
    lists; the product is computed with nested comprehensions.

    Args:
        n: number of columns of the 2 x n operand.

    Returns:
        datetime.timedelta: elapsed time of the one product computation
        (rounded to microseconds by timedelta).
    """
    # Local imports, kept outside the timed region so they never count as work.
    import datetime
    import time

    # Distinct loop names so the comprehension variable no longer shadows x.
    x = [i**2 for i in range(n)]
    y = [i**3 for i in range(n)]
    ax = [x,y]
    # Transpose: one [square, cube] pair per column of ax.
    ay = [list(pair) for pair in zip(*ax)]
    # perf_counter is monotonic and high-resolution; datetime.now() is a wall
    # clock that can jump and is too coarse for microsecond-scale measurements.
    start = time.perf_counter()
    # The result is discarded on purpose: only the elapsed time matters here.
    [[sum(a*b for a,b in zip(X_row,Y_col)) for Y_col in zip(*ay)] for X_row in ax]
    elapsed = time.perf_counter() - start
    return datetime.timedelta(seconds=elapsed)

dot_product_lists(10)

datetime.timedelta(0, 0, 25)

In [54]:
# Squares and cubes of 0..n-1 as plain lists (n is set in an earlier cell)
x = [value**2 for value in range(n)]
y = [value**3 for value in range(n)]
ax = [x, y]
print(ax)
# zip(ax) wraps each row in a 1-tuple; zip(*ax) pairs the rows column by column
print([*zip(ax)])
print([*zip(*ax)])
# Transpose as a list of lists: one [square, cube] entry per column
ay = [list(column) for column in zip(*ax)]
print(ay)
# Zipping the transpose gives the original rows back, as tuples
print([*zip(*ay)])

[[0, 1, 4, 9, 16, 25, 36, 49, 64, 81], [0, 1, 8, 27, 64, 125, 216, 343, 512, 729]]
[([0, 1, 4, 9, 16, 25, 36, 49, 64, 81],), ([0, 1, 8, 27, 64, 125, 216, 343, 512, 729],)]
[(0, 0), (1, 1), (4, 8), (9, 27), (16, 64), (25, 125), (36, 216), (49, 343), (64, 512), (81, 729)]
[[0, 0], [1, 1], [4, 8], [9, 27], [16, 64], [25, 125], [36, 216], [49, 343], [64, 512], [81, 729]]
[(0, 1, 4, 9, 16, 25, 36, 49, 64, 81), (0, 1, 8, 27, 64, 125, 216, 343, 512, 729)]


In [55]:
# ax was rebound to a plain nested list in the previous cell, so no array(...) wrapper here
ax

[[0, 1, 4, 9, 16, 25, 36, 49, 64, 81],
 [0, 1, 8, 27, 64, 125, 216, 343, 512, 729]]

<h4>Compare the two</h4>

In [56]:
# One row per problem size, tab-separated: n, numpy timing, pure-list timing
for n in (10, 100, 1000, 10000):
    numpy_result, list_result = dotproduct(n), dot_product_lists(n)
    print(*(n, numpy_result, list_result), sep='\t')

10	0:00:00.000301	0:00:00.000023
100	0:00:00.000007	0:00:00.000084
1000	0:00:00.000008	0:00:00.109554
10000	0:00:00.000039	0:00:00.023940


<h3>numpy indexing vs list indexing</h3>

In [57]:
# The same six values as an ndarray and as a plain list
ax = np.array([1,2,3,4,8,9])
x = [1,2,3,4,8,9]

#Extract the first and last elements from the numpy array into a single array
#(indexing with a list of positions selects several elements at once)
ax[[0,-1]]

#Extract the first and last elements from the list into a new list
#(a list has to be indexed one position at a time)
#[x[0],x[-1]]

array([1, 9])

<h3>numpy slicing vs list slicing</h3>

In [58]:
ax = np.array([[11,12,13,14],[21,22,23,24],[31,32,33,34]])
# Rows 1-2 and columns 0-1 selected in a single subscript
ax[1:3,0:2]

array([[21, 22],
       [31, 32]])

In [59]:
# The full array, for comparison with the slice above
ax

array([[11, 12, 13, 14],
       [21, 22, 23, 24],
       [31, 32, 33, 34]])

In [60]:
lx = [[11,12,13,14],[21,22,23,24],[31,32,33,34]]
lx
#HELP! Nested lists have no two-axis subscript; the equivalent of ax[1:3,0:2]
#needs a comprehension: [row[0:2] for row in lx[1:3]]

[[11, 12, 13, 14], [21, 22, 23, 24], [31, 32, 33, 34]]

<h3>numpy: selecting elements using a boolean mask</h3>

In [61]:
ax = np.array([1,4,7,9,2,3,10,11,34,2])
# The comparison is applied to every element and yields a boolean array
ax < 7

array([ True,  True, False, False,  True,  True, False, False, False,  True], dtype=bool)

In [62]:
# Indexing with the boolean mask keeps only the elements where it is True
ax[ax<7]

array([1, 4, 2, 3, 2])

<h3>Selecting elements from an np array</h3>

In [63]:
x=[[0,1,2,3,4,5],[10,11,12,13,14,15],[20,21,22,23,24,25]]
ax=np.array(x,float)
# np.where(condition, a, b): 1 where the value is even, 0 elsewhere
np.where(ax%2==0,1,0)

array([[1, 0, 1, 0, 1, 0],
       [1, 0, 1, 0, 1, 0],
       [1, 0, 1, 0, 1, 0]])

In [64]:

#linalg, a linear algebra module
#functions dealing with polynomials, differentials, etc


In [65]:
import scipy
# The top-level scipy aliases of NumPy functions (scipy.nanmean among them) have
# been deprecated since SciPy 1.4 and are scheduled for removal, so call NumPy directly.
# x is the nested list from the previous cell; nanmean averages all of its
# values while ignoring any NaNs.
np.nanmean(x)

12.5

<h3>Random number support in numpy</h3>

In [66]:
#np.random.normal(size=10)
#np.random.normal(size=(100,100))
#np.random.exponential()
#np.random.exponential(1.0,size=(6,3))
#np.random.randint(-10,10,size=(9,9))

In [67]:
# 10 draws from the standard normal; no seed is set, so the values change on every run
np.random.normal(size=10)

array([ 0.75881538, -1.48696954,  1.73094949,  1.17230872, -0.74435586,
       -1.73517367, -0.77471452, -0.31002784,  1.65878369,  0.80823087])

In [68]:
# size may be a tuple: a 4 x 2 array of standard-normal draws (unseeded)
np.random.normal(size=(4,2))

array([[-1.09082674, -0.51173145],
       [ 0.16324132, -0.66502979],
       [ 0.06641726, -1.29570296],
       [-0.44231781,  1.04770421]])

In [69]:
# With no arguments: a single float drawn from the exponential distribution (unseeded)
np.random.exponential()

0.2679535995256224

In [70]:
# First argument is the scale (1.0); size gives a 6 x 3 array of draws (unseeded)
np.random.exponential(1.0,size=(6,3))

array([[ 0.47119682,  1.4703567 ,  2.8245557 ],
       [ 0.10112585,  0.46947564,  0.82764098],
       [ 4.49198332,  0.24298975,  2.69078065],
       [ 1.01297616,  0.12627747,  0.60085753],
       [ 0.62270795,  0.33932342,  1.27564254],
       [ 0.83691427,  0.39803595,  3.3844492 ]])