# Python Library - NumPy

Import NumPy in Python

In [1]:
import numpy as np
#or use from numpy import array for example, so no need use np.array(), just use array(), not recommended
import pandas as pd

## 1. array properties
Array can only contain a single type, i.e. all entries are int, or all are floats, or all are booleans.

In [2]:
a = np.array([1.0, "is", True])
print(a)
#in this case everything converts into string

['1.0' 'is' 'True']


## 2. array creation from scratch

In [3]:
a=np.array([1,2,3])
print(a)
print(type(a))

[1 2 3]
<class 'numpy.ndarray'>


## 3. convert list into array

In [4]:
height = [1.73, 1.68, 1.71, 1.89, 1.79]
weight = [65.4, 59.2, 63.6, 88.4, 68.7]
np_weight = np.array(weight)
np_height = np.array(height)
print(np_weight)
print(np_height)
print(type(np_weight))
print(type(np_height))

[65.4 59.2 63.6 88.4 68.7]
[1.73 1.68 1.71 1.89 1.79]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## 4. element-wise computation in the whole array
note that adding two array will do element-wise addition, NOT stiching two array together (like list).

In [5]:
bmi = np_weight/np_height ** 2
print(bmi)

[21.85171573 20.97505669 21.75028214 24.7473475  21.44127836]


## 5. subsetting/slicing array

In [6]:
bmi[1]

20.97505668934241

## 6. subsetting/slicing with booleans

In [7]:
bmi > 23 #creating an array of 23, then compare element wise, fill in the boolean

array([False, False, False,  True, False])

In [8]:
bmi[bmi > 23]

array([24.7473475])

In [9]:
np.logical_and(bmi>21,bmi<22) #logical_or(),logical_not() for array equivalent, they do element-wise boolean compare

array([ True, False,  True, False,  True])

In [10]:
bmi[np.logical_and(bmi>21,bmi<22)]

array([21.85171573, 21.75028214, 21.44127836])

## 7. 2D array
2D array can be created from a list of list

In [11]:
np_2d = np.array([[1.73, 1.68, 1.71, 1.89, 1.79],[65.4, 59.2, 63.6, 88.4, 68.7]])
print(np_2d)
np_2d.shape #(row,col)

[[ 1.73  1.68  1.71  1.89  1.79]
 [65.4  59.2  63.6  88.4  68.7 ]]


(2, 5)

subsetting a row in 2D array

In [12]:
np_2d[0]

array([1.73, 1.68, 1.71, 1.89, 1.79])

subsetting a particular element(s)

In [13]:
np_2d[0][2]

1.71

In [14]:
np_2d[0,2] #alternatively, you can do this, think of it as intersection of row indice and col indice

1.71

In [15]:
np_2d[:,1:3] #select all row first, then get col 1 -> 3

array([[ 1.68,  1.71],
       [59.2 , 63.6 ]])

## 8. generating data

In [16]:
height = np.round(np.random.normal(1.75, 0.20, 5000),2) #normal dist, distribution mean,stdev,samples
weight = np.round(np.random.normal(60.32, 15, 5000),2)
np_city = np.column_stack((height,weight)) #paste the col together

## 9. simple stats (mean,median,corr,stdev)

In [17]:
print(np.mean(np_city[:,0])) #height mean
print(np.median(np_city[:,0])) #height median
print(np.corrcoef(np_city[:,0],np_city[:,1])) #correlation between height and weight
print(np.std(np_city[:,0]))

1.7471879999999997
1.74
[[1.         0.00709868]
 [0.00709868 1.        ]]
0.20121315229378023


## 10. for loop

In [18]:
print(bmi)
for val in bmi:
    print(val)

[21.85171573 20.97505669 21.75028214 24.7473475  21.44127836]
21.85171572722109
20.97505668934241
21.750282138093777
24.74734749867025
21.44127836209856


In [19]:
np_h=np.array([1.73,1.68,1.71,1.89,1.79])
np_w=np.array([65.4,59.2,63.6,88.4,68.7])
meas=np.array([np_h,np_w]) #2D array

for val in meas:
    print(val)
    
for val in np.nditer(meas): #first height(1st row), then weight(2nd row), use nditer function !
    print(val)

[1.73 1.68 1.71 1.89 1.79]
[65.4 59.2 63.6 88.4 68.7]
1.73
1.68
1.71
1.89
1.79
65.4
59.2
63.6
88.4
68.7


## 11. Import file

### 11.1 *loadtxt()*

tend to breakdown when we have mixed data type. Although numpy can handle it, but it is most natural to handle them in dataframe.

In [23]:
filename = "/Users/XavierTang/Documents/Data Science/Python/data/mnist_kaggle_some_rows.csv"
data = np.loadtxt(filename, delimiter = ",")
data

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

### 11.1.1 skip certain rows (e.g. first row for header)

In [21]:
filename = "/Users/XavierTang/Documents/Data Science/Python/data/mnist_kaggle_some_rows.csv"
data = np.loadtxt(filename, delimiter = ",", skiprows=1) #skip the first row for header
data

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

### 11.1.2 use only certain column

In [24]:
filename = "/Users/XavierTang/Documents/Data Science/Python/data/mnist_kaggle_some_rows.csv"
data = np.loadtxt(filename, delimiter = ",", usecols=[0,2]) #only use 1st AND 3rd columns
data

array([[1., 0.],
       [0., 0.],
       [1., 0.],
       [4., 0.],
       [0., 0.],
       [0., 0.],
       [7., 0.],
       [3., 0.],
       [5., 0.],
       [3., 0.],
       [8., 0.],
       [9., 0.],
       [1., 0.],
       [3., 0.],
       [3., 0.],
       [1., 0.],
       [2., 0.],
       [0., 0.],
       [7., 0.],
       [5., 0.],
       [8., 0.],
       [6., 0.],
       [2., 0.],
       [0., 0.],
       [2., 0.],
       [3., 0.],
       [6., 0.],
       [9., 0.],
       [9., 0.],
       [7., 0.],
       [8., 0.],
       [9., 0.],
       [4., 0.],
       [9., 0.],
       [2., 0.],
       [1., 0.],
       [3., 0.],
       [1., 0.],
       [1., 0.],
       [4., 0.],
       [9., 0.],
       [1., 0.],
       [4., 0.],
       [4., 0.],
       [2., 0.],
       [6., 0.],
       [3., 0.],
       [7., 0.],
       [7., 0.],
       [4., 0.],
       [7., 0.],
       [5., 0.],
       [1., 0.],
       [9., 0.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [3., 0.],
       [9., 0.

### 11.1.3 import only certain data type

in this file (seaslug.txt), first row is header (string), if directly use *loadtxt()*, it will give ValueError as it cannot convert string to float. Two ways around it:   
1) set data type argument to str (dtype = str)   
2) skip the first row

In [30]:
filename="/Users/XavierTang/Documents/Data Science/Python/data/seaslug.txt"
data = np.loadtxt(filename, delimiter = "\t", dtype=str)
print(data)

data_float = np.loadtxt(filename, delimiter="\t", dtype=float, skiprows=1)
print(data_float)

[['Time' 'Percent']
 ['99' '0.067']
 ['99' '0.133']
 ['99' '0.067']
 ['99' '0']
 ['99' '0']
 ['0' '0.5']
 ['0' '0.467']
 ['0' '0.857']
 ['0' '0.5']
 ['0' '0.357']
 ['0' '0.533']
 ['5' '0.467']
 ['5' '0.467']
 ['5' '0.125']
 ['5' '0.4']
 ['5' '0.214']
 ['5' '0.4']
 ['10' '0.067']
 ['10' '0.067']
 ['10' '0.333']
 ['10' '0.333']
 ['10' '0.133']
 ['10' '0.133']
 ['15' '0.267']
 ['15' '0.286']
 ['15' '0.333']
 ['15' '0.214']
 ['15' '0']
 ['15' '0']
 ['20' '0.267']
 ['20' '0.2']
 ['20' '0.267']
 ['20' '0.437']
 ['20' '0.077']
 ['20' '0.067']
 ['25' '0.133']
 ['25' '0.267']
 ['25' '0.412']
 ['25' '0']
 ['25' '0.067']
 ['25' '0.133']
 ['30' '0']
 ['30' '0.071']
 ['30' '0']
 ['30' '0.067']
 ['30' '0.067']
 ['30' '0.133']]
[[9.90e+01 6.70e-02]
 [9.90e+01 1.33e-01]
 [9.90e+01 6.70e-02]
 [9.90e+01 0.00e+00]
 [9.90e+01 0.00e+00]
 [0.00e+00 5.00e-01]
 [0.00e+00 4.67e-01]
 [0.00e+00 8.57e-01]
 [0.00e+00 5.00e-01]
 [0.00e+00 3.57e-01]
 [0.00e+00 5.33e-01]
 [5.00e+00 4.67e-01]
 [5.00e+00 4.67e-01]
 [5.

### 11.2 genfromtxt()

one column of data may contain strings and another floats, use *genfromtxt()* in this case.

In [32]:
file = "/Users/XavierTang/Documents/Data Science/Python/data/titanic_sub.csv"
data = np.genfromtxt(file, delimiter=",", names=True, dtype=None) #names = True -> got header
data["Survived"]

  


array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

### 11.3 recfromcsv()

similar to genfromtxt(), but default dtype=None, also included delimiter="," and names=True

In [33]:
file = "/Users/XavierTang/Documents/Data Science/Python/data/titanic_sub.csv"
data = np.recfromcsv(file)
data

  output = genfromtxt(fname, **kwargs)


rec.array([(  1, 0, 3, b'male', 22.  , 1, 0, b'A/5 21171',   7.25  , b'', b'S'),
           (  2, 1, 1, b'female', 38.  , 1, 0, b'PC 17599',  71.2833, b'C85', b'C'),
           (  3, 1, 3, b'female', 26.  , 0, 0, b'STON/O2. 3101282',   7.925 , b'', b'S'),
           (  4, 1, 1, b'female', 35.  , 1, 0, b'113803',  53.1   , b'C123', b'S'),
           (  5, 0, 3, b'male', 35.  , 0, 0, b'373450',   8.05  , b'', b'S'),
           (  6, 0, 3, b'male',   nan, 0, 0, b'330877',   8.4583, b'', b'Q'),
           (  7, 0, 1, b'male', 54.  , 0, 0, b'17463',  51.8625, b'E46', b'S'),
           (  8, 0, 3, b'male',  2.  , 3, 1, b'349909',  21.075 , b'', b'S'),
           (  9, 1, 3, b'female', 27.  , 0, 2, b'347742',  11.1333, b'', b'S'),
           ( 10, 1, 2, b'female', 14.  , 1, 0, b'237736',  30.0708, b'', b'C'),
           ( 11, 1, 3, b'female',  4.  , 1, 1, b'PP 9549',  16.7   , b'G6', b'S'),
           ( 12, 1, 1, b'female', 58.  , 0, 0, b'113783',  26.55  , b'C103', b'S'),
           ( 13, 0,

## 12. Stacking Arrays

In [5]:
#using number created by consective interger using range()
#transforming 1D array into rectangular shape using reshape()
#these are matrix
A = np.arange(8).reshape(2,4) + 0.1
B = np.arange(6).reshape(2,3) + 0.2
C = np.arange(12).reshape(3,4) + 0.3
print(A)
print(B)
print(C)

[[0.1 1.1 2.1 3.1]
 [4.1 5.1 6.1 7.1]]
[[0.2 1.2 2.2]
 [3.2 4.2 5.2]]
[[ 0.3  1.3  2.3  3.3]
 [ 4.3  5.3  6.3  7.3]
 [ 8.3  9.3 10.3 11.3]]


### 12.1 Stacking arrays horizontally

the number of rows of two array must match

In [6]:
np.hstack([B,A]) #input is a list of np arrays

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [7]:
np.concatenate([B,A],axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

### 12.2 Stacking arrays vertically 

the number of cols of two array must match

In [8]:
np.vstack([A,C]) #input is a list of np arrays

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [10]:
np.concatenate([A,C],axis=0)

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

## 13. meshgrid()

In [4]:
u = np.linspace(-2,2,3)
v =np.linspace(-1,1,5)
print(u)
print(v)

[-2.  0.  2.]
[-1.  -0.5  0.   0.5  1. ]


In [7]:
X,Y=np.meshgrid(u,v)
print(X)
print(Y)

[[-2.  0.  2.]
 [-2.  0.  2.]
 [-2.  0.  2.]
 [-2.  0.  2.]
 [-2.  0.  2.]]
[[-1.  -1.  -1. ]
 [-0.5 -0.5 -0.5]
 [ 0.   0.   0. ]
 [ 0.5  0.5  0.5]
 [ 1.   1.   1. ]]
