In [26]:
import numpy as np
import math

# Numpy Array

## ndarrays  
N-dimensional array is a key feature of Numpy.


In [2]:
#using random numbers to create ndarray
#create a tuple with 2 rows and 3 columns
data = np.random.randn(2,3) 
data

array([[ 0.33202789, -1.99802173, -1.31018884],
       [ 1.21002326, -0.25079765,  0.09796337]])

We can perform some mathematical operations with ndarray objects.

In [3]:
data + data

array([[ 0.66405579, -3.99604345, -2.62037769],
       [ 2.42004652, -0.50159531,  0.19592674]])

In [4]:
data * 100

array([[  33.2027894 , -199.80217257, -131.01888429],
       [ 121.00232585,  -25.07976537,    9.79633705]])

ndarray objects has two attributes: **`ndim`** and **`shape`**.

In [5]:
data.ndim

2

In [6]:
data.shape

(2, 3)

We can also see the data type of elements in an array.

In [7]:
data.dtype #float numbers

dtype('float64')

In [8]:
data_2 = np.random.randint(1,100,(5,5))

In [9]:
data_2.dtype #integers

dtype('int64')

## Array Creation




### 1 From a List


In [10]:
#unidimensional array
a = np.array([1,2,3])
a

array([1, 2, 3])

In [11]:
a.ndim

1

In [13]:
#multidimensional array
b = np.array([[11,12,13],[3.44,5.66,7.99], 
             ['a', 'b', 'c']])
b

array([['11', '12', '13'],
       ['3.44', '5.66', '7.99'],
       ['a', 'b', 'c']], dtype='<U32')

In [14]:
b.ndim

2

In [15]:
b.dtype

dtype('<U32')

### 2 Using Array Creation functions

In [16]:
#ones
np.ones((2,3)) #2 rows and 3 columns

array([[1., 1., 1.],
       [1., 1., 1.]])

In [17]:
#zeros
np.zeros([3,4,5])

array([[[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]])

In [21]:
#identity
np.identity(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

### 3 From random number generators

In [23]:
#rand
np.random.rand(2,3) #no need to use a tuple

array([[0.25253682, 0.94373492, 0.04287293],
       [0.03187186, 0.68417324, 0.90639375]])

In [24]:
#randn - std norm dist
np.random.randn(3,4)

array([[ 1.45272645, -1.21798365, -0.27133463,  1.33254075],
       [-0.47361879, -1.4624039 , -0.84010577, -0.90515055],
       [ 0.12042047, -0.89522936, -0.26790567,  2.02870152]])

In [25]:
#randint
#use a tuple to describe the size
np.random.randint(1,100,(2,3)) 

array([[94, 70, 24],
       [67, 45, 37]])

### 4 `arange` and `linspace`

In [26]:
#arange
#Return evenly spaced values within a given interval.
np.arange(1,20,2,dtype='float64') 
#the third argument is step (difference)

array([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19.])

In [27]:
#linspace
#Return evenly spaced numbers over a specified interval.
np.linspace(1,20,10)
#the third argument is the total number of elements in this array.

array([ 1.        ,  3.11111111,  5.22222222,  7.33333333,  9.44444444,
       11.55555556, 13.66666667, 15.77777778, 17.88888889, 20.        ])

## Array Operations

### Arithmetic with Arrays

In [28]:
a = np.array([1,2,3])
b = np.array([4,5,6])

print(b-a)
print(b % a)

[3 3 3]
[0 1 0]


### Boolean Operations

In [29]:
temp_f = np.array([23,13,20,15,18])
temp_c = (temp_f - 31)*(5/9)
temp_c

array([ -4.44444444, -10.        ,  -6.11111111,  -8.88888889,
        -7.22222222])

In [30]:
#comparison 
temp_c >= -5

array([ True, False, False, False, False])

In [31]:
temp_c % 2 == 0

array([False,  True, False, False, False])

### Aggregation Functions

In [38]:
arr_1 = np.random.randint(1,10,(4,3))
arr_2 = np.random.randn(4,3)
arr_3 = arr_1 + arr_2
arr_3

array([[ 0.59594597,  1.50512685,  2.08522965],
       [10.71753974,  7.60702711,  4.53455717],
       [ 7.42738127,  4.6259718 ,  4.26798703],
       [ 0.71458707,  8.6850562 ,  5.45572279]])

In [41]:
#sum
print(arr_3.sum(axis=0)) 
#sum along each row = sum of each column
print(arr_3.sum(axis=1))
#sum along each column = sum of each row
print(arr_3.sum())
#sum of the array

[19.45545405 22.42318197 16.34349665]
[ 4.18630247 22.85912402 16.32134011 14.85536606]
58.22213266768099


In [43]:
#min/max
print(arr_3.min(axis=0))
print(arr_3.max())

[0.59594597 1.50512685 2.08522965]
10.71753974121848


In [45]:
#mean
arr_3.mean(axis=1)

array([1.39543416, 7.61970801, 5.4404467 , 4.95178869])

### Reshape Function

In [24]:
a = np.arange(32)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [25]:
a.reshape(8,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

## Slicing and Indexing

### 1 Indexing using Integers

In [6]:
#unidimensional
a = np.array([1,2,3,4])
print(a[0]) #start from zero
print(a[2])
print(a[1:]) #use colon from the second to the end
print(a[:])  #bare slice - select all

1
3
[2 3 4]
[1 2 3 4]


We can also use slicing and indexing to propagate values to an array.

In [7]:
a[2:] = 123
a

array([  1,   2, 123, 123])

In [8]:
#multidimensional
a = np.random.randint(1,100,(4,4))
a

array([[72, 25, 88, 82],
       [39, 90, 22, 42],
       [19,  2, 74,  6],
       [60, 70, 54, 33]])

In [9]:
#arr[row_int, col_int]
#to choose 2, which is in row3, column2
#remember in python we start at 0
a[2,1]

2

In [10]:
#get multiple results
a[[1,3], 2:]
# choose row 2 and 4; column 3 onwards

array([[22, 42],
       [54, 33]])

### 2 Boolean Indexing  
This allows us to select arbitrary elements based on certain **conditions**.


In [11]:
a

array([[72, 25, 88, 82],
       [39, 90, 22, 42],
       [19,  2, 74,  6],
       [60, 70, 54, 33]])

In [12]:
a % 2 == 0 #condition

array([[ True, False,  True,  True],
       [False,  True,  True,  True],
       [False,  True,  True,  True],
       [ True,  True,  True, False]])

In [13]:
a[ a % 2 == 0] #wrap up and select only if True

array([72, 88, 82, 90, 22, 42,  2, 74,  6, 60, 70, 54])

### 3 More about Slicing

Slicing is a way to create a **sub-array** based on the original array. We use brackets to indicate that we are to slice a sub-array.

In [14]:
#unidimensional
a = np.array([1,2,3,4])
a[:2] #from 0 to 1, not 2!

array([1, 2])

In [16]:
#multidimensional 
a = np.array([[1,2,3,4],
             [5,6,7,8],
             [9,10,11,12]])
print('sub array index [1,2] value before change:', a[1,2])
a[1,2] = 100
print('sub array index [1,2] value after change:', a[1,2])

sub array index [1,2] value before change: 7
sub array index [1,2] value after change: 100


### 4 Fancy Indexing

Fancy indexing is a term adopted by Numpy to describe **indexing using integer arrays**.

In [20]:
#Example 1
arr = np.empty((8,5))

for i in range(8):
  arr[i] = i**2

arr

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 4.,  4.,  4.,  4.,  4.],
       [ 9.,  9.,  9.,  9.,  9.],
       [16., 16., 16., 16., 16.],
       [25., 25., 25., 25., 25.],
       [36., 36., 36., 36., 36.],
       [49., 49., 49., 49., 49.]])

We can pass a list to select rows in a specific order.

In [21]:
arr[[7,1,2,3],:]

array([[49., 49., 49., 49., 49.],
       [ 1.,  1.,  1.,  1.,  1.],
       [ 4.,  4.,  4.,  4.,  4.],
       [ 9.,  9.,  9.,  9.,  9.]])

We can use negative indices to select from the end.

In [22]:
arr[-1] - arr[7]

array([0., 0., 0., 0., 0.])

In [23]:
arr[[-1,-2,-2,-7]]

array([[49., 49., 49., 49., 49.],
       [36., 36., 36., 36., 36.],
       [36., 36., 36., 36., 36.],
       [ 1.,  1.,  1.,  1.,  1.]])

# Applying NumPy to datasets

First, let's look at a popular dataset on wine quality. We are only looking at red wine and the data fields include:  
- fixed acidity, 
- volatile aciditycitric acid, 
- residual sugar, 
- chlorides, 
- free sulfur dioxide,
- total sulfur dioxidedensity, 
- pH, 
- sulphates, 
- alcohol, 
- quality

In [27]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [28]:
wines = np.genfromtxt('/content/gdrive/MyDrive/Intro to DS with Python (Umich)/class resources/Course 1 - Notebook Resources/resources/week-1/datasets/winequality-red.csv',
                      delimiter = ';',
                      skip_header=1)
wines

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In [29]:
#slice all rows but only the first column
wines[:, 0]

array([7.4, 7.8, 7.8, ..., 6.3, 5.9, 6. ])

In [32]:
#preserve the results in their own rows
wines[:, 0:1]

array([[7.4],
       [7.8],
       [7.8],
       ...,
       [6.3],
       [5.9],
       [6. ]])

In [33]:
#selecting non-consecutive columns
wines[: , [1,3,5]]

array([[ 0.7  ,  1.9  , 11.   ],
       [ 0.88 ,  2.6  , 25.   ],
       [ 0.76 ,  2.3  , 15.   ],
       ...,
       [ 0.51 ,  2.3  , 29.   ],
       [ 0.645,  2.   , 32.   ],
       [ 0.31 ,  3.6  , 18.   ]])

In [34]:
#aggregation 
wines[: , 2].mean()

0.2709756097560976

Now, let's look at a admission dataset.

In [None]:
grad_admin = np.genfromtxt('/content/gdrive/MyDrive/Intro to DS with Python (Umich)/class resources/Course 1 - Notebook Resources/resources/week-1/datasets/Admission_Predict.csv',
                           names=('Serial No','GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
                                          'LOR','CGPA','Research', 'Chance of Admit'),
                           dtype = None, delimiter = ',',
                           skip_header=1)
grad_admin

In [36]:
#retrieve data according to the names of columns
grad_admin['CGPA'][:5]

array([9.65, 8.87, 8.  , 8.67, 8.21])

In [37]:
#transform data
grad_admin['CGPA'] = grad_admin['CGPA'] / 10 * 4
grad_admin['CGPA'][:10]

array([3.86 , 3.548, 3.2  , 3.468, 3.284, 3.736, 3.28 , 3.16 , 3.2  ,
       3.44 ])

In [38]:
#how many applicants are interested in research
#programs?
len(grad_admin[ grad_admin['Research'] == 1])

219

In [43]:
#find out the average GRE scores of students
#with high probability of admission (>0.8)
grad_admin[ grad_admin['Chance_of_Admit'] > 0.8]['GRE_Score'].mean()

328.7350427350427

In [40]:
#Do the same thing with CGPA
grad_admin[ grad_admin['Chance_of_Admit'] > 0.9]['CGPA'].mean()

3.8147755102040817