## Data Manipulation / Type Review Numpy

#### Array Creation

In [1]:
import numpy as np

In [3]:
a = np.array([1,2,3])
a, a.ndim

(array([1, 2, 3]), 1)

In [4]:
b = np.array([[0,1,2], [3,4,5]])
b, b.ndim, b.shape

(array([[0, 1, 2],
        [3, 4, 5]]),
 2,
 (2, 3))

In [7]:
c = np.array([2.2, 5, 1.1])
c.dtype.name, c

('float64', array([2.2, 5. , 1.1]))

In [10]:
# Fill our numpy array with *dummy values*
d = np.zeros((2,3))
print(d)
e = np.ones((2,3))
print(e)

[[0. 0. 0.]
 [0. 0. 0.]]
[[1. 1. 1.]
 [1. 1. 1.]]


In [11]:
# Generate with Random
np.random.rand(2,3)

array([[0.96779597, 0.02456816, 0.55818037],
       [0.98765728, 0.17542609, 0.97968405]])

* Commonly see `zeros`, `ones`, and `rand` use quite of to create example arrays

In [12]:
f = np.arange(10,50,2)
f

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
       44, 46, 48])

In [13]:
# Generate a sequence of floats, we can use the `linspace()` function. Function the third argument
# isn't the difference between two numbers, but the total number of items you want to generate
np.linspace(0, 2, 15)

array([0.        , 0.14285714, 0.28571429, 0.42857143, 0.57142857,
       0.71428571, 0.85714286, 1.        , 1.14285714, 1.28571429,
       1.42857143, 1.57142857, 1.71428571, 1.85714286, 2.        ])

* We want 15 numbers between 0 and 2 which are both inclusive values

### Array Operations

In [16]:
# Arithemetic operators on array apply element wise
a = np.array([10,20,30,40])
b = np.array([1,2,3,4])

print(b - a, a - b, a * b, b + a, a-1)

[ -9 -18 -27 -36] [ 9 18 27 36] [ 10  40  90 160] [11 22 33 44] [ 9 19 29 39]


* Common way to think is a `broadcast` of the operation across each element

In [19]:
farehnheit = np.array([0, -10, -5, -15, 0])

# formula for conversion is (F - 32) * 5/9
celsius = (farehnheit - 31) * 5/9
print(farehnheit, celsius)

[  0 -10  -5 -15   0] [-17.22222222 -22.77777778 -20.         -25.55555556 -17.22222222]


#### Boolean Array

In [20]:
celsius % 2 == 0

array([False, False,  True, False, False])

In [21]:
# Element Wise or matrix product
A = np.array([[1,1], [0, 1]])
B = np.array([[2,0], [3,4]])
print(A * B)

[[2 0]
 [0 4]]


In [22]:
print(A@B)

[[5 4]
 [3 4]]


* Calculated As [[1x2 + 1x3, 1x0 + 1x4], [0x2 + 1x3, 0x0 + 1x4]]

In [23]:
np.dot(A, B)

array([[5, 4],
       [3, 4]])

* https://www.sharpsightlabs.com/blog/numpy-dot/

```python
import numpy.matlib 
import numpy as np 

a = np.array([[1,2],[3,4]]) 
b = np.array([[11,12],[13,14]]) 
np.dot(a,b)

[[37  40] 
[85  92]] 
```

* Calculated as 
[[1x11 + 2x13, 1x12 + 2x14],[3x11 + 4x13, 3x12 + 4x14]]

In [24]:
a = np.array([[1,2],[3,4]]) 
b = np.array([[11,12],[13,14]]) 
np.dot(a,b)

array([[37, 40],
       [85, 92]])

In [25]:
# When manipulating arrays of different types, the type of the resulting array wil lcorrespond to
# the more general of the two type. This is called upcasting 

array1 = np.array([[1,2,3], [4,5,6]])
print(array1.dtype)

array2 = np.array([[7.1,8.2,9.1], [10.4, 11, 12.3]])
print(array2.dtype)

int64
float64


In [26]:
array3 = array1 + array2
array3

array([[ 8.1, 10.2, 12.1],
       [14.4, 16. , 18.3]])

In [27]:
print(array3.sum(), array3.max(), array3.mean(), array3.min())

79.1 18.3 13.183333333333332 8.1


In [28]:
b = np.arange(1,16,1).reshape(3,5)
b

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

### Indexing, Slicing & Iterating

In [29]:
a = np.array([1,3,5,7])
a[2]

5

In [30]:
a = np.array([[1,2], [3,4], [5,6]])
a

array([[1, 2],
       [3, 4],
       [5, 6]])

In [31]:
a[1,1]

4

In [32]:
# Get multiple elements
print(np.array([a[0, 1], a[1, 0], a[2, 1]])) # [2, 3, 6]

[2 3 6]


In [33]:
# Same way but zipping up the numbers you're after (just match the index of each argument above for a shortcut)
print(a[[0, 1, 2], [1, 0, 1]])

[2 3 6]


In [34]:
print(a>5)

[[False False]
 [False False]
 [False  True]]


In [36]:
print(a[a>5]) # only retun the True boolean mask (or result of broadcast)

[6]


In [38]:
# Slicing
a = np.array([0,1,2,3,4,5])
a[:2], a[2:5]

(array([0, 1]), array([2, 3, 4]))

In [39]:
# multi-dimensional
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
a

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [40]:
a[:1] # row returned for the slice

array([[1, 2, 3, 4]])

In [41]:
a[:1, :2] # first row, first two columns

array([[1, 2]])

In [43]:
a[2, 2:]

array([11, 12])

### With Datasets

In [46]:
wines = np.genfromtxt('Datasets/winequality-red.csv', delimiter=';', skip_header=1) # skip header row
wines.shape, type(wines)

((1599, 12), numpy.ndarray)

In [47]:
wines[:5, :] # preview the numpy array

array([[7.400e+00, 7.000e-01, 0.000e+00, 1.900e+00, 7.600e-02, 1.100e+01,
        3.400e+01, 9.978e-01, 3.510e+00, 5.600e-01, 9.400e+00, 5.000e+00],
       [7.800e+00, 8.800e-01, 0.000e+00, 2.600e+00, 9.800e-02, 2.500e+01,
        6.700e+01, 9.968e-01, 3.200e+00, 6.800e-01, 9.800e+00, 5.000e+00],
       [7.800e+00, 7.600e-01, 4.000e-02, 2.300e+00, 9.200e-02, 1.500e+01,
        5.400e+01, 9.970e-01, 3.260e+00, 6.500e-01, 9.800e+00, 5.000e+00],
       [1.120e+01, 2.800e-01, 5.600e-01, 1.900e+00, 7.500e-02, 1.700e+01,
        6.000e+01, 9.980e-01, 3.160e+00, 5.800e-01, 9.800e+00, 6.000e+00],
       [7.400e+00, 7.000e-01, 0.000e+00, 1.900e+00, 7.600e-02, 1.100e+01,
        3.400e+01, 9.978e-01, 3.510e+00, 5.600e-01, 9.400e+00, 5.000e+00]])

In [48]:
# So all rows combined but only the first column from them would be
print("one integer 0 for slicing: ", wines[:, 0])
# But if we wanted the same values but wanted to "prserve" that they sit in their own rouws we would write
print("0 to 1 for slicing: \n", wines[:, 0:1])

one integer 0 for slicing:  [7.4 7.8 7.8 ... 6.3 5.9 6. ]
0 to 1 for slicing: 
 [[7.4]
 [7.8]
 [7.8]
 ...
 [6.3]
 [5.9]
 [6. ]]


In [49]:
# Non consecutive, all rows and only certain columns
wines[:, [2, 3, 5]]

array([[ 0.  ,  1.9 , 11.  ],
       [ 0.  ,  2.6 , 25.  ],
       [ 0.04,  2.3 , 15.  ],
       ...,
       [ 0.13,  2.3 , 29.  ],
       [ 0.12,  2.  , 32.  ],
       [ 0.47,  3.6 , 18.  ]])

In [53]:
# last column mean
wines[:, -1].mean(), len(wines[:, -1]), wines[:, -1].sum() / len(wines[:, -1])

(5.6360225140712945, 1599, 5.6360225140712945)

In [55]:
# Have numpy try and infer type
grad_admission = np.genfromtxt('Datasets/Admission_Predict.csv', dtype=None,
                              delimiter=',', skip_header=1, names=[
                                  'Serial No', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
                                  'LOR', 'CGPA', 'Research', 'Chance of Admit'
                              ])
grad_admission[:5]

array([(1, 337, 118, 4, 4.5, 4.5, 9.65, 1, 0.92),
       (2, 324, 107, 4, 4. , 4.5, 8.87, 1, 0.76),
       (3, 316, 104, 3, 3. , 3.5, 8.  , 1, 0.72),
       (4, 322, 110, 3, 3.5, 2.5, 8.67, 1, 0.8 ),
       (5, 314, 103, 2, 2. , 3. , 8.21, 0, 0.65)],
      dtype=[('Serial_No', '<i8'), ('GRE_Score', '<i8'), ('TOEFL_Score', '<i8'), ('University_Rating', '<i8'), ('SOP', '<f8'), ('LOR', '<f8'), ('CGPA', '<f8'), ('Research', '<i8'), ('Chance_of_Admit', '<f8')])

In [56]:
grad_admission.shape

(400,)

In [58]:
# Resulting array is a one-dimension array with 400 tuples
# CGPA column
grad_admission['CGPA'][:5]

array([9.65, 8.87, 8.  , 8.67, 8.21])

In [59]:
# Convert CGPA 
grad_admission['CGPA'] = grad_admission['CGPA'] / 10 * 4
grad_admission['CGPA'][:5]

array([3.86 , 3.548, 3.2  , 3.468, 3.284])

In [60]:
# Recall boolean masking, We can use this to find out how many students have had research experience by
# creating a boolean mask and passing to the array indexing operator
len(grad_admission[grad_admission['Research'] == 1])

219

In [64]:
print(grad_admission[grad_admission['Research'] == 1][:5])

[(1, 337, 118, 4, 4.5, 4.5, 3.86 , 1, 0.92)
 (2, 324, 107, 4, 4. , 4.5, 3.548, 1, 0.76)
 (3, 316, 104, 3, 3. , 3.5, 3.2  , 1, 0.72)
 (4, 322, 110, 3, 3.5, 2.5, 3.468, 1, 0.8 )
 (6, 330, 115, 5, 4.5, 3. , 3.736, 1, 0.9 )]


In [70]:
print(type(grad_admission['Research']==1), grad_admission['Research']==1)

<class 'numpy.ndarray'> [ True  True  True  True False  True  True False False False  True  True
  True  True  True False False  True False False  True False  True  True
  True  True False  True False False  True  True  True  True  True  True
 False False False False  True  True  True False  True  True  True False
  True  True  True  True  True  True False False False False  True False
 False False  True  True False False False  True  True  True  True  True
  True  True False  True  True False  True False  True  True  True  True
  True False False False False  True  True False False  True False False
 False  True  True  True  True False False False  True  True  True  True
  True False False  True  True False  True  True False False False  True
  True  True False False False  True  True  True  True  True  True False
 False False  True  True False  True  True False  True  True  True  True
  True  True False  True  True  True  True  True  True False False False
 False False False False Fa

* Mask shows how the rows or index selector passes whether the condition was true as a row for the boolean selector to check

In [74]:
# Look at the mean CGPA based on the chance of admission mask
print(grad_admission[grad_admission['Chance_of_Admit'] > 0.8]['CGPA'].mean())
print(grad_admission[grad_admission['Chance_of_Admit'] < 0.4]['CGPA'].mean())

3.7106666666666666
3.0222857142857142


In [76]:
a1 = np.random.rand(4)
a2 = np.random.rand(4, 1)
a3 = np.array([[1, 2, 3, 4]])
a4 = np.arange(1, 4, 1)
a5 = np.linspace(1 ,4, 4)

In [79]:
a1, a2, a3, a4, a5

(array([0.02223133, 0.31210975, 0.1369326 , 0.14749916]),
 array([[0.23955481],
        [0.52941134],
        [0.87290363],
        [0.61388699]]),
 array([[1, 2, 3, 4]]),
 array([1, 2, 3]),
 array([1., 2., 3., 4.]))

In [81]:
a5.shape == a1.shape

True

In [87]:
import re
s = 'ACAABAACAAAB'
result = re.findall('A{1,2}', s)
L = len(result)
result, L

(['A', 'AA', 'AA', 'AA', 'A'], 5)

* Which of the following is the correct regular expression to extract all the phone numbers from the following chunk of text:

1) Office of Research Administration: (734) 647-6333 | 4325 North Quad
2) Office of Budget and Financial Administration: (734) 647-8044 | 309 Maynard, Suite 205
3) Health Informatics Program: (734) 763-2285 | 333 Maynard, Suite 500
4) Office of the Dean: (734) 647-3576 | 4322 North Quad
5) UMSI Engagement Center: (734) 763-1251 | 777 North University
6) Faculty Adminstrative Support Staff: (734) 764-9376 | 4322 North Quad

* [(]\d{3}[)]\s\d{3}[-]\d{4}