# Numpy

Arrays have:
- fixed size
- same data type in the whole array

In [3]:
import numpy as np

### 1D arrays

In [5]:
a = np.array([1, 2, 3, 4, 5])

a

array([1, 2, 3, 4, 5])

In [6]:
a.dtype

dtype('int64')

In [7]:
b = np.array([1, 2, 3, 4, 5], dtype='float32')

b

array([1., 2., 3., 4., 5.], dtype=float32)

In [8]:
b.size

5

In [9]:
b.shape

(5,)

### 2D arrays

In [10]:
c = np.array([[1, 2, 3, 4, 5],
              [6, 7, 8, 9, 10]])

c

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [12]:
c.size

10

In [11]:
c.shape

(2, 5)

### Indexing and slicing

With integer

In [15]:
a[3] = 44

a

array([ 1,  2,  3, 44,  5])

With slice

In [16]:
a[1:4]

array([ 2,  3, 44])

With list

In [18]:
indices = [0, 1, 3, 1]

a[indices]

array([ 1,  2, 44,  2])

With mask

In [19]:
mask = [True, True, False, True, False]

a[mask]

array([ 1,  2, 44])

2D

In [22]:
c[1, 3]

9

In [24]:
c[1, :3]  # :3 --> from beginning to index 3 (not including)
          # i.e. "first 3 elements"

array([6, 7, 8])

In [27]:
c[0, 1:4]  # [2, 3, 4]

array([2, 3, 4])

In [28]:
c[:, 3:]   # [[4, 5]
           #  [9, 10]]
           # 3: --> from index 3 (including) to end 

array([[ 4,  5],
       [ 9, 10]])

In [29]:
c

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [30]:
c[:, -2:]  # -2: --> from index -2 (including) to end 
           # i.e. "last two elements"

array([[ 4,  5],
       [ 9, 10]])

In [32]:
c[[0, 0, 1, 1], [3, 4, 0, 1]]  # [4, 5, 6, 7]

array([4, 5, 6, 7])

In [38]:
c

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [39]:
c[[True, False], 2]

array([3])

In [40]:
c[[0, 0, 1], :2]

array([[1, 2],
       [1, 2],
       [6, 7]])

## Operations

By default ops between arrays are elementwise

In [41]:
a

array([ 1,  2,  3, 44,  5])

In [42]:
b

array([1., 2., 3., 4., 5.], dtype=float32)

In [43]:
a + b  # elementwise addition

array([ 2.,  4.,  6., 48., 10.])

In [44]:
a - b  # elementwise subtraction

array([ 0.,  0.,  0., 40.,  0.])

In [45]:
a * b  # elementwise multiplication

array([  1.,   4.,   9., 176.,  25.])

In [46]:
a / b  # elementwise division

array([ 1.,  1.,  1., 11.,  1.])

In [48]:
d = np.array([[1, 2],
              [6, 2],
              [5, 2],
              [6, 1],
              [1, 1]])

In [51]:
np.dot(c, d)  # matrix multiplication

array([[ 57,  21],
       [152,  61]])

In [52]:
c @ d  # matrix multiplication

array([[ 57,  21],
       [152,  61]])

In [53]:
c * d  # can't elementwise multiply these arrays: shapes (2, 5) and (5, 2) are not broadcast-compatible

ValueError: ignored

Broadcasting

![](https://i.stack.imgur.com/JcKv1.png)

In [54]:
c

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [55]:
a

array([ 1,  2,  3, 44,  5])

In [56]:
c * a

array([[  1,   4,   9, 176,  25],
       [  6,  14,  24, 396,  50]])

In [57]:
a2 = np.array([[ 1,  2,  3, 44,  5],
               [ 1,  2,  3, 44,  5]])

c * a2

array([[  1,   4,   9, 176,  25],
       [  6,  14,  24, 396,  50]])

In [60]:
c - 10

array([[-9, -8, -7, -6, -5],
       [-4, -3, -2, -1,  0]])

### Creating arrays

In [63]:
np.arange(5)  # equivalent to python's range but returns array

array([0, 1, 2, 3, 4])

In [64]:
np.arange(1, 10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [65]:
np.arange(1, 10, 2)

array([1, 3, 5, 7, 9])

In [66]:
np.linspace(0, 7, 20)  # create 20 evenly spaced values from 0 to 7, both endpoints included (i.e. 19 equal steps)

array([0.        , 0.36842105, 0.73684211, 1.10526316, 1.47368421,
       1.84210526, 2.21052632, 2.57894737, 2.94736842, 3.31578947,
       3.68421053, 4.05263158, 4.42105263, 4.78947368, 5.15789474,
       5.52631579, 5.89473684, 6.26315789, 6.63157895, 7.        ])

In [68]:
np.zeros(shape=(2, 5))  # create array with shape (2, 5) filled with zeros

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [69]:
np.random.random(size=20)  # create an array with 20 random values from 0 to 1

array([0.23238019, 0.15219462, 0.77472454, 0.06419771, 0.56919991,
       0.77629892, 0.05937261, 0.74056963, 0.08119358, 0.55147968,
       0.89636093, 0.53784587, 0.05575655, 0.61700024, 0.03697198,
       0.91259831, 0.23774385, 0.23796646, 0.87915133, 0.02178697])

### Reshaping arrays

In [71]:
c

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [73]:
np.arange(1, 11).reshape(2, 5)

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [76]:
c.reshape(-1, 2)

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10]])

In [77]:
c.T  # transpose

array([[ 1,  6],
       [ 2,  7],
       [ 3,  8],
       [ 4,  9],
       [ 5, 10]])

# Pandas

Data structures:
- **Series** <-- we will discuss today
- DataFrame

In [79]:
import pandas as pd

In [81]:
ser = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

Indexing/slicing with position

In [82]:
ser.iloc[2]  # integer

3

In [83]:
ser.iloc[2:]  # slice

c    3
d    4
e    5
dtype: int64

In [84]:
ser.iloc[[2, 1, 4]]  # list

c    3
b    2
e    5
dtype: int64

In [85]:
ser.iloc[[True, False, True, True, False]]  # mask

a    1
c    3
d    4
dtype: int64

Indexing/slicing with names

In [87]:
ser.loc['c']  # with name

3

In [88]:
ser.loc['c':]  # slice

c    3
d    4
e    5
dtype: int64

In [90]:
ser.loc[['c', 'b', 'e']]  # list

c    3
b    2
e    5
dtype: int64

In [91]:
ser.loc[[True, False, True, True, False]]  # mask

a    1
c    3
d    4
dtype: int64

Difference with slicing

In [92]:
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [93]:
ser.loc['b': 'd']  # inclusive

b    2
c    3
d    4
dtype: int64

In [95]:
ser.iloc[1: 3]  # not inclusive

b    2
c    3
dtype: int64

In [97]:
ser2 = pd.Series([1, 2, 3, 4, 5])

In [99]:
ser2.loc[2]

3

In [102]:
ser2.iloc[2]

3

In [103]:
ser2.loc[2:4]

2    3
3    4
4    5
dtype: int64

In [104]:
ser2.iloc[2:4]

2    3
3    4
dtype: int64

## Built-in methods

Statistical aggregations

In [113]:
print(f'{ser.min() = }')
print(f'{ser.max() = }')
print(f'{ser.mean() = }')
print(f'{ser.sum() = }')
print(f'{ser.prod() = }')
print(f'{ser.var() = }')
print(f'{ser.std() = }')

ser.min() = 1
ser.max() = 5
ser.mean() = 3.0
ser.sum() = 15
ser.prod() = 120
ser.var() = 2.5
ser.std() = 1.5811388300841898


Unique values and histograms

In [121]:
ser = pd.Series([1, 2, 1, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 2, 3, 5,
                 1, 2, 3, 4, 2, 1, 4, 2, 1, 1, 2, 2, 3, 1, 2, 1])

In [122]:
ser.unique()  # unique values in series

array([1, 2, 3, 4, 5])

In [123]:
ser.value_counts()  # histogram of series' values

1    11
2    11
3     6
4     3
5     1
dtype: int64

Change values in a series

In [127]:
values_map = {1: 1, 2: 2, 3: 3, 4: 3, 5: 3}

ser.map(values_map).value_counts()

# map is not an inplace operation, if I wanted persistent changes:
# ser = ser.map(values_map)

1    11
2    11
3    10
dtype: int64

In [130]:
ser2 = pd.Series(['a']* 20 + ['b'] * 10 + ['c'] * 15 + ['d'] * 5)

ser2.value_counts()

a    20
c    15
b    10
d     5
dtype: int64

In [132]:
capitals_map = {'a': 'A', 'b' : 'B', 'c': 'C', 'd': 'D'}

ser2 = ser2.map(capitals_map)

In [135]:
ser2.value_counts()

A    20
C    15
B    10
D     5
dtype: int64

In [136]:
capitals_map = {'A': 'A', 'B' : 'BD', 'C': 'C', 'D': 'BD'}

ser2 = ser2.map(capitals_map)

ser2.value_counts()

A     20
BD    15
C     15
dtype: int64

Missing values

In [152]:
ser[[3, 15, 21, 27, 30]] = np.nan  # replace some values with NaN

In [153]:
ser.isna()  # Binary mask showing if an element is NaN or not

0     False
1     False
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15     True
16    False
17    False
18    False
19    False
20    False
21     True
22    False
23    False
24    False
25    False
26    False
27     True
28    False
29    False
30     True
31    False
dtype: bool

In [154]:
print(f'Does my series have any missing values? {ser.isna().any()}')
print(f'How many missing values does my array have? {ser.isna().sum()}')

Does my series have any missing values? True
How many missing values does my array have? 5


In [155]:
print('Which values are missing?')

ser[ser.isna()]

Which values are missing?


3    NaN
15   NaN
21   NaN
27   NaN
30   NaN
dtype: float64

In [156]:
ser2 = ser.fillna(ser.mean())  # fill missing values with series mean

In [157]:
print(f'Does my series have any missing values? {ser2.isna().any()}')
print(f'How many missing values does my array have? {ser2.isna().sum()}')

Does my series have any missing values? False
How many missing values does my array have? 0


In [160]:
ser3 = ser.dropna()  # drop missing values entirely

Inspection

In [162]:
ser.head(7)  # return first 7 elements in series

0    1.0
1    2.0
2    1.0
3    NaN
4    2.0
5    3.0
6    1.0
dtype: float64

In [163]:
ser.tail()  # return last 5 elements (by default) in series

27    NaN
28    3.0
29    1.0
30    NaN
31    1.0
dtype: float64

In [164]:
ser.sample(5)  # return 5 random elements from series

20    2.0
31    1.0
16    1.0
14    3.0
11    3.0
dtype: float64

In [166]:
ser.info()  # see info on series

<class 'pandas.core.series.Series'>
RangeIndex: 32 entries, 0 to 31
Series name: None
Non-Null Count  Dtype  
--------------  -----  
27 non-null     float64
dtypes: float64(1)
memory usage: 384.0 bytes


In [167]:
ser.describe()  # statistical information on series

count    27.000000
mean      2.111111
std       1.012739
min       1.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       4.000000
dtype: float64

In [169]:
len(ser)

32

In [170]:
ser.shape

(32,)

apply

In [179]:
def is_even(num):
    """Tell whether ``num`` is even.

    The check is ``num % 2 == 0``, so a NaN input yields a falsy result
    (``NaN % 2`` is NaN, which never compares equal to zero).
    """
    remainder = num % 2
    return remainder == 0


ser.apply(is_even)

0     False
1      True
2     False
3     False
4      True
5     False
6     False
7      True
8     False
9     False
10     True
11    False
12     True
13     True
14    False
15    False
16    False
17     True
18    False
19     True
20     True
21    False
22     True
23     True
24    False
25    False
26     True
27    False
28    False
29    False
30    False
31    False
dtype: bool

In [181]:
def fill_nan_with_mean(num, mean=2.111):
    """Return ``mean`` when ``num`` is NaN, otherwise return ``num`` unchanged.

    Parameters
    ----------
    num : scalar
        Numeric value to inspect; it must be accepted by ``np.isnan``.
    mean : float, optional
        Replacement used for NaN inputs. Defaults to 2.111, the value that
        was previously hard-coded here (the series mean rounded to three
        decimals). To fill with the exact mean instead, pass it as a keyword
        argument, e.g. ``ser.apply(fill_nan_with_mean, mean=ser.mean())``.

    Returns
    -------
    scalar
        ``mean`` if ``num`` is NaN, else ``num``.
    """
    if np.isnan(num):
        return mean
    return num


ser.apply(fill_nan_with_mean)

0     1.000
1     2.000
2     1.000
3     2.111
4     2.000
5     3.000
6     1.000
7     2.000
8     3.000
9     1.000
10    2.000
11    3.000
12    4.000
13    2.000
14    3.000
15    2.111
16    1.000
17    2.000
18    3.000
19    4.000
20    2.000
21    2.111
22    4.000
23    2.000
24    1.000
25    1.000
26    2.000
27    2.111
28    3.000
29    1.000
30    2.111
31    1.000
dtype: float64

In [184]:
# Named-function version of the transformation, kept commented out for
# comparison with the equivalent lambda below:
# def replace_values(num):
#     if num > 2 or np.isnan(num):
#         return 3
#     return num


# ser.apply(replace_values)

# Same logic as a one-line lambda: values greater than 2 and NaNs both become 3,
# everything else is returned unchanged.
ser.apply(lambda num: 3 if num > 2 or np.isnan(num) else num)

0     1.0
1     2.0
2     1.0
3     3.0
4     2.0
5     3.0
6     1.0
7     2.0
8     3.0
9     1.0
10    2.0
11    3.0
12    3.0
13    2.0
14    3.0
15    3.0
16    1.0
17    2.0
18    3.0
19    3.0
20    2.0
21    3.0
22    3.0
23    2.0
24    1.0
25    1.0
26    2.0
27    3.0
28    3.0
29    1.0
30    3.0
31    1.0
dtype: float64