# NumPy

NumPy (or Numpy) is a linear algebra library for Python. It is so important for data science with Python because almost all of the libraries in the PyData ecosystem rely on NumPy as one of their main building blocks.

Numpy is also incredibly fast, as it has bindings to C libraries. For more info on why you would want to use Arrays instead of lists, check out this great StackOverflow post.

We will only cover the basics of NumPy. To get started, we need to install it (if it is not already installed) and then import it!


In [1]:
import numpy as np

## Create null vector of size 10

In [4]:
z = np.zeros(10)
print(z)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Create a null vector of size 10 in which the fifth value is 1

In [5]:
# Start from ten zeros, then set a single entry to 1.
z = np.zeros(10)
z[4] = 1  # indexing is zero-based, so index 4 is the fifth element
print(z)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


## Create a vector with values ranging from 10 to 49

hint: arange

In [6]:
# The stop value of arange is exclusive, so a stop of 50 yields 10, 11, ..., 49.
v = np.arange(10,50)
print(v)

[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49]


In [8]:
# Build 10..49 and walk it backwards with a negative-step slice.
v = np.arange(10, 50)[::-1]
print(v)

[49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26
 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]


## Create a 3x3x3 array with random values

hint: np.random.random

In [11]:
z = np.random.random((3,3,3))
z

array([[[0.46442275, 0.56738491, 0.25462178],
        [0.40183604, 0.94684032, 0.19906028],
        [0.23562845, 0.2337042 , 0.70829565]],

       [[0.48414435, 0.29547251, 0.83174077],
        [0.16451285, 0.88300345, 0.06491134],
        [0.54298189, 0.8472886 , 0.65517473]],

       [[0.7091614 , 0.61312627, 0.32935259],
        [0.30958838, 0.03026905, 0.97107118],
        [0.19398165, 0.20214235, 0.67688847]]])

In [12]:
z = np.random.random((3,3))
z

array([[0.45081681, 0.20154585, 0.61825217],
       [0.72199832, 0.00783718, 0.01434538],
       [0.33425902, 0.29307084, 0.80568485]])

In [13]:
# Smallest and largest entries of the array currently bound to z.
z_min = z.min()
z_max = z.max()
print(z_min, z_max)

0.007837178835764291 0.8056848452203309


## Create a 2d array with 1 on the border and 0 inside 
hint: array[1:-1, 1:-1]

In [17]:
z = np.ones((10,10))
z[1:-1, 1:-1] = 0
z

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

## Add a border (filled with 0's) around an existing array

hint : np.pad

In [21]:
# Wrap a 5x5 block of ones in a one-cell-wide frame; the fill value for
# mode='constant' is spelled out explicitly (0 is also the default).
z = np.pad(np.ones((5, 5)), pad_width=1, mode='constant', constant_values=0)
print(z)

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]


## Create a 4x4 matrix with values 1,2,3,4 on the diagonal

hint : np.diag

[1 0 0 0]

[0 2 0 0]

[0 0 3 0]

[0 0 0 4]

In [31]:
# np.diag turns the 1-D vector [1, 2, 3, 4] into a square matrix with those
# values on the main diagonal and zeros elsewhere, so no pre-allocated
# zeros array is needed.
z = np.diag(np.arange(1, 5))
print(z)

[[1 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 4]]


## Multiply a 5x3 matrix by a 3x2 matrix (real matrix product)

In [33]:
# (5, 3) times (3, 2) gives a (5, 2) result; with all-ones inputs every
# entry is the sum of three ones.
a, b = np.ones((5, 3)), np.ones((3, 2))
z = a @ b
print(z)

[[3. 3.]
 [3. 3.]
 [3. 3.]
 [3. 3.]
 [3. 3.]]


## Given a 1D array, negate all elements which are between 3 and 8, in place.

hint: >,<

In [38]:
z = np.arange(11)
# NOTE: this is plain assignment -- positions 3 through 8 are overwritten with
# the constant -1; their sign is not flipped.
z[3:9] = -1
print(z)

[ 0  1  2 -1 -1 -1 -1 -1 -1  9 10]


In [43]:
z = np.arange(11)
# Select by value rather than by position, so the negation is correct for any
# 1-D array and not only one whose values happen to equal their indices.
# The augmented assignment writes the negated values back into z itself.
z[(z >= 3) & (z <= 8)] *= -1
print(z)


[ 0  1  2 -3 -4 -5 -6 -7 -8  9 10]


In [44]:
Z = np.random.random(10)
print(Z)

[0.94745161 0.47347967 0.53009326 0.35442034 0.78382713 0.73082508
 0.12640053 0.48441974 0.85048946 0.42644424]


In [52]:
# Ten samples drawn uniformly between -5 and 5.
z = np.random.uniform(-5,5,10)
print(z)
# Round away from zero: take the ceiling of each magnitude, then copy the
# sign of the original value back onto it.
round_off = np.copysign(np.ceil(np.abs(z)), z)
print(round_off)

[ 1.93746427  2.98671002  3.37663759 -2.74897278  1.56454732  0.31811823
 -1.32420115 -3.48754613 -0.44371316  1.52172608]
[ 2.  3.  4. -3.  2.  1. -2. -4. -1.  2.]


In [56]:
# Ten samples drawn uniformly between -5 and 5.
Z = np.random.uniform(-5,5,10)
print(Z)
# Ceiling of the absolute values only: the sign is discarded here, so every
# result is non-negative.
np.ceil(np.abs(Z))

[-4.94081315 -0.88023705 -3.60863374 -1.28393499 -4.43961176 -1.34723228
 -2.95763032 -0.07701793  0.32475683 -3.38344535]


array([5., 1., 4., 2., 5., 2., 3., 1., 1., 4.])

# Pandas

In [61]:
import numpy as np
import pandas as pd

# Creating a Series

In [64]:
labels = ['x', 'y', 'z']
my_list = [10,20,30]
arr = np.array([10,20,30])
d = {'a':10, 'b':20, 'c':30}

In [65]:
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [66]:
pd.Series(data=my_list, index=labels)

x    10
y    20
z    30
dtype: int64

In [67]:
pd.Series(my_list,labels)

x    10
y    20
z    30
dtype: int64

In [69]:
pd.Series(arr,labels)

x    10
y    20
z    30
dtype: int32

In [70]:
d = {'a':10, 'b':20, 'c':30}


In [71]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [72]:
series1 = pd.Series([1,2,3,4], index=['USA', 'INDIA', 'GERMANY', 'JAPAN'])

In [73]:
series1

USA        1
INDIA      2
GERMANY    3
JAPAN      4
dtype: int64

In [74]:
series1['INDIA']

2

In [78]:
series2 = pd.Series([5,6,7,8], index=['USA', 'INDIA', 'GERMANY', 'ITALY'])

In [79]:
series2

USA        5
INDIA      6
GERMANY    7
ITALY      8
dtype: int64

In [80]:
series1 + series2

GERMANY    10.0
INDIA       8.0
ITALY       NaN
JAPAN       NaN
USA         6.0
dtype: float64

# Dataframes

In [81]:
from numpy.random import randn
np.random.seed(101)

In [83]:
df = pd.DataFrame(randn(5,4), index = 'A B C D E'.split(), columns='W X Y Z'.split())

In [84]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [85]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [88]:
df[['W','Z']]

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [89]:
df.W  # Not recommended: attribute access only works when the column name is a valid identifier that does not clash with a DataFrame attribute/method; prefer df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [90]:
type(df['W'])

pandas.core.series.Series

In [91]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [98]:
df['Add_WX'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,Add_WX
A,0.302665,1.693723,-1.706086,-1.159119,1.996388
B,-0.134841,0.390528,0.166905,0.184502,0.255687
C,0.807706,0.07296,0.638787,0.329646,0.880666
D,-0.497104,-0.75407,-0.943406,0.484752,-1.251174
E,-0.116773,1.901755,0.238127,1.996652,1.784981


In [103]:
# drop() returns a new frame without the column; rebinding the name avoids
# mutating in place the object that earlier cells displayed.
df = df.drop(columns='Add_WX')

In [104]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [105]:
# drop() returns a new frame without row 'E'; rebinding the name avoids
# mutating in place the object that earlier cells displayed.
df = df.drop(index='E')

In [106]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [107]:
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [110]:
df.iloc[0]

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [111]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [112]:
df.loc['B','Y']

0.16690463609281317

In [114]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [115]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [117]:
df = pd.DataFrame(randn(6,5), index = 'A B C D E F'.split(), columns='V W X Y Z'.split())

In [118]:
df

Unnamed: 0,V,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336,0.681209
B,1.035125,-0.03116,1.939932,-1.005187,-0.74179
C,0.187125,-0.732845,-1.38292,1.482495,0.961458
D,-2.141212,0.992573,1.192241,-1.04678,1.292765
E,-1.467514,-0.494095,-0.162535,0.485809,0.392489
F,0.221491,-0.855196,1.54199,0.666319,-0.538235


In [119]:
df>0

Unnamed: 0,V,W,X,Y,Z
A,True,True,False,True,True
B,True,False,True,False,False
C,True,False,False,True,True
D,False,True,True,False,True
E,False,False,False,True,True
F,True,False,True,True,False


In [120]:
df[df>0]

Unnamed: 0,V,W,X,Y,Z
A,0.38603,2.084019,,0.230336,0.681209
B,1.035125,,1.939932,,
C,0.187125,,,1.482495,0.961458
D,,0.992573,1.192241,,1.292765
E,,,,0.485809,0.392489
F,0.221491,,1.54199,0.666319,


In [121]:
df[df['W']>0]

Unnamed: 0,V,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336,0.681209
D,-2.141212,0.992573,1.192241,-1.04678,1.292765


In [123]:
df.reset_index()

Unnamed: 0,index,V,W,X,Y,Z
0,A,0.38603,2.084019,-0.376519,0.230336,0.681209
1,B,1.035125,-0.03116,1.939932,-1.005187,-0.74179
2,C,0.187125,-0.732845,-1.38292,1.482495,0.961458
3,D,-2.141212,0.992573,1.192241,-1.04678,1.292765
4,E,-1.467514,-0.494095,-0.162535,0.485809,0.392489
5,F,0.221491,-0.855196,1.54199,0.666319,-0.538235


In [124]:
states = 'MH KA KL GJ DL RJ'.split()
df['States'] = states
df

Unnamed: 0,V,W,X,Y,Z,States
A,0.38603,2.084019,-0.376519,0.230336,0.681209,MH
B,1.035125,-0.03116,1.939932,-1.005187,-0.74179,KA
C,0.187125,-0.732845,-1.38292,1.482495,0.961458,KL
D,-2.141212,0.992573,1.192241,-1.04678,1.292765,GJ
E,-1.467514,-0.494095,-0.162535,0.485809,0.392489,DL
F,0.221491,-0.855196,1.54199,0.666319,-0.538235,RJ


In [126]:
df.set_index("States")

Unnamed: 0_level_0,V,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MH,0.38603,2.084019,-0.376519,0.230336,0.681209
KA,1.035125,-0.03116,1.939932,-1.005187,-0.74179
KL,0.187125,-0.732845,-1.38292,1.482495,0.961458
GJ,-2.141212,0.992573,1.192241,-1.04678,1.292765
DL,-1.467514,-0.494095,-0.162535,0.485809,0.392489
RJ,0.221491,-0.855196,1.54199,0.666319,-0.538235


# Multi-Index and Index Hierarchy

In [127]:
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]

In [129]:
hier_index = list(zip(outside,inside))
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [132]:
multi_index = pd.MultiIndex.from_tuples(hier_index)
multi_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [133]:
df = pd.DataFrame(np.random.randn(6,2), index=multi_index, columns=['A','B'])

In [134]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.568581,1.407338
G1,2,0.641806,-0.9051
G1,3,-0.391157,1.028293
G2,1,-1.972605,-0.866885
G2,2,0.720788,-1.223082
G2,3,1.60678,-1.11571


In [135]:
df.loc['G1']

Unnamed: 0,A,B
1,-0.568581,1.407338
2,0.641806,-0.9051
3,-0.391157,1.028293


In [136]:
# Two-step lookup: df.loc['G1'] selects the outer-level group, then .loc[1]
# picks the row labelled 1 from that sub-frame.
df.loc['G1'].loc[1]

A   -0.568581
B    1.407338
Name: 1, dtype: float64

In [137]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.568581,1.407338
G1,2,0.641806,-0.9051
G1,3,-0.391157,1.028293
G2,1,-1.972605,-0.866885
G2,2,0.720788,-1.223082
G2,3,1.60678,-1.11571


In [138]:
df.index.names = ['Groups', 'Num']

In [139]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.568581,1.407338
G1,2,0.641806,-0.9051
G1,3,-0.391157,1.028293
G2,1,-1.972605,-0.866885
G2,2,0.720788,-1.223082
G2,3,1.60678,-1.11571


In [140]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.568581,1.407338
2,0.641806,-0.9051
3,-0.391157,1.028293


In [141]:
# A key spanning several index levels must be a tuple: list keys to xs() were
# deprecated and are rejected by pandas 2.x.
df.xs(('G1', 1))

A   -0.568581
B    1.407338
Name: (G1, 1), dtype: float64

In [142]:
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.568581,1.407338
G2,-1.972605,-0.866885


In [143]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.568581,1.407338
G1,2,0.641806,-0.9051
G1,3,-0.391157,1.028293
G2,1,-1.972605,-0.866885
G2,2,0.720788,-1.223082
G2,3,1.60678,-1.11571


In [144]:
df.xs(2,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.641806,-0.9051
G2,0.720788,-1.223082
