# OUTLINE
- NumPy
- Files
- Pandas

# NumPy
- Provides `ndarray`
    - fast and space-efficient multidimensional array
    - supports vectorized arithmetic and matrix operations
- Provides linear algebra and random number generation
- Provides standard mathematical operations on entire arrays of data
    - as opposed to writing loops

In [1]:
import numpy as np

In [2]:
arr = np.array([1, 2, 3])

In [3]:
type(arr)

numpy.ndarray

In [4]:
arr.ndim # number of dimensions or axes

1

In [5]:
arr.shape  # dimensions of the array

(3,)

In [6]:
arr.size

3

In [7]:
arr = np.array([[1.1, 2.0, 3.14], [4.22, 5.54, 6.3]])

In [8]:
type(arr), arr.ndim, arr.shape, arr.size

(numpy.ndarray, 2, (2, 3), 6)

## arrays with initial placeholder
- expanding lists is expensive
- provides ways to create arrays with initial placeholder content
    - useful when we don't have data but size of the data is known

In [9]:
# creating arrays with zeros
arr = np.zeros((2,3), dtype=int)
arr

array([[0, 0, 0],
       [0, 0, 0]])

In [10]:
# creating arrays with ones
arr = np.ones((2, 3), dtype=int)
arr

array([[1, 1, 1],
       [1, 1, 1]])

In [11]:
# creating arrays with arange: values run from start up to, but not including, stop
x = np.array([1, 3, 5])
y = np.arange(start=2, stop=8, step=2, dtype=int)
print(x, y)
x + y  # element-wise sum of the two equal-length arrays

[1 3 5] [2 4 6]


array([ 3,  7, 11])

In [12]:
# an array operation
x / 2

array([0.5, 1.5, 2.5])

In [13]:
# multidimensional array operation
x = np.array([[1, 2], [3, 4]])
y = np.array([[2, 0], [5, 9]])
x + y

array([[ 3,  2],
       [ 8, 13]])

In [14]:
print(x)
print(y)
x * y

[[1 2]
 [3 4]]
[[2 0]
 [5 9]]


array([[ 2,  0],
       [15, 36]])

In [15]:
print(x)
print(y)
x @ y

[[1 2]
 [3 4]]
[[2 0]
 [5 9]]


array([[12, 18],
       [26, 36]])

In [16]:
x.dot(y)

array([[12, 18],
       [26, 36]])

In [17]:
# some aggregation: draw 10 random integers from 1 (inclusive) to 100 (exclusive)
# NOTE: no seed is set in the visible cells, so the values will differ between runs
arr = np.random.randint(1, 100, 10)
arr

array([89, 43, 53, 16, 49, 98, 59, 42, 18, 86])

In [18]:
arr.sum(), arr.min(), arr.max()

(553, 16, 98)

In [19]:
arr.argmax(), arr.argmin(), arr.mean()

(5, 3, 55.3)

In [20]:
# Working with Files
# Read the whole CSV into a list of raw text lines; each line keeps its trailing '\n'.
# The encoding is passed explicitly so the result does not depend on the platform default.
with open('iris.csv', encoding='utf-8') as f:
    lines = f.readlines()
lines

['sepal_length,sepal_width,petal_length,petal_width,species\n',
 '5.1,3.5,1.4,0.2,setosa\n',
 '4.9,3.0,1.4,0.2,setosa\n',
 '4.7,3.2,1.3,0.2,setosa\n',
 '4.6,3.1,1.5,0.2,setosa\n',
 '5.0,3.6,1.4,0.2,setosa\n',
 '5.4,3.9,1.7,0.4,setosa\n',
 '4.6,3.4,1.4,0.3,setosa\n',
 '5.0,3.4,1.5,0.2,setosa\n',
 '4.4,2.9,1.4,0.2,setosa\n',
 '4.9,3.1,1.5,0.1,setosa\n',
 '5.4,3.7,1.5,0.2,setosa\n',
 '4.8,3.4,1.6,0.2,setosa\n',
 '4.8,3.0,1.4,0.1,setosa\n',
 '4.3,3.0,1.1,0.1,setosa\n',
 '5.8,4.0,1.2,0.2,setosa\n',
 '5.7,4.4,1.5,0.4,setosa\n',
 '5.4,3.9,1.3,0.4,setosa\n',
 '5.1,3.5,1.4,0.3,setosa\n',
 '5.7,3.8,1.7,0.3,setosa\n',
 '5.1,3.8,1.5,0.3,setosa\n',
 '5.4,3.4,1.7,0.2,setosa\n',
 '5.1,3.7,1.5,0.4,setosa\n',
 '4.6,3.6,1.0,0.2,setosa\n',
 '5.1,3.3,1.7,0.5,setosa\n',
 '4.8,3.4,1.9,0.2,setosa\n',
 '5.0,3.0,1.6,0.2,setosa\n',
 '5.0,3.4,1.6,0.4,setosa\n',
 '5.2,3.5,1.5,0.2,setosa\n',
 '5.2,3.4,1.4,0.2,setosa\n',
 '4.7,3.2,1.6,0.2,setosa\n',
 '4.8,3.1,1.6,0.2,setosa\n',
 '5.4,3.4,1.5,0.4,setosa\n',
 '5.2,4.

In [21]:
# strip surrounding whitespace (including the trailing newline) from every line,
# then split it into its comma-separated fields
data = [line.strip().split(',') for line in lines]
data

[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'],
 ['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'setosa'],
 ['4.6', '3.4', '1.4', '0.3', 'setosa'],
 ['5.0', '3.4', '1.5', '0.2', 'setosa'],
 ['4.4', '2.9', '1.4', '0.2', 'setosa'],
 ['4.9', '3.1', '1.5', '0.1', 'setosa'],
 ['5.4', '3.7', '1.5', '0.2', 'setosa'],
 ['4.8', '3.4', '1.6', '0.2', 'setosa'],
 ['4.8', '3.0', '1.4', '0.1', 'setosa'],
 ['4.3', '3.0', '1.1', '0.1', 'setosa'],
 ['5.8', '4.0', '1.2', '0.2', 'setosa'],
 ['5.7', '4.4', '1.5', '0.4', 'setosa'],
 ['5.4', '3.9', '1.3', '0.4', 'setosa'],
 ['5.1', '3.5', '1.4', '0.3', 'setosa'],
 ['5.7', '3.8', '1.7', '0.3', 'setosa'],
 ['5.1', '3.8', '1.5', '0.3', 'setosa'],
 ['5.4', '3.4', '1.7', '0.2', 'setosa'],
 ['5.1', '3.7', '1.5', '0.4', 'setosa'],
 ['4.6', '3.6', '1.0',

In [22]:
data_arr = np.array(data)
data_arr

array([['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'species'],
       ['5.1', '3.5', '1.4', '0.2', 'setosa'],
       ['4.9', '3.0', '1.4', '0.2', 'setosa'],
       ['4.7', '3.2', '1.3', '0.2', 'setosa'],
       ['4.6', '3.1', '1.5', '0.2', 'setosa'],
       ['5.0', '3.6', '1.4', '0.2', 'setosa'],
       ['5.4', '3.9', '1.7', '0.4', 'setosa'],
       ['4.6', '3.4', '1.4', '0.3', 'setosa'],
       ['5.0', '3.4', '1.5', '0.2', 'setosa'],
       ['4.4', '2.9', '1.4', '0.2', 'setosa'],
       ['4.9', '3.1', '1.5', '0.1', 'setosa'],
       ['5.4', '3.7', '1.5', '0.2', 'setosa'],
       ['4.8', '3.4', '1.6', '0.2', 'setosa'],
       ['4.8', '3.0', '1.4', '0.1', 'setosa'],
       ['4.3', '3.0', '1.1', '0.1', 'setosa'],
       ['5.8', '4.0', '1.2', '0.2', 'setosa'],
       ['5.7', '4.4', '1.5', '0.4', 'setosa'],
       ['5.4', '3.9', '1.3', '0.4', 'setosa'],
       ['5.1', '3.5', '1.4', '0.3', 'setosa'],
       ['5.7', '3.8', '1.7', '0.3', 'setosa'],
       ['5.1', '3

In [23]:
data_arr[:, 0]

array(['sepal_length', '5.1', '4.9', '4.7', '4.6', '5.0', '5.4', '4.6',
       '5.0', '4.4', '4.9', '5.4', '4.8', '4.8', '4.3', '5.8', '5.7',
       '5.4', '5.1', '5.7', '5.1', '5.4', '5.1', '4.6', '5.1', '4.8',
       '5.0', '5.0', '5.2', '5.2', '4.7', '4.8', '5.4', '5.2', '5.5',
       '4.9', '5.0', '5.5', '4.9', '4.4', '5.1', '5.0', '4.5', '4.4',
       '5.0', '5.1', '4.8', '5.1', '4.6', '5.3', '5.0', '7.0', '6.4',
       '6.9', '5.5', '6.5', '5.7', '6.3', '4.9', '6.6', '5.2', '5.0',
       '5.9', '6.0', '6.1', '5.6', '6.7', '5.6', '5.8', '6.2', '5.6',
       '5.9', '6.1', '6.3', '6.1', '6.4', '6.6', '6.8', '6.7', '6.0',
       '5.7', '5.5', '5.5', '5.8', '6.0', '5.4', '6.0', '6.7', '6.3',
       '5.6', '5.5', '5.5', '6.1', '5.8', '5.0', '5.6', '5.7', '5.7',
       '6.2', '5.1', '5.7', '6.3', '5.8', '7.1', '6.3', '6.5', '7.6',
       '4.9', '7.3', '6.7', '7.2', '6.5', '6.4', '6.8', '5.7', '5.8',
       '6.4', '6.5', '7.7', '7.7', '6.0', '6.9', '5.6', '7.7', '6.3',
       '6.7', '7.2

In [24]:
data_arr[data_arr[:, 4] == 'virginica']

array([['6.3', '3.3', '6.0', '2.5', 'virginica'],
       ['5.8', '2.7', '5.1', '1.9', 'virginica'],
       ['7.1', '3.0', '5.9', '2.1', 'virginica'],
       ['6.3', '2.9', '5.6', '1.8', 'virginica'],
       ['6.5', '3.0', '5.8', '2.2', 'virginica'],
       ['7.6', '3.0', '6.6', '2.1', 'virginica'],
       ['4.9', '2.5', '4.5', '1.7', 'virginica'],
       ['7.3', '2.9', '6.3', '1.8', 'virginica'],
       ['6.7', '2.5', '5.8', '1.8', 'virginica'],
       ['7.2', '3.6', '6.1', '2.5', 'virginica'],
       ['6.5', '3.2', '5.1', '2.0', 'virginica'],
       ['6.4', '2.7', '5.3', '1.9', 'virginica'],
       ['6.8', '3.0', '5.5', '2.1', 'virginica'],
       ['5.7', '2.5', '5.0', '2.0', 'virginica'],
       ['5.8', '2.8', '5.1', '2.4', 'virginica'],
       ['6.4', '3.2', '5.3', '2.3', 'virginica'],
       ['6.5', '3.0', '5.5', '1.8', 'virginica'],
       ['7.7', '3.8', '6.7', '2.2', 'virginica'],
       ['7.7', '2.6', '6.9', '2.3', 'virginica'],
       ['6.0', '2.2', '5.0', '1.5', 'virginica'],


In [25]:
# first column (sepal_length) of the virginica rows, converted from strings to floats
data_arr[data_arr[:, 4] == 'virginica'][:,0].astype("float")

array([6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8,
       5.7, 5.8, 6.4, 6.5, 7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2,
       6.2, 6.1, 6.4, 7.2, 7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. ,
       6.9, 6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

In [26]:
# axis=0 averages down the rows: one mean per measurement column of the virginica rows
data_arr[data_arr[:, 4] == 'virginica'][:, 0:4].astype("float").mean(axis=0)

array([6.588, 2.974, 5.552, 2.026])

In [27]:
# axis=1 averages across the columns: one mean per virginica row (over its four measurements)
data_arr[data_arr[:, 4] == 'virginica'][:, 0:4].astype("float").mean(axis=1)

array([4.525, 3.875, 4.525, 4.15 , 4.375, 4.825, 3.4  , 4.575, 4.2  ,
       4.85 , 4.2  , 4.075, 4.35 , 3.8  , 4.025, 4.3  , 4.2  , 5.1  ,
       4.875, 3.675, 4.525, 3.825, 4.8  , 3.925, 4.45 , 4.55 , 3.9  ,
       3.95 , 4.225, 4.4  , 4.55 , 5.025, 4.25 , 3.925, 3.925, 4.775,
       4.425, 4.2  , 3.9  , 4.375, 4.45 , 4.35 , 3.875, 4.55 , 4.55 ,
       4.3  , 3.925, 4.175, 4.325, 3.95 ])

All this can get very unreadable and confusing

# Enter Pandas

- open-source and high-performance
- makes it easier to work with data structures and analysis tools
- convenient for use with NumPy-centric applications
- primarily uses Series and DataFrame

In [28]:
import pandas as pd

## Series

In [48]:
data = pd.Series([1, 2, 3, 4, 5])
data

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [49]:
data.index

RangeIndex(start=0, stop=5, step=1)

In [50]:
data.values

array([1, 2, 3, 4, 5])

In [51]:
data[0] = 10
data

0    10
1     2
2     3
3     4
4     5
dtype: int64

In [52]:
data = pd.Series([1, 2, 3, 4], index=['d', 'c', 'b', 'a'])
data

d    1
c    2
b    3
a    4
dtype: int64

In [54]:
# label-based lookup; its value is not displayed because only the last expression is shown
data['b']
# position-based lookup: use .iloc explicitly, since plain data[0] on a string-labelled
# Series relies on a deprecated positional fallback
data.iloc[0]

1

## DataFrame
- tabular, spreadsheet-like ordered data
- each column can be a different data type
- a column is basically a `Series`

In [35]:
# example data frame: the integers 0..11 arranged as 4 labelled rows by 3 named columns
dummy1 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['firstq', 'secondq', 'thirdq', 'forthq'],
    columns=['a', 'b', 'c'],
)
dummy1

Unnamed: 0,a,b,c
firstq,0,1,2
secondq,3,4,5
thirdq,6,7,8
forthq,9,10,11


In [36]:
# second example frame: the integers 0..8 as 3 labelled rows by 3 named columns
dummy2 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['firstq', 'secondq', 'thirdq'],
    columns=['b', 'c', 'd'],
)
dummy2

Unnamed: 0,b,c,d
firstq,0,1,2
secondq,3,4,5
thirdq,6,7,8


In [37]:
# addition aligns on both row and column labels; any cell missing from either frame becomes NaN
dummy = dummy1 + dummy2
dummy

Unnamed: 0,a,b,c,d
firstq,,1.0,3.0,
forthq,,,,
secondq,,7.0,9.0,
thirdq,,13.0,15.0,


In [38]:
data = pd.read_csv('iris.csv')
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [39]:
data.groupby('species').max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [40]:
data.groupby('species').min()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,4.3,2.3,1.0,0.1
versicolor,4.9,2.0,3.0,1.0
virginica,4.9,2.2,4.5,1.4


In [41]:
data['sepal_length'].max()

7.9

In [42]:
data.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [56]:
print(dummy)
dummy.isna().sum()

          a     b     c   d
firstq  NaN   1.0   3.0 NaN
forthq  NaN   NaN   NaN NaN
secondq NaN   7.0   9.0 NaN
thirdq  NaN  13.0  15.0 NaN


a    4
b    1
c    1
d    4
dtype: int64

In [44]:
dummy.fillna(0)

Unnamed: 0,a,b,c,d
firstq,0.0,1.0,3.0,0.0
forthq,0.0,0.0,0.0,0.0
secondq,0.0,7.0,9.0,0.0
thirdq,0.0,13.0,15.0,0.0


In [45]:
dummy.fillna({'a' : 0, 'b' : 1, 'c' : 2, 'd' : 4})

Unnamed: 0,a,b,c,d
firstq,0.0,1.0,3.0,4.0
forthq,0.0,1.0,2.0,4.0
secondq,0.0,7.0,9.0,4.0
thirdq,0.0,13.0,15.0,4.0


In [46]:
# learn to use the official reference https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

In [47]:
dummy['b'].dropna(axis=0)

firstq      1.0
secondq     7.0
thirdq     13.0
Name: b, dtype: float64