## Lesson 9 - Pandas Basics

Outline:

* Series
* DataFrame
* index, columns
* dtypes, info, describe
* read_csv
* head, tail
* loc, iloc, ix
* to_datetime

In [1]:
import pandas as pd
import numpy as np

### Series

In [2]:
# a plain Python list holding five team names (strings)
my_list = [
    'cubs',
    'pirates',
    'giants',
    'yankees',
    'donkeys',
]
my_list

['cubs', 'pirates', 'giants', 'yankees', 'donkeys']

In [3]:
# build a pandas Series from the list; pandas assigns a default 0..n-1 integer index
series_from_list = pd.Series(data=my_list)
series_from_list

0       cubs
1    pirates
2     giants
3    yankees
4    donkeys
dtype: object

In [4]:
# indexing a Series is similar to lists and arrays:
# with the default integer index, [3] returns the fourth element
series_from_list[3]

'yankees'

In [5]:
# a numpy array of 5 random floats drawn uniformly from [0, 1)
# (no seed is set in the cells shown, so the values change on every run)
my_array = np.random.rand(5)
my_array

array([ 0.33342042,  0.70196207,  0.62444784,  0.21750377,  0.6632444 ])

In [6]:
# wrap the numpy array in a Series; the float64 dtype carries over from the array
series_from_array = pd.Series(data=my_array)
series_from_array

0    0.333420
1    0.701962
2    0.624448
3    0.217504
4    0.663244
dtype: float64

In [7]:
# indexing also supports slices: [3:] keeps everything from position 3 to the end
series_from_array[3:]

3    0.217504
4    0.663244
dtype: float64

### DataFrame

#### 2D array to DataFrame

In [8]:
# create a 2D numpy array: 5 rows x 5 columns of samples from the standard normal distribution
# (unseeded in the cells shown, so the values differ between runs)
my_2d_array = np.random.randn(5,5)
my_2d_array

array([[ 1.12670633,  0.17838792,  1.70594108, -0.16869501, -1.78948246],
       [-0.15480417,  1.0505224 , -0.15826701,  0.17889264, -1.65297439],
       [ 0.64753895, -1.62550566,  0.56275979,  0.03727301,  1.23548106],
       [ 1.27374994, -0.82282161, -0.18523316, -1.1681018 , -2.49805059],
       [-0.38809268, -1.30739097,  0.17590965,  0.43598894, -1.38318692]])

In [9]:
# make a DataFrame from the 2D numpy array
# (without explicit labels, rows and columns both get default integer labels 0..4)
pd.DataFrame(my_2d_array)

Unnamed: 0,0,1,2,3,4
0,1.126706,0.178388,1.705941,-0.168695,-1.789482
1,-0.154804,1.050522,-0.158267,0.178893,-1.652974
2,0.647539,-1.625506,0.56276,0.037273,1.235481
3,1.27375,-0.822822,-0.185233,-1.168102,-2.498051
4,-0.388093,-1.307391,0.17591,0.435989,-1.383187


In [10]:
# row and column labels can be supplied up front through the index= and columns= arguments
df_from_2d_array = pd.DataFrame(
    data=my_2d_array,
    index=['row1', 'row2', 'row3', 'row4', 'row5'],
    columns=['col1', 'col2', 'col3', 'col4', 'col5'],
)
df_from_2d_array

Unnamed: 0,col1,col2,col3,col4,col5
row1,1.126706,0.178388,1.705941,-0.168695,-1.789482
row2,-0.154804,1.050522,-0.158267,0.178893,-1.652974
row3,0.647539,-1.625506,0.56276,0.037273,1.235481
row4,1.27375,-0.822822,-0.185233,-1.168102,-2.498051
row5,-0.388093,-1.307391,0.17591,0.435989,-1.383187


#### List or Series to DataFrame

In [11]:
# method 1: a list of Series is oriented as rows -- each Series becomes one row of the DataFrame
x = pd.DataFrame(data=[series_from_list, series_from_array])
x

Unnamed: 0,0,1,2,3,4
0,cubs,pirates,giants,yankees,donkeys
1,0.33342,0.701962,0.624448,0.217504,0.663244


In [12]:
# we can transpose a DataFrame (swap its rows and columns) using T or transpose
x.T

Unnamed: 0,0,1
0,cubs,0.33342
1,pirates,0.701962
2,giants,0.624448
3,yankees,0.217504
4,donkeys,0.663244


In [13]:
# transpose() is the method form of .T and gives the same result
x.transpose()

Unnamed: 0,0,1
0,cubs,0.33342
1,pirates,0.701962
2,giants,0.624448
3,yankees,0.217504
4,donkeys,0.663244


In [14]:
# method 2: pass list/Series as value of dictionary; the dict keys become the column names
# NOTE(review): dtype=str is applied to every column, so the floats in 'b' appear to be
# stored as strings here rather than float64 -- drop dtype=str to keep 'b' numeric
y = pd.DataFrame({'a': series_from_list, 'b': series_from_array}, dtype=str)
y

Unnamed: 0,a,b
0,cubs,0.333420417979
1,pirates,0.701962065254
2,giants,0.624447841768
3,yankees,0.217503773927
4,donkeys,0.663244400221


In [15]:
# method 3: pd.concat along the column axis places each Series side by side as a column
df = pd.concat(objs=[series_from_list, series_from_array], axis='columns')
df

Unnamed: 0,0,1
0,cubs,0.33342
1,pirates,0.701962
2,giants,0.624448
3,yankees,0.217504
4,donkeys,0.663244


### index, columns

In [16]:
# set the index and column names on an existing DataFrame by assigning to .index / .columns
# (each list supplies one label per row / per column)
df.index = ['a', 'b', 'c', 'd', 'e']
df.columns = ['team', 'random']
df

Unnamed: 0,team,random
a,cubs,0.33342
b,pirates,0.701962
c,giants,0.624448
d,yankees,0.217504
e,donkeys,0.663244


In [17]:
# add a new column to the DataFrame by assigning a list (one value per row) to a new column name
df['integers'] = [2, 3, 5, 8, 13]
df

Unnamed: 0,team,random,integers
a,cubs,0.33342,2
b,pirates,0.701962,3
c,giants,0.624448,5
d,yankees,0.217504,8
e,donkeys,0.663244,13


### dtypes, info, describe

In [18]:
# dtypes gives the datatype of each column (the string column is reported as object)
df.dtypes

team         object
random      float64
integers      int64
dtype: object

In [19]:
# info() prints a summary: the index, per-column non-null counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 3 columns):
team        5 non-null object
random      5 non-null float64
integers    5 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 160.0+ bytes


In [20]:
# describe() gives summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,random,integers
count,5.0,5.0
mean,0.508116,6.2
std,0.21803,4.438468
min,0.217504,2.0
25%,0.33342,3.0
50%,0.624448,5.0
75%,0.663244,8.0
max,0.701962,13.0


### read_csv

In [21]:
# by default column headers are the first row and row indexes are integers starting from zero
# (the file name is a relative path, so the CSV must be in the notebook's working directory)
df_sio = pd.read_csv('scripps_pier_20151110.csv')

In [22]:
# by default, read_csv will infer the dtype of each column
# (here the measurements come in as float64 and Date as object)
df_sio.dtypes

Date            object
chl (ug/L)     float64
pres (dbar)    float64
sal (PSU)      float64
temp (C)       float64
dtype: object

In [23]:
# summary of the loaded file: number of rows, non-null count and dtype per column, memory usage
df_sio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 5 columns):
Date           66 non-null object
chl (ug/L)     66 non-null float64
pres (dbar)    66 non-null float64
sal (PSU)      66 non-null float64
temp (C)       66 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.7+ KB


In [24]:
# summary statistics for the numeric columns (the object-typed Date column is left out)
df_sio.describe()

Unnamed: 0,chl (ug/L),pres (dbar),sal (PSU),temp (C)
count,66.0,66.0,66.0,66.0
mean,22.349576,3.041818,33.199318,20.06697
std,0.038988,0.254295,0.004959,0.0685
min,22.305,2.714,33.184,19.94
25%,22.319,2.81325,33.197,20.04
50%,22.3335,2.997,33.199,20.07
75%,22.385,3.2155,33.203,20.105
max,22.426,3.712,33.206,20.19


In [25]:
# the dtype can also be given explicitly; index_col=None and header=0 just spell out the defaults
# reading everything as object and converting to int, float, etc. later is sometimes the safer route
df_sio = pd.read_csv(
    'scripps_pier_20151110.csv',
    dtype=object,
    index_col=None,
    header=0,
)

In [26]:
# with dtype=object every column is read in as object
df_sio.dtypes

Date           object
chl (ug/L)     object
pres (dbar)    object
sal (PSU)      object
temp (C)       object
dtype: object

#### Changing dtype of columns

In [27]:
# method 1: list comprehension (one column)
# convert each value with float() and assign the resulting list back to the column
df_sio['chl (ug/L)'] = [float(x) for x in df_sio['chl (ug/L)']]

In [28]:
# method 2: pd.to_numeric (one column)
# converts the whole column in one call and assigns the result back
df_sio['pres (dbar)'] = pd.to_numeric(df_sio['pres (dbar)'])

In [29]:
# method 3: apply(pd.to_numeric) (multiple columns)
# apply runs pd.to_numeric on each selected column; the converted columns are assigned back together
df_sio[['sal (PSU)','temp (C)']] = df_sio[['sal (PSU)','temp (C)']].apply(pd.to_numeric)

In [30]:
# confirm the conversions: the four measurement columns are float64 again, Date is still object
df_sio.dtypes

Date            object
chl (ug/L)     float64
pres (dbar)    float64
sal (PSU)      float64
temp (C)       float64
dtype: object

### head, tail

In [31]:
# head() shows the first rows of the DataFrame; pass a number to change how many rows are shown
df_sio.head(7)

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95
5,11/10/15 1:11,22.315,3.476,33.198,19.95
6,11/10/15 1:05,22.31,3.448,33.199,19.96


In [32]:
# tail works the same way, but shows rows from the end of the DataFrame
df_sio.tail(3)

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
63,11/9/15 19:22,22.418,3.316,33.202,19.96
64,11/9/15 19:16,22.41,3.209,33.2,19.96
65,11/9/15 19:10,22.426,3.328,33.203,19.95


### loc, iloc, ix

Pandas's three indexing methods defined:

* loc works on labels in the index.
* iloc works on the positions in the index (so it only takes integers).
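* ix usually tries to behave like loc but falls back to behaving like iloc if the label is not in the index. Note: ix was deprecated in pandas 0.20 and removed in pandas 1.0, so prefer loc or iloc in new code.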

In [33]:
# redisplay the small team/random/integers DataFrame used in the indexing examples below
df

Unnamed: 0,team,random,integers
a,cubs,0.33342,2
b,pirates,0.701962,3
c,giants,0.624448,5
d,yankees,0.217504,8
e,donkeys,0.663244,13


#### brackets only -- column by header

In [34]:
# to get a column (Series), use the column header in plain brackets (don't need .loc, .iloc, or .ix)
df['team']

a       cubs
b    pirates
c     giants
d    yankees
e    donkeys
Name: team, dtype: object

In [35]:
# for multiple columns, put a list inside the brackets (so two sets of brackets);
# the result is a DataFrame rather than a Series
df[['team', 'random']]

Unnamed: 0,team,random
a,cubs,0.33342
b,pirates,0.701962
c,giants,0.624448
d,yankees,0.217504
e,donkeys,0.663244


#### loc -- row by index

In [36]:
# to get a row by name, use .loc with the row index label; a single label returns the row as a Series
df.loc['a']

team           cubs
random      0.33342
integers          2
Name: a, dtype: object

In [37]:
# for multiple rows, put a list inside the brackets (so two sets of brackets);
# a list of labels returns a DataFrame
df.loc[['a', 'd']]

Unnamed: 0,team,random,integers
a,cubs,0.33342,2
d,yankees,0.217504,8


#### iloc -- row (or column) by position

In [38]:
# to get a row by position, use .iloc with the row number (positions count from 0)
df.iloc[0]

team           cubs
random      0.33342
integers          2
Name: a, dtype: object

In [39]:
# for multiple rows, put a list of positions inside the brackets (so two sets of brackets)
df.iloc[[0, 3]]

Unnamed: 0,team,random,integers
a,cubs,0.33342,2
d,yankees,0.217504,8


In [40]:
# or pass a slice: 2: selects from position 2 through the last row
df.iloc[2:]

Unnamed: 0,team,random,integers
c,giants,0.624448,5
d,yankees,0.217504,8
e,donkeys,0.663244,13


In [41]:
# iloc also works with columns: the first slot selects rows (: means all rows),
# the second slot selects column positions
df.iloc[:,[0, 2]]

Unnamed: 0,team,integers
a,cubs,2
b,pirates,3
c,giants,5
d,yankees,8
e,donkeys,13


#### ix -- row (or column) by index or position

In [42]:
# .ix (which accepted both index labels and numbers) was deprecated in pandas 0.20 and
# removed in 1.0; for a row label use .loc, which returns the same row
df.loc['d']

team         yankees
random      0.217504
integers           8
Name: d, dtype: object

In [43]:
# positional equivalent of the removed df.ix[3]: the row at integer position 3
df.iloc[3]

team         yankees
random      0.217504
integers           8
Name: d, dtype: object

In [44]:
# columns work the same way: use .iloc for a column position
# (replaces the removed df.ix[:, 1])
df.iloc[:, 1]

a    0.333420
b    0.701962
c    0.624448
d    0.217504
e    0.663244
Name: random, dtype: float64

In [45]:
# and use .loc for a column label (replaces the removed df.ix[:, 'random'])
df.loc[:, 'random']

a    0.333420
b    0.701962
c    0.624448
d    0.217504
e    0.663244
Name: random, dtype: float64

### to_datetime

We will cover time series in greater detail in a future lesson.

In [46]:
# head() with no argument shows the first 5 rows; the Date column still holds strings here
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,11/10/15 1:42,22.307,3.712,33.199,19.95
1,11/10/15 1:35,22.311,3.588,33.201,19.94
2,11/10/15 1:29,22.305,3.541,33.2,19.95
3,11/10/15 1:23,22.323,3.463,33.2,19.95
4,11/10/15 1:17,22.316,3.471,33.199,19.95


In [47]:
# parse the 'Date' strings (e.g. '11/10/15 1:42') into datetime64 values;
# the explicit month/day/2-digit-year format removes any month-vs-day ambiguity
# and avoids relying on per-value format guessing
time = pd.to_datetime(df_sio['Date'], format='%m/%d/%y %H:%M')
time.head()

0   2015-11-10 01:42:00
1   2015-11-10 01:35:00
2   2015-11-10 01:29:00
3   2015-11-10 01:23:00
4   2015-11-10 01:17:00
Name: Date, dtype: datetime64[ns]

In [48]:
# overwrite the string 'Date' column with the parsed datetime Series
df_sio['Date'] = time

In [49]:
# the Date column now displays full timestamps instead of the original strings
df_sio.head()

Unnamed: 0,Date,chl (ug/L),pres (dbar),sal (PSU),temp (C)
0,2015-11-10 01:42:00,22.307,3.712,33.199,19.95
1,2015-11-10 01:35:00,22.311,3.588,33.201,19.94
2,2015-11-10 01:29:00,22.305,3.541,33.2,19.95
3,2015-11-10 01:23:00,22.323,3.463,33.2,19.95
4,2015-11-10 01:17:00,22.316,3.471,33.199,19.95
