# Non-Mini Intro to Pandas 2
by Dr Liang Jin

Part of Mini Python Sessions: [github.com/drliangjin/minipy](https://github.com/drliangjin/minipy)

Official Pandas Doc: [pandas.pydata.org](https://pandas.pydata.org/)

In [3]:
import numpy as np
import pandas as pd

### Pandas -- Continued...
1. Data Transformation
2. Data Grouping & Aggregation
3. Time Series

## 1. Data Transformation

### Element-wise transformation using map()

In [4]:
# map() method is a Series method
sr = pd.Series(np.random.randn(2))
sr

0    0.776688
1    0.676112
dtype: float64

In [5]:
# apply a lambda function to each data point
func = lambda x: round(x, 2)
sr.map(func)

0    0.78
1    0.68
dtype: float64

### Element-wise Transformation using applymap()

In [6]:
# this method applies a function that accepts and returns a scalar to every element of a DataFrame
# A DataFrame with 2x2 floats
df = pd.DataFrame(np.random.randn(3, 3), columns=['col_1', 'col_2', 'col_3'], index=['row_1', 'row_2', 'row_3'])
df

Unnamed: 0,col_1,col_2,col_3
row_1,-0.064345,1.042345,-0.904842
row_2,1.111289,-0.658066,1.124888
row_3,-0.074977,-1.248327,0.881831


In [10]:
# applemap is a DataFrame method
# round to 2 decimals for each data point in DataFrame
func = lambda x: round(x, 2)
df.applymap(func)

Unnamed: 0,col_1,col_2,col_3
row_1,-0.06,1.04,-0.9
row_2,1.11,-0.66,1.12
row_3,-0.07,-1.25,0.88


### array-wise Transformation using apply()

In [11]:
# axis=0 ==> computation on from top to bottom
df.apply(lambda x: x.max() - x.min(), axis=0)

col_1    1.186266
col_2    2.290672
col_3    2.029730
dtype: float64

In [12]:
# axis=1 ==> computation on data from left to right
df.apply(lambda x: x.max() - x.min(), axis=1)

row_1    1.947186
row_2    1.782954
row_3    2.130158
dtype: float64

## 2. Data Grouping

<img src="img/split-apply-combine.svg">

In [14]:
# create the above dataset
data = {'key': ['A', 'B', 'C', 'A', 'B', 'C'], 
        'data': [1, 2, 3, 4, 5, 6]}
df = pd.DataFrame(data, columns=['key', 'data'])
df

Unnamed: 0,key,data
0,A,1
1,B,2
2,C,3
3,A,4
4,B,5
5,C,6


In [15]:
# Pandas's GroupBy object.
# It has not actually computed anything yet but form the intermediate datasets
grouped = df.groupby(by='key', as_index=False)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12169d390>

In [16]:
# to see what have been saved in the GroupBy object
# we can use iteration to print out data in each group
for name, group in grouped:
    print("Sub-group: {}".format(name))
    print(group)
    print("\n")

Sub-group: A
  key  data
0   A     1
3   A     4


Sub-group: B
  key  data
1   B     2
4   B     5


Sub-group: C
  key  data
2   C     3
5   C     6




In [25]:
# Now let's apply some functions and/or methods
# Note, the function sum() has been applied to each group
# results are then combined together as a DataFrame object
grouped.apply(sum)

Unnamed: 0,key,data
0,AA,5
1,BB,7
2,CC,9


In [27]:
# More importantly, we can use aggregate() methods
# to apply multiple functions
def max_minus_min(arr):
    return arr.max() - arr.min()

grouped.agg(['sum', 'mean', 'std', max_minus_min])

Unnamed: 0_level_0,data,data,data,data
Unnamed: 0_level_1,sum,mean,std,max_minus_min
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,5,2.5,2.12132,3
B,7,3.5,2.12132,3
C,9,4.5,2.12132,3


## 3. Time Series

### Datetime Object

In [19]:
# built-in `datetime` module
from datetime import datetime, timedelta
from dateutil.parser import parse

# datetime stores both the date and time down to the microsecond
datetime.now()

datetime.datetime(2019, 2, 19, 13, 35, 56, 467691)

In [20]:
# we can compute the temporal diff between two datetime objective
delta = datetime.now() - datetime(1949, 10, 1)
delta

datetime.timedelta(25343, 48964, 529624)

In [22]:
# use diff: timedelta methods
start = datetime.now()
start + timedelta(12)

datetime.datetime(2019, 3, 3, 13, 36, 33, 500756)

### Converting between string and datetime

In [28]:
# convert datetime object to a spefic "human friendly" format
datetime.now().strftime('%d-%m-%Y') # <= str 'f'ormat time

'19-02-2019'

In [29]:
# convert string to datetime object
datetime.strptime('2018-05-09', '%Y-%m-%d') # <= str 'p'arse time

datetime.datetime(2018, 5, 9, 0, 0)

In [30]:
# use dateutil package
parse('May 09, 2018, 23:59')

datetime.datetime(2018, 5, 9, 23, 59)

In [31]:
# pandas's to_datetime function
pd.to_datetime(['May 09, 2018, 23:59', '2018-05-09 23:59', None])

DatetimeIndex(['2018-05-09 23:59:00', '2018-05-09 23:59:00', 'NaT'], dtype='datetime64[ns]', freq=None)

### Use datetime object as Index

In [33]:
# mannually create timestmaps
dates = [datetime(2018, 5, 10), datetime(2018, 5, 11), datetime(2018, 5, 12)]
# the list of dates are passed as index
ts1 = pd.Series(np.random.randn(3), index=dates)
ts1

2018-05-10   -1.333611
2018-05-11    2.942947
2018-05-12   -0.547403
dtype: float64

In [35]:
# to get a fixed date index objective
# Pandas's date_range, by default, generates daily timestamps
pd.date_range('2018-05-10', '2018, May, 12') # accepts different formats...

DatetimeIndex(['2018-05-10', '2018-05-11', '2018-05-12'], dtype='datetime64[ns]', freq='D')

In [36]:
# specify start (end) date, and periods
pd.date_range(start='2018-05-10', periods=3)

DatetimeIndex(['2018-05-10', '2018-05-11', '2018-05-12'], dtype='datetime64[ns]', freq='D')

In [37]:
# specify frequency
pd.date_range('2018-05-01', '2018-08-30', freq='M') # <= month end, others 'D', 'Q'

DatetimeIndex(['2018-05-31', '2018-06-30', '2018-07-31'], dtype='datetime64[ns]', freq='M')

### Shift method

In [None]:
# create a dataset with month-start as index
index = pd.date_range('1/1/2000', periods=3, freq='MS')
ts2 = pd.Series(np.random.randn(3), index=index)

In [None]:
ts2

What if we want to create `lead` or `lag` data?

In [None]:
# shift() moves the data point forward or backward
# leaves datetime index unmodified
ts2.shift(1) 

In [None]:
# with passing a freq argument, instead of moving data,
# shift() method move timestamps
ts2.shift(9, freq='MS')

In [None]:
# Another handy function to shift datetime
# especially helpful when merging databases
from pandas.tseries.offsets import MonthEnd

datetime.now() + MonthEnd(0)