<a href="https://colab.research.google.com/github/ckraju/python-data-analytics-2e/blob/master/Chapter_6_pandas_in_Depth_Data_Manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHAPTER 6 - PANDAS IN DEPTH - DATA MANIPULATION

## Merging

In [1]:
import numpy as np
import pandas as pd

In [2]:
frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                        'price': [12.33,11.44,33.21,13.23,33.62]})
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [3]:
frame2 = pd.DataFrame( {'id': ['pencil','pencil','ball','pen'],
                        'color':['white','red','red','black']})
frame2

Unnamed: 0,id,color
0,pencil,white
1,pencil,red
2,ball,red
3,pen,black


In [4]:
pd.merge(frame1,frame2)

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [5]:
frame1 = pd.DataFrame( {'id': ['ball','pencil','pen',',mug','ashtray'],
                        'color': ['white','red','red','black','green'],
                        'brand': ['OMG','ABC','ABC','POD','PPOD']})
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,",mug",black,POD
4,ashtray,green,PPOD


In [6]:
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],
                        'brand': ['OMG','POD','ABC','POD']})
frame2

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [7]:
pd.merge(frame1,frame2)

Unnamed: 0,id,color,brand


In [8]:
pd.merge(frame1, frame2, on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [9]:
pd.merge(frame1, frame2, on='brand')

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,pencil
1,pencil,red,ABC,ball
2,pen,red,ABC,ball
3,",mug",black,POD,pencil
4,",mug",black,POD,pen


In [10]:
frame2.columns = ['brand','sid']
frame2

Unnamed: 0,brand,sid
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [11]:
pd.merge(frame1, frame2, left_on='id', right_on='sid')

Unnamed: 0,id,color,brand_x,brand_y,sid


In [12]:
frame2.columns = ['brand','id']
pd.merge(frame1,frame2,on='id')

Unnamed: 0,id,color,brand_x,brand_y


In [13]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,
2,pen,red,ABC,
3,",mug",black,POD,
4,ashtray,green,PPOD,
5,OMG,,,pencil
6,POD,,,pencil
7,POD,,,pen
8,ABC,,,ball


In [14]:
pd.merge(frame1,frame2,on='id',how='left')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,
2,pen,red,ABC,
3,",mug",black,POD,
4,ashtray,green,PPOD,


In [15]:
pd.merge(frame1,frame2,on='id',how='right')

Unnamed: 0,id,color,brand_x,brand_y
0,OMG,,,pencil
1,POD,,,pencil
2,POD,,,pen
3,ABC,,,ball


In [16]:
pd.merge(frame1,frame2,on=['id','brand'], how='outer')

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,",mug",black,POD
4,ashtray,green,PPOD
5,OMG,,pencil
6,POD,,pencil
7,ABC,,ball
8,POD,,pen


### Merging on Index

In [17]:
pd.merge(frame1,frame2,right_index=True, left_index=True)

Unnamed: 0,id_x,color,brand_x,brand_y,id_y
0,ball,white,OMG,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,ball,ABC
3,",mug",black,POD,pen,POD


## Concatenating

In [19]:
array1 = np.arange(9).reshape((3,3))
array1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [20]:
array2 = np.arange(9).reshape((3,3))+6
array2

array([[ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [21]:
np.concatenate([array1,array2],axis=1)

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11],
       [ 6,  7,  8, 12, 13, 14]])

In [22]:
np.concatenate([array1,array2],axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [23]:
ser1 = pd.Series(np.random.rand(4), index=[1,2,3,4])
ser1

1    0.040558
2    0.443864
3    0.000813
4    0.549838
dtype: float64

In [24]:
ser2 = pd.Series(np.random.rand(4), index=[5,6,7,8])
ser2

5    0.348860
6    0.140134
7    0.003710
8    0.971789
dtype: float64

In [25]:
pd.concat([ser1,ser2])

1    0.040558
2    0.443864
3    0.000813
4    0.549838
5    0.348860
6    0.140134
7    0.003710
8    0.971789
dtype: float64

In [26]:
pd.concat([ser1,ser2], axis=1)

Unnamed: 0,0,1
1,0.040558,
2,0.443864,
3,0.000813,
4,0.549838,
5,,0.34886
6,,0.140134
7,,0.00371
8,,0.971789


In [27]:
pd.concat([ser1,ser2], axis=1, join='inner')

Unnamed: 0,0,1


In [28]:
pd.concat([ser1,ser2], keys=[1,2])

1  1    0.040558
   2    0.443864
   3    0.000813
   4    0.549838
2  5    0.348860
   6    0.140134
   7    0.003710
   8    0.971789
dtype: float64

In [29]:
pd.concat([ser1,ser2], axis=1, keys=[1,2])

Unnamed: 0,1,2
1,0.040558,
2,0.443864,
3,0.000813,
4,0.549838,
5,,0.34886
6,,0.140134
7,,0.00371
8,,0.971789


In [30]:
frame1 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[1,2,3], columns=['A','B','C'])
frame2 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[4,5,6], columns=['A','B','C'])
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.757545,0.241116,0.050638
2,0.529061,0.536698,0.976845
3,0.64252,0.606434,0.681856
4,0.185978,0.709585,0.474095
5,0.862107,0.399844,0.542653
6,0.228139,0.921003,0.723359


In [31]:
pd.concat([frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.757545,0.241116,0.050638,,,
2,0.529061,0.536698,0.976845,,,
3,0.64252,0.606434,0.681856,,,
4,,,,0.185978,0.709585,0.474095
5,,,,0.862107,0.399844,0.542653
6,,,,0.228139,0.921003,0.723359


### Combining

In [32]:
ser1 = pd.Series(np.random.rand(5), index=[1,2,3,4,5])
ser1

1    0.171492
2    0.790376
3    0.077255
4    0.219225
5    0.127506
dtype: float64

In [33]:
ser2 = pd.Series(np.random.rand(4), index=[2,4,5,6])
ser2

2    0.936108
4    0.640962
5    0.074999
6    0.610792
dtype: float64

In [34]:
ser1.combine_first(ser2)

1    0.171492
2    0.790376
3    0.077255
4    0.219225
5    0.127506
6    0.610792
dtype: float64

In [35]:
ser2.combine_first(ser1)

1    0.171492
2    0.936108
3    0.077255
4    0.640962
5    0.074999
6    0.610792
dtype: float64

In [36]:
ser1[:3].combine_first(ser2[:3])

1    0.171492
2    0.790376
3    0.077255
4    0.640962
5    0.074999
dtype: float64

## Pivoting

### Pivoting with Hierarchical Indexing

In [37]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['white','black','red'],
                     columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [38]:
ser = frame1.stack()
ser

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int64

In [39]:
ser.unstack()

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [40]:
ser.unstack(0)

Unnamed: 0,white,black,red
ball,0,3,6
pen,1,4,7
pencil,2,5,8


### Pivoting from "Long" to "Wide" Format

In [41]:
longframe = pd.DataFrame({ 'color':['white','white','white',
                                    'red','red','red',
                                    'black','black','black'],
                           'item':['ball','pen','mug',
                                   'ball','pen','mug',
                                   'ball','pen','mug'],
                           'value': np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.425909
1,white,pen,0.351534
2,white,mug,0.609364
3,red,ball,0.038219
4,red,pen,0.907997
5,red,mug,0.232209
6,black,ball,0.639073
7,black,pen,0.080654
8,black,mug,0.708395


In [42]:
wideframe = longframe.pivot('color','item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.639073,0.708395,0.080654
red,0.038219,0.232209,0.907997
white,0.425909,0.609364,0.351534


### Removing

In [43]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                       index=['white','black','red'],
                       columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [44]:
del frame1['ball']
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [45]:
frame1.drop('white')

Unnamed: 0,pen,pencil
black,4,5
red,7,8


## Data Transformation

### Removing Duplicates

In [46]:
dframe = pd.DataFrame({ 'color': ['white','white','red','red','white'],
                        'value': [2,1,3,3,2]})
dframe

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [47]:
dframe.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [48]:
dframe[dframe.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


### Replacing Values via Mapping

In [49]:
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                       'color':['white','rosso','verde','black','yellow'],
                       'price':[5.56,4.20,1.30,0.56,2.75]})
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,rosso,4.2
2,pen,verde,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [50]:
newcolors = {
    'rosso': 'red',
    'verde': 'green'
}

In [51]:
frame.replace(newcolors)

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [52]:
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
ser

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [53]:
ser.replace(np.nan,0)

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

### Adding Values via Mapping

In [54]:
frame = pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,item,color
0,ball,white
1,mug,red
2,pen,green
3,pencil,black
4,ashtray,yellow


In [55]:
price = {
    'ball': 5.56,
    'mug': 4.20,
    'bottle': 1.30,
    'scissors': 3.41,
    'pen': 1.30,
    'pencil': 0.56,
    'ashtray': 2.75
}

In [56]:
frame['price'] = frame['item'].map(price)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


### Rename the Indexes of the Axes

In [57]:
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [58]:
reindex = {
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth'
}
frame.rename(reindex)

Unnamed: 0,item,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [59]:
recolumn = {
    'item': 'object',
    'prince': 'value'
}
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,object,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [60]:
frame.rename(index={1:'first'}, columns={'item':'object'})

Unnamed: 0,object,color,price
0,ball,white,5.56
first,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [61]:
frame.rename(columns={'item':'object'}, inplace=True)

In [62]:
frame

Unnamed: 0,object,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


## Discretization and Binning

In [63]:
results = [12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]
bins = [0,25,50,75,100]
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [64]:
cat.categories

IntervalIndex([(0, 25], (25, 50], (50, 75], (75, 100]],
              closed='right',
              dtype='interval[int64]')

In [65]:
cat.codes

array([0, 1, 2, 2, 1, 3, 3, 0, 0, 2, 2, 1, 3, 0, 1, 3, 3], dtype=int8)

In [66]:
pd.value_counts(cat)

(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [67]:
bin_names = ['unlikely','less likely','likely','highly likely']
pd.cut(results, bins, labels=bin_names)

['unlikely', 'less likely', 'likely', 'likely', 'less likely', ..., 'highly likely', 'unlikely', 'less likely', 'highly likely', 'highly likely']
Length: 17
Categories (4, object): ['unlikely' < 'less likely' < 'likely' < 'highly likely']

In [68]:
pd.cut(results, 5)

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]
Length: 17
Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] <
                                    (79.8, 99.0]]

In [69]:
quintiles = pd.qcut(results, 5)
quintiles

[(2.999, 24.0], (24.0, 46.0], (62.6, 87.0], (46.0, 62.6], (24.0, 46.0], ..., (62.6, 87.0], (2.999, 24.0], (46.0, 62.6], (87.0, 99.0], (62.6, 87.0]]
Length: 17
Categories (5, interval[float64]): [(2.999, 24.0] < (24.0, 46.0] < (46.0, 62.6] < (62.6, 87.0] <
                                    (87.0, 99.0]]

In [70]:
pd.value_counts(quintiles)

(62.6, 87.0]     4
(2.999, 24.0]    4
(87.0, 99.0]     3
(46.0, 62.6]     3
(24.0, 46.0]     3
dtype: int64

### Detecting and Filtering Outliers

In [71]:
randframe = pd.DataFrame(np.random.randn(1000,3))
randframe.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,0.032305,0.004304,0.037708
std,1.05483,0.98831,1.002239
min,-3.893006,-3.552613,-3.159748
25%,-0.679832,-0.691996,-0.690895
50%,0.019999,0.057999,0.034748
75%,0.757617,0.665674,0.726385
max,2.973885,3.077075,3.129742


In [72]:
randframe.std()

0    1.054830
1    0.988310
2    1.002239
dtype: float64

In [73]:
randframe[(np.abs(randframe) > (3*randframe.std())).any(1)]

Unnamed: 0,0,1,2
39,-3.893006,0.312597,0.912853
85,-3.672099,-0.166508,-0.065704
111,1.078439,-0.223202,-3.159748
393,-0.557889,-3.552613,1.543765
568,-3.319611,-0.056222,-0.132273
814,0.416764,2.990497,-1.666249
971,0.395041,-1.206426,3.129742
976,-1.196399,3.077075,-0.017928


## Permutation

In [74]:
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
nframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [75]:
new_order = np.random.permutation(5)
new_order

array([1, 0, 3, 2, 4])

In [76]:
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
1,5,6,7,8,9
0,0,1,2,3,4
3,15,16,17,18,19
2,10,11,12,13,14
4,20,21,22,23,24


In [77]:
new_order = [3,4,2]
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


### Random Sampling

In [78]:
sample = np.random.randint(0, len(nframe), size=3)
sample

array([0, 0, 3])

In [79]:
nframe.take(sample)

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
0,0,1,2,3,4
3,15,16,17,18,19


## String Manipulation

### Built-in Methods for Manipulation of Strings

In [80]:
text = '16 Bolton Avenue , Boston'
text.split(',')

['16 Bolton Avenue ', ' Boston']

In [81]:
tokens = [s.strip() for s in text.split(',')]
tokens

['16 Bolton Avenue', 'Boston']

In [82]:
address, city = [s.strip() for s in text.split(',')]
address

'16 Bolton Avenue'

In [83]:
city

'Boston'

In [84]:
address + ',' + city

'16 Bolton Avenue,Boston'

In [85]:
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
';'.join(strings)

'A+;A;A-;B;BB;BBB;C+'

In [86]:
'Boston' in text

True

In [87]:
text.index('Boston')

19

In [88]:
text.find('Boston')

19

In [90]:
text.find('New York')

-1

In [91]:
text.count('e')

2

In [92]:
text.count('Avenue')

1

In [93]:
text.replace('Avenue','Street')

'16 Bolton Street , Boston'

In [94]:
text.replace('1','')

'6 Bolton Avenue , Boston'

### Regular Expressions

In [95]:
import re

In [96]:
text = "This is      an\t odd   \n text!"
re.split('\s+', text)

['This', 'is', 'an', 'odd', 'text!']

In [97]:
regex = re.compile('\s+')

In [98]:
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [99]:
text = 'This is my address: 16 Bolton Avenue, Boston'
re.findall('A\w+', text)

['Avenue']

In [100]:
re.findall('[A,a]\w+', text)

['address', 'Avenue']

In [101]:
re.search('[A,a]\w+', text)

<re.Match object; span=(11, 18), match='address'>

In [102]:
search = re.search('[A,a]\w+', text)
search.start()

11

In [103]:
search.end()

18

In [104]:
text[search.start():search.end()]

'address'

In [105]:
re.match('[A,a]\w+', text)

In [106]:
re.match('T\w+', text)

<re.Match object; span=(0, 4), match='This'>

In [107]:
match = re.match('T\w+', text)
text[match.start():match.end()]

'This'

## Data Aggregation

### A Practical Example

In [108]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'object': ['pen','pencil','pencil','ashtray','pen'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [109]:
group = frame['price1'].groupby(frame['color'])
group

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f09b0625050>

In [110]:
group.groups

{'green': [2, 4], 'red': [1, 3], 'white': [0]}

In [111]:
group.mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [112]:
group.sum()

color
green    4.05
red      4.76
white    5.56
Name: price1, dtype: float64

### Hierarchical Grouping

In [113]:
ggroup = frame['price1'].groupby([frame['color'],frame['object']])
ggroup.groups

{('green', 'pen'): [4], ('green', 'pencil'): [2], ('red', 'ashtray'): [3], ('red', 'pencil'): [1], ('white', 'pen'): [0]}

In [114]:
ggroup.sum()

color  object 
green  pen        2.75
       pencil     1.30
red    ashtray    0.56
       pencil     4.20
white  pen        5.56
Name: price1, dtype: float64

In [115]:
frame[['price1','price2']].groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [116]:
frame.groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


## Group Iteration

In [117]:
for name, group in frame.groupby('color'):
    print(name)
    print(group)

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen    5.56    4.75


### Chain of Transformations

In [118]:
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

pandas.core.series.Series

In [119]:
result2 = frame.groupby(frame['color']).mean()
type(result2)

pandas.core.frame.DataFrame

In [120]:
frame['price1'].groupby(frame['color']).mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [121]:
frame.groupby(frame['color'])['price1'].mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [122]:
(frame.groupby(frame['color']).mean())['price1']

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [123]:
means = frame.groupby('color').mean().add_prefix('mean_')
means

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


### Functions on Groups

In [124]:
group = frame.groupby('color')
group['price1'].quantile(0.6)

color
green    2.170
red      2.744
white    5.560
Name: price1, dtype: float64

In [125]:
def range(series):
    return series.max() - series.min()
group['price1'].agg(range)

color
green    1.45
red      3.64
white    0.00
Name: price1, dtype: float64

In [126]:
group.agg(range)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.45,1.55
red,3.64,3.37
white,0.0,0.0


In [127]:
group['price1'].agg(['mean','std',range])

Unnamed: 0_level_0,mean,std,range
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,2.025,1.025305,1.45
red,2.38,2.573869,3.64
white,5.56,,0.0


## Advanced Data Aggregation

In [128]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,price1,price2
0,white,5.56,4.75
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [129]:
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

Unnamed: 0_level_0,tot_price1,tot_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,4.05,4.75
red,4.76,4.87
white,5.56,4.75


In [130]:
pd.merge(frame, sums, left_on='color', right_index=True)

Unnamed: 0,color,price1,price2,tot_price1,tot_price2
0,white,5.56,4.75,5.56,4.75
1,red,4.2,4.12,4.76,4.87
3,red,0.56,0.75,4.76,4.87
2,green,1.3,1.6,4.05,4.75
4,green,2.75,3.15,4.05,4.75


In [131]:
frame.groupby('color').transform(np.sum).add_prefix('tot_')

Unnamed: 0,tot_price1,tot_price2
0,5.56,4.75
1,4.76,4.87
2,4.05,4.75
3,4.76,4.87
4,4.05,4.75


In [132]:
frame = pd.DataFrame({ 'color': ['white','black','white','white','black','black'],
                       'status': ['up','up','down','down','down','up'],
                       'price1': [12.33,14.55,22.34,27.84,23.40,18.33],
                       'price2': [11.23,31.80,29.99,31.18,18.25,22.44]})
frame

Unnamed: 0,color,status,price1,price2
0,white,up,12.33,11.23
1,black,up,14.55,31.8
2,white,down,22.34,29.99
3,white,down,27.84,31.18
4,black,down,23.4,18.25
5,black,up,18.33,22.44


In [133]:
frame.groupby(['color','status']).apply( lambda x: x.max())

Unnamed: 0_level_0,Unnamed: 1_level_0,color,status,price1,price2
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
black,down,black,down,23.4,18.25
black,up,black,up,18.33,31.8
white,down,white,down,27.84,31.18
white,up,white,up,12.33,11.23


In [134]:
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,color,status,price1,price2
first,white,up,12.33,11.23
second,black,up,14.55,31.8
third,white,down,22.34,29.99
fourth,white,down,27.84,31.18
fifth,black,down,23.4,18.25
5,black,up,18.33,22.44


In [135]:
temp = pd.date_range('1/1/2015', periods=10, freq= 'H')
temp

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [136]:
timeseries = pd.Series(np.random.rand(10), index=temp)
timeseries

2015-01-01 00:00:00    0.970751
2015-01-01 01:00:00    0.577875
2015-01-01 02:00:00    0.396938
2015-01-01 03:00:00    0.397027
2015-01-01 04:00:00    0.818759
2015-01-01 05:00:00    0.688307
2015-01-01 06:00:00    0.153333
2015-01-01 07:00:00    0.159567
2015-01-01 08:00:00    0.182199
2015-01-01 09:00:00    0.509585
Freq: H, dtype: float64

In [137]:
timetable = pd.DataFrame( {'date': temp, 'value1': np.random.rand(10),
                                      'value2': np.random.rand(10)})
timetable

Unnamed: 0,date,value1,value2
0,2015-01-01 00:00:00,0.197226,0.20817
1,2015-01-01 01:00:00,0.255211,0.461612
2,2015-01-01 02:00:00,0.581415,0.98312
3,2015-01-01 03:00:00,0.637749,0.205914
4,2015-01-01 04:00:00,0.736374,0.598988
5,2015-01-01 05:00:00,0.212877,0.869517
6,2015-01-01 06:00:00,0.442942,0.978964
7,2015-01-01 07:00:00,0.543694,0.737809
8,2015-01-01 08:00:00,0.482862,0.097393
9,2015-01-01 09:00:00,0.586487,0.14996


In [138]:
timetable['cat'] = ['up','down','left','left','up','up','down','right','right','up']
timetable

Unnamed: 0,date,value1,value2,cat
0,2015-01-01 00:00:00,0.197226,0.20817,up
1,2015-01-01 01:00:00,0.255211,0.461612,down
2,2015-01-01 02:00:00,0.581415,0.98312,left
3,2015-01-01 03:00:00,0.637749,0.205914,left
4,2015-01-01 04:00:00,0.736374,0.598988,up
5,2015-01-01 05:00:00,0.212877,0.869517,up
6,2015-01-01 06:00:00,0.442942,0.978964,down
7,2015-01-01 07:00:00,0.543694,0.737809,right
8,2015-01-01 08:00:00,0.482862,0.097393,right
9,2015-01-01 09:00:00,0.586487,0.14996,up
