# Part 09

## Group by

In [3]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [4]:
# Toy frame for the group-by demos: k1 and k2 hold the grouping labels,
# data1 and data2 both hold the values 0..4.
dframe = DataFrame({
    'k1': list('XXYYZ'),
    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
    'data1': np.arange(5),
    'data2': np.arange(5),
})
dframe

Unnamed: 0,data1,data2,k1,k2
0,0,0,X,alpha
1,1,1,X,beta
2,2,2,Y,alpha
3,3,3,Y,beta
4,4,4,Z,alpha


Group one column according to a key - in this case *k1*; the resulting object is a `SeriesGroupBy` object, and aggregating it (e.g. with `mean()`) returns a series

Note: to group by more than one key with this syntax, pass a list of keys, e.g. `dframe['data1'].groupby([dframe['k1'], dframe['k2']])`

In [5]:
# Select the data1 column, then split it into groups keyed by the k1 labels.
# Nothing is computed yet; an aggregation such as .mean() is applied afterwards.
group1 = dframe['data1'].groupby(by=dframe['k1'])
group1

<pandas.core.groupby.SeriesGroupBy object at 0x7f7cd016beb8>

In [7]:
group1.mean()

k1
X    0.5
Y    2.5
Z    4.0
Name: data1, dtype: float64

In [8]:
dframe

Unnamed: 0,data1,data2,k1,k2
0,0,0,X,alpha
1,1,1,X,beta
2,2,2,Y,alpha
3,3,3,Y,beta
4,4,4,Z,alpha


Group all columns by *k1* - the resulting object is a `DataFrameGroupBy` object; aggregating it (e.g. with `mean()`) returns a data frame

In [9]:
dframe.groupby('k1')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f7cd016be80>

In [10]:
# Groupwise mean of the numeric columns (data1, data2).
# numeric_only=True keeps the string column k2 out of the aggregation:
# pandas >= 2.0 no longer drops non-numeric columns silently and would
# raise a TypeError here otherwise.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.5,0.5
Y,2.5,2.5
Z,4.0,4.0


In [11]:
dframe.groupby(['k1']).size() # groupwise count

k1
X    2
Y    2
Z    1
dtype: int64

In [12]:
dframe.groupby(['k1','k2']).mean() # group all columns by k1 and k2

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0,0
X,beta,1,1
Y,alpha,2,2
Y,beta,3,3
Z,alpha,4,4


In [13]:
dframe.groupby(['k1','k2']).size()

k1  k2   
X   alpha    1
    beta     1
Y   alpha    1
    beta     1
Z   alpha    1
dtype: int64

In [14]:
cities = np.array(['NY','LA','LA','NY','NY'])
cities

array(['NY', 'LA', 'LA', 'NY', 'NY'], dtype='<U2')

In [15]:
month = np.array(['JAN','FEB','JAN','FEB','JAN'])
month

array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'], dtype='<U3')

In [16]:
dframe['data1']

0    0
1    1
2    2
3    3
4    4
Name: data1, dtype: int64

In [17]:
# The grouping keys do not have to be columns of dframe: here they are the
# standalone NumPy arrays `cities` and `month`, each with one label per row.
g_ds1 = dframe['data1'].groupby(by=[cities, month])
g_ds1.mean()

LA  FEB    1
    JAN    2
NY  FEB    3
    JAN    2
Name: data1, dtype: int64

# Data aggregation

In [18]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [19]:
dframe_wine = pd.read_csv("data/redwines.csv") # load the wine data from a CSV file; the path is relative to the current working directory

In [20]:
type(dframe_wine)

pandas.core.frame.DataFrame

In [21]:
dframe_wine.head(12)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_so2,total_so2,density,pH,sulphates,alcohol,quality
0,6.5,0.9,0.0,1.6,0.052,9.0,17.0,0.99467,3.5,0.63,10.9,6
1,9.1,0.22,0.24,2.1,0.078,1.0,28.0,0.999,3.41,0.87,10.3,6
2,6.9,0.52,0.25,2.6,0.081,10.0,37.0,0.99685,3.46,0.5,11.0,5
3,7.3,0.59,0.26,2.0,0.08,17.0,104.0,0.99584,3.28,0.52,9.9,5
4,12.5,0.28,0.54,2.3,0.082,12.0,29.0,0.9997,3.11,1.36,9.8,7
5,5.4,0.74,0.09,1.7,0.089,16.0,26.0,0.99402,3.67,0.56,11.6,6
6,10.4,0.28,0.54,2.7,0.105,5.0,19.0,0.9988,3.25,0.63,9.5,5
7,7.9,0.4,0.3,1.8,0.157,2.0,45.0,0.99727,3.31,0.91,9.5,6
8,7.3,0.39,0.31,2.4,0.074,9.0,46.0,0.9962,3.41,0.54,9.4,6
9,9.5,0.37,0.52,2.0,0.088,12.0,51.0,0.99613,3.29,0.58,11.1,6


In [22]:
dframe_wine.shape

(1599, 12)

In [23]:
dframe_wine.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_so2', 'total_so2', 'density', 'pH', 'sulphates',
       'alcohol', 'quality'],
      dtype='object')

In [24]:
dframe_wine.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_so2,total_so2,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
dframe_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed_acidity       1599 non-null float64
volatile_acidity    1599 non-null float64
citric_acid         1599 non-null float64
residual_sugar      1599 non-null float64
chlorides           1599 non-null float64
free_so2            1599 non-null float64
total_so2           1599 non-null float64
density             1599 non-null float64
pH                  1599 non-null float64
sulphates           1599 non-null float64
alcohol             1599 non-null float64
quality             1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [25]:
dframe_wine['alcohol'].mean() # aggregating using a built-in function - in this case, mean

10.422983114446508

In [None]:
dframe_wine.alcohol.mean() # same thing

#### Writing a user-defined function

In [None]:
def my_mean(a, b):
    """Return the arithmetic mean of the two values a and b."""
    return (a + b) / 2

In [None]:
z = my_mean(2,3)

In [None]:
z

In [27]:
# creating a user-defined function to aggregate by
def max_to_min(arr):
    '''
    Return the spread of arr: its largest value minus its smallest.
    inputs: numeric array-like object providing .max() and .min()
    (e.g. a pandas Series or a NumPy array)
    '''
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [28]:
x = max_to_min(dframe_wine['pH'])
x

1.2699999999999996

In [29]:
wino = dframe_wine.groupby(['quality'])

In [11]:
wino

<pandas.core.groupby.DataFrameGroupBy object at 0x7fd53afb0860>

In [30]:
wino.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_so2,pH,residual_sugar,sulphates,total_so2,volatile_acidity
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
3,mean,9.955,0.1225,0.171,0.997464,8.36,11.0,3.398,2.635,0.57,24.9,0.8845
3,std,0.818009,0.066241,0.250664,0.002002,1.770875,9.763879,0.144052,1.401596,0.12202,16.828877,0.331256
3,min,8.4,0.061,0.0,0.99471,6.7,3.0,3.16,1.2,0.4,9.0,0.44
3,25%,9.725,0.079,0.005,0.99615,7.15,5.0,3.3125,1.875,0.5125,12.5,0.6475
3,50%,9.925,0.0905,0.035,0.997565,7.5,6.0,3.39,2.1,0.545,15.0,0.845
3,75%,10.575,0.143,0.3275,0.99877,9.875,14.5,3.495,3.1,0.615,42.5,1.01
3,max,11.0,0.267,0.66,1.0008,11.6,34.0,3.63,5.7,0.86,49.0,1.58
4,count,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0
4,mean,10.265094,0.090679,0.174151,0.996542,7.779245,12.264151,3.381509,2.69434,0.596415,36.245283,0.693962


In [31]:
wino.mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_so2,total_so2,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,8.36,0.8845,0.171,2.635,0.1225,11.0,24.9,0.997464,3.398,0.57,9.955
4,7.779245,0.693962,0.174151,2.69434,0.090679,12.264151,36.245283,0.996542,3.381509,0.596415,10.265094
5,8.167254,0.577041,0.243686,2.528855,0.092736,16.983847,56.51395,0.997104,3.304949,0.620969,9.899706
6,8.347179,0.497484,0.273824,2.477194,0.084956,15.711599,40.869906,0.996615,3.318072,0.675329,10.629519
7,8.872362,0.40392,0.375176,2.720603,0.076588,14.045226,35.020101,0.996104,3.290754,0.741256,11.465913
8,8.566667,0.423333,0.391111,2.577778,0.068444,13.277778,33.444444,0.995212,3.267222,0.767778,12.094444


`agg()` is the general-purpose method for applying an aggregation function to each group - in this case the user-defined function (UDF) `max_to_min()`...

In [32]:
wino['alcohol'].agg(max_to_min)

quality
3    2.6
4    4.1
5    6.4
6    5.6
7    4.8
8    4.2
Name: alcohol, dtype: float64

...and in this case `mean()`

In [17]:
wino.agg('mean')

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_so2,total_so2,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,8.36,0.8845,0.171,2.635,0.1225,11.0,24.9,0.997464,3.398,0.57,9.955
4,7.779245,0.693962,0.174151,2.69434,0.090679,12.264151,36.245283,0.996542,3.381509,0.596415,10.265094
5,8.167254,0.577041,0.243686,2.528855,0.092736,16.983847,56.51395,0.997104,3.304949,0.620969,9.899706
6,8.347179,0.497484,0.273824,2.477194,0.084956,15.711599,40.869906,0.996615,3.318072,0.675329,10.629519
7,8.872362,0.40392,0.375176,2.720603,0.076588,14.045226,35.020101,0.996104,3.290754,0.741256,11.465913
8,8.566667,0.423333,0.391111,2.577778,0.068444,13.277778,33.444444,0.995212,3.267222,0.767778,12.094444


In [None]:
wino['alcohol'].agg(max_to_min)

In [None]:
# Derived column: each wine's quality score divided by its alcohol value (element-wise)
dframe_wine['qual_alc_ratio'] = dframe_wine['quality'].div(dframe_wine['alcohol'])
dframe_wine.head()

# End of part 09