In [56]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualizations
%matplotlib inline
mpl.style.use( 'ggplot' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

from __future__ import division

In [57]:
# Load the rpy2 IPython extension; the %%R and %Rpull magics used in the
# following cells come from it.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [58]:
%%R

# psych supplies describe(), used for the per-column summary below.
library(psych)

# Keep the full mtcars data frame under its own name so it can be pulled
# into Python.
cars_table <- mtcars

# Summarise only the three columns of interest.
vars <- c("mpg", "hp", "wt")
cars_summary <- describe(mtcars[vars])

In [59]:
%Rpull cars_summary
%Rpull cars_table

In [60]:
# Show the describe() summary pulled from R.
# The parenthesised form prints the same text on Python 2 and is also valid
# on Python 3, where the bare print statement is a SyntaxError.
print(cars_summary)

    vars  n   mean    sd median trimmed   mad   min    max  range skew kurtosis

mpg    1 32  20.09  6.03  19.20   19.70  5.41 10.40  33.90  23.50 0.61    -0.37

hp     2 32 146.69 68.56 123.00  141.19 77.10 52.00 335.00 283.00 0.73    -0.14

wt     3 32   3.22  0.98   3.33    3.15  0.77  1.51   5.42   3.91 0.42    -0.02

       se

mpg  1.07

hp  12.12

wt   0.17



In [61]:
# Build a pandas DataFrame from the mtcars table pulled out of the R session.
cars_df = pd.DataFrame(data=cars_table)

In [62]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
cars_df.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


In [63]:
# Structural overview: index range, column dtypes, non-null counts, memory use.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Mazda RX4 to Volvo 142E
Data columns (total 11 columns):
mpg     32 non-null float64
cyl     32 non-null float64
disp    32 non-null float64
hp      32 non-null float64
drat    32 non-null float64
wt      32 non-null float64
qsec    32 non-null float64
vs      32 non-null float64
am      32 non-null float64
gear    32 non-null float64
carb    32 non-null float64
dtypes: float64(11)
memory usage: 3.0+ KB


In [64]:
# Group by a single column: split the mpg Series by the matching cyl values,
# then average mpg within each cylinder count.
grouped = cars_df.loc[:, 'mpg'].groupby(by=cars_df.loc[:, 'cyl'])
grouped.mean()

cyl
4.0    26.663636
6.0    19.742857
8.0    15.100000
Name: mpg, dtype: float64

In [71]:
# Same result, different approach: group the whole frame by the cyl column
# first, then select mpg from the grouped object before averaging.
grouped = cars_df.groupby(by='cyl')['mpg']
grouped.mean()

cyl
4.0    26.663636
6.0    19.742857
8.0    15.100000
Name: mpg, dtype: float64

In [65]:
# Group by multiple columns: mean mpg for every (cyl, am) combination.
grouped = cars_df['mpg'].groupby(
    by=[
        cars_df['cyl'],
        cars_df['am'],
    ]
)
grouped.mean()

cyl  am 
4.0  0.0    22.900000
     1.0    28.075000
6.0  0.0    19.125000
     1.0    20.566667
8.0  0.0    15.050000
     1.0    15.400000
Name: mpg, dtype: float64

In [69]:
# Number of rows that fall into each (am, cyl) group.
cars_df.groupby(by=['am', 'cyl']).size()

am   cyl
0.0  4.0     3
     6.0     4
     8.0    12
1.0  4.0     8
     6.0     3
     8.0     2
dtype: int64

In [88]:
# Mean of every remaining column for each (am, cyl) combination.
# The string alias selects the same mean aggregation as np.mean without the
# FutureWarning recent pandas emits for NumPy callables.
cars_df.pivot_table(index=['am', 'cyl'], aggfunc='mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,carb,disp,drat,gear,hp,mpg,qsec,vs,wt
am,cyl,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,4.0,1.666667,135.866667,3.77,3.666667,84.666667,22.9,20.97,1.0,2.935
0.0,6.0,2.5,204.55,3.42,3.5,115.25,19.125,19.215,1.0,3.38875
0.0,8.0,3.083333,357.616667,3.120833,3.0,194.166667,15.05,17.1425,0.0,4.104083
1.0,4.0,1.5,93.6125,4.18375,4.25,81.875,28.075,18.45,0.875,2.04225
1.0,6.0,4.666667,155.0,3.806667,4.333333,131.666667,20.566667,16.326667,0.0,2.755
1.0,8.0,6.0,326.0,3.88,5.0,299.5,15.4,14.55,0.0,3.37


In [92]:
# Keep the (am, cyl) pivot of column means so single columns can be selected.
# The string alias selects the same mean aggregation as np.mean without the
# FutureWarning recent pandas emits for NumPy callables.
cars_df_pivot = cars_df.pivot_table(index=['am', 'cyl'], aggfunc='mean')

# Mean mpg per (am, cyl) group, indexed by the pivot's two-level index.
cars_df_pivot['mpg']

am   cyl
0.0  4.0    22.900000
     6.0    19.125000
     8.0    15.050000
1.0  4.0    28.075000
     6.0    20.566667
     8.0    15.400000
Name: mpg, dtype: float64