In [1]:
import numpy as np
import pandas as pd

## Groupby

There are many times that we want to look at summary statistics grouped by a categorical column.

In [2]:
# seaborn is used in this section for its example datasets.
# NOTE(review): consider moving this import up to the first cell with the other imports.
import seaborn as sns
# Load the iris example dataset; the groupby and covariance cells below operate on it.
iris = sns.load_dataset('iris')

In [3]:
# Grouping alone does not produce a DataFrame: it yields a DataFrameGroupBy
# object, to which an aggregation is then applied.
species_groups = iris.groupby('species')
type(species_groups)

pandas.core.groupby.generic.DataFrameGroupBy

In [7]:
# Covariance matrix of the numeric measurement columns.
# Columns are selected by dtype rather than by position (0:4), so the cell keeps
# working if the column order changes and never includes the non-numeric
# 'species' column.
iris.select_dtypes(include='number').cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,0.685694,-0.042434,1.274315,0.516271
sepal_width,-0.042434,0.189979,-0.329656,-0.121639
petal_length,1.274315,-0.329656,3.116278,1.295609
petal_width,0.516271,-0.121639,1.295609,0.581006


In [9]:
# Covariance of a single pair of columns; this equals the
# (sepal_length, sepal_width) entry of the covariance matrix above.
sepal_length = iris['sepal_length']
sepal_width = iris['sepal_width']
sepal_length.cov(sepal_width)

-0.0424340044742729

In [14]:
# Per-column aggregation spec: a column may map to one function name or a list
# of them. The result has two-level column labels (column, function).
species_summary_spec = {
    'sepal_width': ['sum', 'mean'],
    'petal_length': 'max',
}
iris.groupby('species').agg(species_summary_spec)


Unnamed: 0_level_0,sepal_width,sepal_width,petal_length
Unnamed: 0_level_1,sum,mean,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
setosa,171.4,3.428,1.9
versicolor,138.5,2.77,5.1
virginica,148.7,2.974,6.9


In [15]:
# Load the penguins example dataset; the next cell groups it by species and island.
penguins = sns.load_dataset('penguins')

In [16]:
# Mean bill length for every (species, island) pair that occurs in the data.
# Grouping on two keys gives a Series with a two-level index.
(
    penguins
    .groupby(['species', 'island'])['bill_length_mm']
    .mean()
)

species    island   
Adelie     Biscoe       38.975000
           Dream        38.501786
           Torgersen    38.950980
Chinstrap  Dream        48.833824
Gentoo     Biscoe       47.504878
Name: bill_length_mm, dtype: float64

## Combining Datasets

```pandas.merge``` connects rows in DataFrames based on one or more keys
~~~
pd.merge(dfl, dfr, on='key')
pd.merge(dfl, dfr, left_on='lkey', right_on='rkey', how='inner')
~~~

By default, ```pd.merge``` performs an "inner" join, so if the ```how``` argument is left blank it assumes inner join.  The possible values for ```how``` are:

* ```inner```: Use only the key combinations that are in both tables
* ```left```: Use all the key combinations in the left table
* ```right```: Use all the key combinations in the right table
* ```outer```: Use all the key combinations that appear in either table (the union of the keys)

In [17]:
# Left-hand table for the merge examples. Keys repeat, and key 'c' has no
# counterpart in the right-hand table.
dfl = pd.DataFrame(
    data={
        'lkey': list('bbacaab'),
        'data1': range(7),
    }
)
dfl

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [18]:
# Right-hand table for the merge examples: one row per key.
dfr = pd.DataFrame(
    data={
        'rkey': list('abd'),
        'data2': range(3),
    }
)
dfr

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [19]:
# Inner join on differently named key columns: only keys present in both tables
# survive, so the 'c' row of dfl and the 'd' row of dfr are dropped.
dfl.merge(dfr, how='inner', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,a,2,a,0
3,a,4,a,0
4,a,5,a,0
5,b,6,b,1


```pandas.concat``` concatenates or "stacks" together objects along an axis

In [20]:
# First table of states, one row per state.
df1 = pd.DataFrame(
    data={
        'state': ['Utah', 'Ohio', 'Tennessee', 'Wyoming', 'Texas'],
        'order': [45, 17, 16, 44, 28],
        'min_elev': [2180, 455, 178, 3105, 0],
        'capital': ['Salt Lake City', 'Columbus', 'Nashville', 'Cheyenne', 'Austin'],
    }
)
df1

Unnamed: 0,state,order,min_elev,capital
0,Utah,45,2180,Salt Lake City
1,Ohio,17,455,Columbus
2,Tennessee,16,178,Nashville
3,Wyoming,44,3105,Cheyenne
4,Texas,28,0,Austin


In [21]:
# Second table of states with the same four columns as df1, so the two can be
# stacked row-wise.
df2 = pd.DataFrame(
    data={
        'state': ['Nebraska', 'Maryland', 'Arizona', 'North Carolina'],
        'order': [37, 7, 48, 12],
        'min_elev': [840, 0, 72, 0],
        'capital': ['Lincoln', 'Annapolis', 'Phoenix', 'Raleigh'],
    }
)
df2

Unnamed: 0,state,order,min_elev,capital
0,Nebraska,37,840,Lincoln
1,Maryland,7,0,Annapolis
2,Arizona,48,72,Phoenix
3,North Carolina,12,0,Raleigh


In [22]:
# Stack the two state tables row-wise. Each input keeps its own row labels, so
# the combined index repeats: 0-4 from df1 followed by 0-3 from df2.
df_new = pd.concat(objs=[df1, df2], axis=0, ignore_index=False)
df_new

Unnamed: 0,state,order,min_elev,capital
0,Utah,45,2180,Salt Lake City
1,Ohio,17,455,Columbus
2,Tennessee,16,178,Nashville
3,Wyoming,44,3105,Cheyenne
4,Texas,28,0,Austin
0,Nebraska,37,840,Lincoln
1,Maryland,7,0,Annapolis
2,Arizona,48,72,Phoenix
3,North Carolina,12,0,Raleigh


In [23]:
# reset_index returns a NEW frame with a fresh 0..n-1 index; the result is only
# displayed here, not assigned, so df_new itself is unchanged.
# drop=False (the default) keeps the old labels as an 'index' column.
df_new.reset_index(drop=False)

Unnamed: 0,index,state,order,min_elev,capital
0,0,Utah,45,2180,Salt Lake City
1,1,Ohio,17,455,Columbus
2,2,Tennessee,16,178,Nashville
3,3,Wyoming,44,3105,Cheyenne
4,4,Texas,28,0,Austin
5,0,Nebraska,37,840,Lincoln
6,1,Maryland,7,0,Annapolis
7,2,Arizona,48,72,Phoenix
8,3,North Carolina,12,0,Raleigh


In [24]:
# Show df_new again: the reset_index() result above was not assigned back,
# so df_new still carries the repeated 0-4 / 0-3 row labels.
df_new

Unnamed: 0,state,order,min_elev,capital
0,Utah,45,2180,Salt Lake City
1,Ohio,17,455,Columbus
2,Tennessee,16,178,Nashville
3,Wyoming,44,3105,Cheyenne
4,Texas,28,0,Austin
0,Nebraska,37,840,Lincoln
1,Maryland,7,0,Annapolis
2,Arizona,48,72,Phoenix
3,North Carolina,12,0,Raleigh


In [25]:
# Seed the generator so df3/df4, and every output derived from them, are the
# same on each Restart & Run All. Without a seed the values change on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# df3: 4x3 standard-normal floats in columns A, B, C.
df3 = pd.DataFrame(rng.standard_normal((4, 3)), columns=["A", "B", "C"])

# df4: 5x3 integers drawn from [0, 100) in columns A, B, D. It shares A and B
# with df3 but has one more row and a different third column, which is what the
# concat examples below illustrate.
df4 = pd.DataFrame(rng.integers(0, 100, size=(5, 3)), columns=["A", "B", "D"])

In [26]:
# Inspect df3: 4 rows of random normal draws in columns A, B, C.
df3

Unnamed: 0,A,B,C
0,-0.080631,0.382381,-0.389147
1,0.595465,-1.23801,-0.315509
2,-0.514149,1.167802,0.960427
3,0.229703,0.156181,-0.563657


In [27]:
# Inspect df4: 5 rows of random integers in columns A, B, D.
df4

Unnamed: 0,A,B,D
0,74,95,17
1,53,67,86
2,35,75,91
3,7,96,67
4,44,77,12


In [28]:
# Row-wise stack of frames whose columns only partly overlap. The outer join
# keeps the union of columns (A, B, C, D) and fills the column each frame lacks
# with NaN, which is why df4's integers are shown as floats.
pd.concat(objs=[df3, df4], axis=0, join='outer')

Unnamed: 0,A,B,C,D
0,-0.080631,0.382381,-0.389147,
1,0.595465,-1.23801,-0.315509,
2,-0.514149,1.167802,0.960427,
3,0.229703,0.156181,-0.563657,
0,74.0,95.0,,17.0
1,53.0,67.0,,86.0
2,35.0,75.0,,91.0
3,7.0,96.0,,67.0
4,44.0,77.0,,12.0


In [29]:
# Side-by-side concat: rows are aligned on the index. df3 has no row 4, so its
# columns are NaN there. The result also holds two columns named A and two
# named B, one from each frame.
pd.concat(objs=[df3, df4], axis='columns', join='outer')

Unnamed: 0,A,B,C,A.1,B.1,D
0,-0.080631,0.382381,-0.389147,74,95,17
1,0.595465,-1.23801,-0.315509,53,67,86
2,-0.514149,1.167802,0.960427,35,75,91
3,0.229703,0.156181,-0.563657,7,96,67
4,,,,44,77,12
