# Data collection and GroupBy

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Small demo frame: two categorical key columns plus two numeric columns.
# The numeric values come from a locally seeded generator so that a fresh
# "Restart & Run All" reproduces the same numbers without touching NumPy's
# global random state.
rng = np.random.RandomState(42)
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.randn(5),
                   'data2': rng.randn(5)})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,0.073597,-0.396095,a,one
1,-1.027653,-0.522006,a,two
2,1.997508,-0.289185,b,one
3,-2.013951,-0.69743,b,two
4,0.692202,-0.721582,a,one


In [4]:
# Split the data1 column into groups, one per distinct key1 value
grouped = df.groupby('key1')['data1']

In [5]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000000009CDCCF8>

In [6]:
# Iterating over the grouped object yields (group label, group values) pairs
for label, members in grouped:
    print(label, members, sep='\n')

a
0    0.073597
1   -1.027653
4    0.692202
Name: data1, dtype: float64
b
2    1.997508
3   -2.013951
Name: data1, dtype: float64


In [8]:
# Mean of data1 within each key1 group
grouped.mean()

key1
a   -0.087285
b   -0.008222
Name: data1, dtype: float64

In [11]:
# Several keys can be given as a list; the resulting means are indexed by
# the (key1, key2) pair
means = df.groupby(['key1', 'key2'])['data1'].mean()

In [12]:
means

key1  key2
a     one     0.382900
      two    -1.027653
b     one     1.997508
      two    -2.013951
Name: data1, dtype: float64

In [17]:
# unstack() moves the inner index level (key2) into the columns, turning the
# Series into a DataFrame; stack() is the inverse operation
unstacked = means.unstack()
print(unstacked)
print(type(unstacked))

key2       one       two
key1                    
a     0.382900 -1.027653
b     1.997508 -2.013951
<class 'pandas.core.frame.DataFrame'>


In [18]:
# Mean of the numeric columns for each key1 group. numeric_only=True leaves
# out the string column key2, which recent pandas versions no longer drop
# silently and would otherwise raise a TypeError on.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.087285,-0.546561
b,-0.008222,-0.493307


In [19]:
# Number of non-missing values in every remaining column, per key1 group
df.groupby('key1').count()

Unnamed: 0_level_0,data1,data2,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [28]:
# Group on both keys at once; the result is indexed by the (key1, key2) pair
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.3829,-0.558838
a,two,-1.027653,-0.522006
b,one,1.997508,-0.289185
b,two,-2.013951,-0.69743


In [29]:
# Same two-key grouping, restricted to the data2 column
df.groupby(['key1','key2'])['data2'].mean()

key1  key2
a     one    -0.558838
      two    -0.522006
b     one    -0.289185
      two    -0.697430
Name: data2, dtype: float64

In [30]:
# 5x5 frame of standard-normal values with named rows and columns. A locally
# seeded generator keeps the values identical across re-runs without touching
# NumPy's global random state.
rng = np.random.RandomState(0)
df2 = pd.DataFrame(rng.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [31]:
# Column label -> colour group used for grouping columns.
# The key 'f' is mapped as well, alongside the labels 'a' through 'e'.
map_dict = dict(
    a='red',
    b='red',
    c='blue',
    d='orange',
    e='blue',
    f='sky',
)

In [32]:
# Group the columns of df2 by the colour each label maps to in map_dict and
# sum within every group. The grouping is done on the transposed frame
# because groupby(..., axis=1) is deprecated in recent pandas versions;
# transposing back restores the original row/column orientation.
df2.T.groupby(map_dict).sum().T

Unnamed: 0,blue,orange,red
Joe,0.018207,1.125612,0.941919
Steve,-2.138913,-1.108357,0.965236
Wes,1.363107,-0.358042,1.272474
Jim,0.062129,-1.04049,1.809625
Travis,-0.72144,0.745165,-1.221021


In [34]:
pd.DataFrame.stack?

In [35]:
pd.DataFrame.unstack?

In [36]:
pd.DataFrame.pivot?

In [37]:
pd.DataFrame.set_index?

# Pivot_Table

In [38]:
# Population records, one tuple per (city, year) observation, in the field
# order given by `columns`: 도시 (city), 연도 (year), 인구 (population),
# 지역 (region).
columns = ["도시", "연도", "인구", "지역"]
records = [
    ("서울", "2015", 9904312, "수도권"),
    ("서울", "2010", 9631482, "수도권"),
    ("서울", "2005", 9762546, "수도권"),
    ("부산", "2015", 3448737, "경상권"),
    ("부산", "2010", 3393191, "경상권"),
    ("부산", "2005", 3512547, "경상권"),
    ("인천", "2015", 2890451, "수도권"),
    ("인천", "2010", 2632035, "수도권"),
]
# Transpose the records into column-oriented lists keyed by column name
data = {name: list(values) for name, values in zip(columns, zip(*records))}

In [39]:
# Build the frame from `data`, with the columns in the order listed in `columns`
df1 = pd.DataFrame(data, columns = columns)

In [40]:
df1

Unnamed: 0,도시,연도,인구,지역
0,서울,2015,9904312,수도권
1,서울,2010,9631482,수도권
2,서울,2005,9762546,수도권
3,부산,2015,3448737,경상권
4,부산,2010,3393191,경상권
5,부산,2005,3512547,경상권
6,인천,2015,2890451,수도권
7,인천,2010,2632035,수도권


In [41]:
# Rotate the long table into a wide one by pivoting: one row per city (도시),
# one column per year (연도), cells filled with the population (인구).
# Keyword arguments are used because recent pandas versions no longer accept
# these parameters positionally.
df1.pivot(index="도시", columns="연도", values="인구")

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


In [44]:
# Equivalent to pivoting: index the population by (city, year), then move the
# year level into the columns
population_by_city_year = df1.set_index(["도시", "연도"])["인구"]
population_by_city_year.unstack()

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191.0,3448737.0
서울,9762546.0,9631482.0,9904312.0
인천,,2632035.0,2890451.0


In [45]:
# Example frame for pivot_table: three categorical columns (A, B, C) and one
# integer value column (D). Several rows share the same (A, B, C) combination.
df = pd.DataFrame(
    {
        "A": ["foo"] * 5 + ["bar"] * 4,
        "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
        "C": ["small", "large", "large", "small", "small",
              "large", "small", "small", "large"],
        "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    }
)

In [46]:
df

Unnamed: 0,A,B,C,D
0,foo,one,small,1
1,foo,one,large,2
2,foo,one,large,2
3,foo,two,small,3
4,foo,two,small,3
5,bar,one,large,4
6,bar,one,small,5
7,bar,two,small,6
8,bar,two,large,7


In [47]:
# pivot_table combines pivoting with a groupby: rows that share the same
# (A, B, C) combination are aggregated with aggfunc (here the sum of D)
table = df.pivot_table(
    values='D',
    index=['A', 'B'],
    columns='C',
    aggfunc='sum',
)

In [48]:
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


# Seaborn and Folium

In [49]:
import folium

In [50]:
# Base map with default tiles and zoom, centred on the given coordinates
seoul_center = [37.541, 126.986]
map_osm = folium.Map(location=seoul_center)

In [51]:
map_osm

In [53]:
# Write the map out as a standalone HTML file. The target folder is created
# first so the save does not fail on a checkout where ../data is missing.
map_path = Path('../data/map_seoul.html')
map_path.parent.mkdir(parents=True, exist_ok=True)
map_osm.save(str(map_path))

In [68]:
# Map rendered with the 'Stamen Toner' tile set.
# NOTE(review): newer folium releases may no longer accept the Stamen tile
# sets by name; confirm this still renders with the installed version.
stamen = folium.Map(
    location=[37.385, 127.125],
    zoom_start=13,
    tiles='Stamen Toner',
)

In [70]:
stamen

In [71]:
# Same location and zoom, rendered with the 'Stamen Terrain' tile set.
# NOTE(review): newer folium releases may no longer accept the Stamen tile
# sets by name; confirm this still renders with the installed version.
stamen = folium.Map(
    location=[37.385, 127.125],
    zoom_start=13,
    tiles='Stamen Terrain',
)

In [73]:
stamen

In [77]:
# Default-tile map centred on the marker position, with a cloud-icon marker
# whose popup text is "서현역" (Seohyeon Station)
station_location = [37.385195, 127.123313]
stamen = folium.Map(location=list(station_location), zoom_start=13)
station_marker = folium.Marker(
    list(station_location),
    popup="서현역",
    icon=folium.Icon(icon='cloud'),
)
station_marker.add_to(stamen)
stamen