In [40]:
import pandas as pd
import numpy as np

<h3>DataFrame</h3><br>
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.<br>
You can think of it like a spreadsheet or SQL table, or <b><font color=red>a dict of Series objects</font></b>.<br>
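It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input:

- Dict of 1D ndarrays, lists, dicts, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame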

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments. If you pass an index and / or columns, you are guaranteeing the index and / or columns of the resulting DataFrame. Thus, a dict of Series plus a specific index will discard all data not matching up to the passed index.

If axis labels are not passed, they will be constructed from the input data based on common sense rules.

#### 0. DataFrame 만들기

Series와 달리, DataFrame은 관계형 데이터 형태를 띤다.

In [5]:
# Build a DataFrame from data, column names, and index information together:
# the keyword names become the column names, and the row index is the union
# of the two Series' indexes.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
)

df = pd.DataFrame(d)

In [8]:
d  # a dict: a set of keys, each mapped to a value that is a Series object

{'one': a    1.0
 b    2.0
 c    3.0
 dtype: float64,
 'two': a    1.0
 b    2.0
 c    3.0
 d    4.0
 dtype: float64}

In [9]:
# Display the frame built from d; "one" has no entry for label "d", so that cell is NaN.
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [6]:
# When only the needed labels are passed as index=, only those rows are kept
# (in the order given).
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [11]:
# Column names can be specified as well: columns= picks which dict keys to keep.
# "three" is not a key of d, so it shows up as an all-NaN column, and "one" is left out.
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [12]:
# Row labels of df.
print(df.index)

Index(['a', 'b', 'c', 'd'], dtype='object')


In [15]:
print(df.columns)  # the DataFrame's columns, the counterpart of a Series' name

Index(['one', 'two'], dtype='object')


In [16]:
# Of course, a DataFrame does not have to be built from Series objects.

In [17]:
# A dict of equal-length lists: with no index given, the rows are labeled 0..3.
d = dict(
    one=[1.0, 2.0, 3.0, 4.0],
    two=[4.0, 3.0, 2.0, 1.0],
)

pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [18]:
# Same dict of lists, now with explicit row labels instead of the default 0..3.
pd.DataFrame(d, index=["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [20]:
# A list of dicts works too: each dict becomes one row and its keys become columns.
d = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]

pd.DataFrame(d)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [21]:
# df is still the frame built from the dict of Series: d has been rebound since,
# but df itself was never reassigned.
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


#### 1. DataFrame indexing & slicing

In [35]:
# Column indexing: a single label inside [] returns that column as a Series.
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [36]:
# Column selection: a list of labels inside [] returns a DataFrame with those columns.
df[["one", "two"]]

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [37]:
# Row indexing by index name (label): a one-element list in .loc returns a one-row DataFrame.
df.loc[["a"]]

Unnamed: 0,one,two
a,1.0,1.0


In [38]:
# Row slicing by index name (label): a label slice in .loc includes BOTH
# endpoints, so "a":"b" returns rows a and b.
df.loc["a":"b"]

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0


In [31]:
# Row indexing by position: a single integer in .iloc returns that row as a Series.
df.iloc[0]

one    1.0
two    1.0
Name: a, dtype: float64

In [34]:
# Row slicing by position: the first two rows (the stop position is excluded).
df.iloc[:2]

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0


In [74]:
# A bare slice inside [] selects rows, not columns; here it returns the same
# first two rows as df.iloc[:2].
df[:2]

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0


#### 2. Data alignment and arithmetic

In [57]:
# Draw from a seeded generator so both frames are identical on every
# Restart & Run All. Outputs saved from an earlier unseeded run will differ
# until the notebook is re-executed.
rng = np.random.default_rng(0)
df1 = pd.DataFrame(rng.standard_normal((10, 4)), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(rng.standard_normal((7, 3)), columns=["A", "B", "C"])

In [58]:
# Show df1: 10 rows, columns A-D.
df1

Unnamed: 0,A,B,C,D
0,0.365291,-0.06249,2.490023,-0.846613
1,-1.789691,-1.036057,-1.243543,-0.555553
2,3.039058,-0.963842,-1.168469,0.266405
3,-1.472213,0.123238,0.455487,0.273248
4,1.287364,1.668074,0.925325,-0.795734
5,-1.187969,0.017282,1.26241,0.38438
6,0.430988,-1.269992,-1.103545,-0.3561
7,-0.912765,-0.756194,-0.827453,-0.105882
8,-1.034049,-1.246499,-0.830264,0.843799
9,0.682134,-0.939335,0.246522,-0.216427


In [59]:
# Show df2: 7 rows, columns A-C.
df2

Unnamed: 0,A,B,C
0,-1.139007,0.628571,-0.057574
1,0.155737,1.344527,-0.611262
2,-1.385397,-1.193048,0.103592
3,0.669321,-0.150404,-1.746662
4,2.853849,-0.464162,0.339681
5,-0.016434,-0.167564,1.083287
6,-0.415644,-0.150068,0.086988


df1 + df2 : 어떻게 될까?<br><br>
Operations between DataFrame objects <font color=red>automatically align on both the columns and the index</font> (row labels).<br>
Again, <font color=red>the resulting object will have the union of the column and row labels</font>.

In [60]:
# Aligned on both index and columns: rows 7-9 and column D exist only in df1,
# so those cells come out as NaN.
df1 + df2

Unnamed: 0,A,B,C,D
0,-0.773716,0.566082,2.432448,
1,-1.633954,0.30847,-1.854805,
2,1.653661,-2.15689,-1.064877,
3,-0.802892,-0.027167,-1.291175,
4,4.141213,1.203912,1.265006,
5,-1.204403,-0.150282,2.345697,
6,0.015345,-1.42006,-1.016557,
7,,,,
8,,,,
9,,,,


In [66]:
# Element-wise product with the same alignment: cells with no counterpart in
# df2 (rows 7-9, column D) are NaN.
df1 * df2

Unnamed: 0,A,B,C,D
0,-0.416069,-0.039279,-0.143362,
1,-0.278721,-1.393007,0.76013,
2,-4.210302,1.149909,-0.121045,
3,-0.985383,-0.018535,-0.795582,
4,3.673943,-0.774257,0.314315,
5,0.019523,-0.002896,1.367552,
6,-0.179138,0.190586,-0.095996,
7,,,,
8,,,,
9,,,,


In [67]:
# Element-wise division with the same alignment: cells with no counterpart in
# df2 (rows 7-9, column D) are NaN.
df1 / df2

Unnamed: 0,A,B,C,D
0,-0.32071,-0.099415,-43.248737,
1,-11.491767,-0.770573,2.034386,
2,-2.193637,0.807882,-11.279491,
3,-2.199561,-0.819376,-0.260776,
4,0.451097,-3.593733,2.724096,
5,72.286706,-0.103138,1.165351,
6,-1.036918,8.46275,-12.686119,
7,,,,
8,,,,
9,,,,


When doing an operation between DataFrame and Series, <b>the default behavior is to align the Series index on the DataFrame columns, <font color=red>thus broadcasting row-wise.</font></b>

In [61]:
# Subtract the first row from every row: the Series' index (A-D) is matched to
# df1's columns, so row 0 becomes all zeros.
df1 - df1.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-2.154981,-0.973567,-3.733565,0.291059
2,2.673767,-0.901352,-3.658492,1.113017
3,-1.837504,0.185727,-2.034536,1.11986
4,0.922073,1.730564,-1.564698,0.050878
5,-1.55326,0.079772,-1.227613,1.230992
6,0.065698,-1.207502,-3.593568,0.490512
7,-1.278056,-0.693705,-3.317475,0.74073
8,-1.39934,-1.18401,-3.320287,1.690411
9,0.316843,-0.876845,-2.243501,0.630186


In [64]:
# Show df1 again: it still holds the same values as when it was first displayed.
df1

Unnamed: 0,A,B,C,D
0,0.365291,-0.06249,2.490023,-0.846613
1,-1.789691,-1.036057,-1.243543,-0.555553
2,3.039058,-0.963842,-1.168469,0.266405
3,-1.472213,0.123238,0.455487,0.273248
4,1.287364,1.668074,0.925325,-0.795734
5,-1.187969,0.017282,1.26241,0.38438
6,0.430988,-1.269992,-1.103545,-0.3561
7,-0.912765,-0.756194,-0.827453,-0.105882
8,-1.034049,-1.246499,-0.830264,0.843799
9,0.682134,-0.939335,0.246522,-0.216427


In [65]:
# Repeats the element-wise product df1 * df2 already computed in an earlier cell.
df1 * df2

Unnamed: 0,A,B,C,D
0,-0.416069,-0.039279,-0.143362,
1,-0.278721,-1.393007,0.76013,
2,-4.210302,1.149909,-0.121045,
3,-0.985383,-0.018535,-0.795582,
4,3.673943,-0.774257,0.314315,
5,0.019523,-0.002896,1.367552,
6,-0.179138,0.190586,-0.095996,
7,,,,
8,,,,
9,,,,


For explicit control over the matching and broadcasting behavior, see the "Flexible binary operations" section of the pandas user guide.<br>
<font color=red>Operations with scalars are just as you would expect:</font>

In [69]:
# Scalar operations apply element-wise: every cell is multiplied by 0 and then
# incremented, giving a frame of 1.0 with df2's shape and labels.
df2 * 0 + 1

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0
3,1.0,1.0,1.0
4,1.0,1.0,1.0
5,1.0,1.0,1.0
6,1.0,1.0,1.0


#### 3. Transposing

In [71]:
# .T transposes the frame: columns A-D become the row labels and the row
# labels 0-9 become the columns.
df1.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
A,0.365291,-1.789691,3.039058,-1.472213,1.287364,-1.187969,0.430988,-0.912765,-1.034049,0.682134
B,-0.06249,-1.036057,-0.963842,0.123238,1.668074,0.017282,-1.269992,-0.756194,-1.246499,-0.939335
C,2.490023,-1.243543,-1.168469,0.455487,0.925325,1.26241,-1.103545,-0.827453,-0.830264,0.246522
D,-0.846613,-0.555553,0.266405,0.273248,-0.795734,0.38438,-0.3561,-0.105882,0.843799,-0.216427
