### pandas
- 데이터 분석을 위한, 사용하기 쉽고 성능이 좋은 오픈소스 Python 라이브러리
- Series : index, value로 이루어진 데이터 타입
- DataFrame : index, column(Series), value로 이루어진 데이터 타입

In [2]:
import numpy as np
import pandas as pd

#### 1. Series

In [4]:
# Draw five random integers from [0, 10) and wrap them in a Series;
# with no explicit index the rows are labelled 0..4.
data = pd.Series(data=np.random.randint(low=0, high=10, size=5))
(type(data), data)

(pandas.core.series.Series,
 0    2
 1    0
 2    7
 3    8
 4    6
 dtype: int64)

In [7]:
data.index, data.values, data.dtype

(RangeIndex(start=0, stop=5, step=1), array([2, 0, 7, 8, 6]), dtype('int64'))

In [9]:
# Overwrite the element whose index label is 3, then display the Series.
data.loc[3] = 10
data

0     2
1     0
2     7
3    10
4     6
dtype: int64

In [11]:
data * 10

0     20
1      0
2     70
3    100
4     60
dtype: int64

In [14]:
data[data > 5]

2     7
3    10
4     6
dtype: int64

In [15]:
list("ABCDE")

['A', 'B', 'C', 'D', 'E']

In [17]:
# Two Series with explicit string labels; only "D" and "E" occur in both indexes.
data1 = pd.Series(
    data=np.random.randint(low=0, high=10, size=5),
    index=["A", "B", "C", "D", "E"],
)
data2 = pd.Series(
    data=np.random.randint(low=0, high=10, size=3),
    index=["D", "E", "F"],
)
(data1, data2)

(A    8
 B    4
 C    4
 D    3
 E    2
 dtype: int64,
 D    8
 E    0
 F    7
 dtype: int64)

In [19]:
# Element-wise addition aligns on index labels; a label missing from either
# operand produces NaN, which is why the result is float64.
result = data1.add(data2)
result

A     NaN
B     NaN
C     NaN
D    11.0
E     2.0
F     NaN
dtype: float64

In [23]:
data1

A    8
B    4
C    4
D    3
E    2
dtype: int64

In [24]:
# Fill the NaN slots with data1's value for the same label.
# "F" does not exist in data1, so it is still NaN afterwards.
result = result.fillna(data1)
result

A     8.0
B     4.0
C     4.0
D    11.0
E     2.0
F     NaN
dtype: float64

In [25]:
# Fill whatever is still NaN (only "F" at this point) from data2.
result = result.fillna(data2)
result

A     8.0
B     4.0
C     4.0
D    11.0
E     2.0
F     7.0
dtype: float64

#### 2. DataFrame

In [26]:
# 1. 데이터 프레임 생성

In [29]:
# List of dictionaries: each dict is one row, keyed by column name.
data = [
    dict(name="data", email="data@gmail.com", id=1),
    dict(name="data2", email="data2@naver.com", id=2),
]

In [30]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,email,id
0,data,data@gmail.com,1
1,data2,data2@naver.com,2


In [None]:
# Dictionary of lists: each key is a column name mapped to the list of that column's values

In [32]:
df.to_dict("list")

{'name': ['data', 'data2'],
 'email': ['data@gmail.com', 'data2@naver.com'],
 'id': [1, 2]}

In [33]:
# Round-trip: export the frame as a dictionary of lists (column name -> values)
# and build a DataFrame again from that dictionary.
data = df.to_dict(orient="list")
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,email,id
0,data,data@gmail.com,1
1,data2,data2@naver.com,2


In [27]:
# 2. 데이터 선택, 수정, 삭제

In [37]:
# row : df.loc[row]
print(df.loc[0])
df.loc[:]

name               data
email    data@gmail.com
id                    1
Name: 0, dtype: object


Unnamed: 0,name,email,id
0,data,data@gmail.com,1
1,data2,data2@naver.com,2


In [None]:
# column : df[column]

In [38]:
df["name"]

0     data
1    data2
Name: name, dtype: object

In [39]:
df[["id", "name"]]

Unnamed: 0,id,name
0,1,data
1,2,data2


In [None]:
# row, column : df.loc[row, column]

In [44]:
df.loc[[0, 1], ["id", "name"]]

Unnamed: 0,id,name
0,1,data
1,2,data2


In [49]:
# 속성값
df.columns, df.index, df.values

(Index(['name', 'email', 'id'], dtype='object'),
 RangeIndex(start=0, stop=2, step=1),
 array([['data', 'data@gmail.com', 1],
        ['data2', 'data2@naver.com', 2]], dtype=object))

In [52]:
# 데이터 수정, 추가

In [54]:
df.loc[0] = ["note", "note@sk.com", 3]
df

Unnamed: 0,name,email,id
0,note,note@sk.com,3
1,data2,data2@naver.com,2


In [55]:
df.loc[2] = ["note2", "note2@sk.com", 4]
df

Unnamed: 0,name,email,id
0,note,note@sk.com,3
1,data2,data2@naver.com,2
2,note2,note2@sk.com,4


In [57]:
df["id"] = [5, 6, 7]
df

Unnamed: 0,name,email,id
0,note,note@sk.com,5
1,data2,data2@naver.com,6
2,note2,note2@sk.com,7


In [61]:
# Add a new "addr" column from a list holding one value per row.
# (Alternative kept from the original notes: df["addr"] = "seoul" assigns a single scalar instead.)
df["addr"] = ["seoul", "pusan", "incheon"]
df

Unnamed: 0,name,email,id,addr
0,note,note@sk.com,5,seoul
1,data2,data2@naver.com,6,pusan
2,note2,note2@sk.com,7,incheon


In [62]:
# 데이터 삭제

In [65]:
# Remove the "addr" column and keep working with the reduced frame.
df = df.drop(columns=["addr"])
df

Unnamed: 0,name,email,id
0,note,note@sk.com,5
1,data2,data2@naver.com,6
2,note2,note2@sk.com,7


In [70]:
# Remove the row whose index label is 2.
df = df.drop(index=[2])
df

Unnamed: 0,name,email,id
0,note,note@sk.com,5
1,data2,data2@naver.com,6


In [28]:
# 3. 데이터 프레임 함수들
# apply, concat, groupby, merge, pivot

In [74]:
df["email"]

0        note@sk.com
1    data2@naver.com
Name: email, dtype: object

In [75]:
# apply() runs the lambda on every e-mail string: take the piece that follows
# the first "@" and keep only the text before its first ".".
df["domain"] = df["email"].apply(
    lambda email: email.split("@")[1].split(".")[0])
df

Unnamed: 0,name,email,id,domain
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver


In [78]:
df1 = df.copy()
df2 = df.copy()

In [79]:
df1

Unnamed: 0,name,email,id,domain
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver


In [80]:
df2

Unnamed: 0,name,email,id,domain
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver


In [81]:
pd.concat([df1, df2])

Unnamed: 0,name,email,id,domain
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver


In [82]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,name,email,id,domain,name.1,email.1,id.1,domain.1
0,note,note@sk.com,5,sk,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver,data2,data2@naver.com,6,naver


In [83]:
# Stack the two copies row-wise. The original row labels (0, 1) are kept,
# so the combined frame contains duplicate index values.
df3 = pd.concat(objs=[df1, df2], axis="index")
df3

Unnamed: 0,name,email,id,domain
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver
0,note,note@sk.com,5,sk
1,data2,data2@naver.com,6,naver


In [87]:
df3.groupby("domain").size()

domain
naver    2
sk       2
dtype: int64

In [86]:
# Count rows per domain, name the counts "count", and turn the group key
# back into an ordinary column.
df3.groupby(by="domain").size().rename("count").reset_index()

Unnamed: 0,domain,count
0,naver,2
1,sk,2


In [97]:
# Left table for the merge example: one (name, addr) record per row.
# NOTE(review): "deagu" looks like a misspelling of "daegu"; kept as-is
# because the recorded output uses the same spelling.
df1 = pd.DataFrame(
    [("A", "seoul"), ("B", "pusan"), ("C", "incheon"), ("D", "deagu")],
    columns=["name", "addr"],
)

In [98]:
# Right table for the merge example: one (addr, population) record per row.
df2 = pd.DataFrame(
    [("seoul", 1000), ("pusan", 300), ("incheon", 200)],
    columns=["addr", "population"],
)

In [101]:
# Outer join on the shared "addr" column, named explicitly so the join key
# does not silently change if the frames later gain another common column.
# Every address from either table is kept; rows without a match
# (here "deagu") get NaN for the missing side.
pd.merge(df1, df2, on="addr", how="outer")

Unnamed: 0,name,addr,population
0,A,seoul,1000.0
1,B,pusan,300.0
2,C,incheon,200.0
3,D,deagu,


In [102]:
# pivot : titanic
# 특정 컬럼을 index, column, value로 설정해서 데이터 프레임을 출력

In [103]:
# Load the Titanic training data; "train.csv" is a relative path, so the file
# must be in the current working directory.
df = pd.read_csv("train.csv")
# Show the last two rows.
df.tail(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [105]:
# Cross-tabulate passengers: rows are Survived, columns are Pclass, and each
# cell is the number of "Name" entries in that group (len counts every row).
result = df.pivot_table(
    values="Name",
    index="Survived",
    columns="Pclass",
    aggfunc=len,
)
result

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,80,97,372
1,136,87,119


In [109]:
# Convert the counts into proportions of all passengers by dividing by the
# grand total. The first sum() adds each column, the second adds those column
# totals; this avoids np.sum(DataFrame), whose axis=None reduction is
# deprecated in pandas 2.1+.
result / result.sum().sum()

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.089787,0.108866,0.417508
1,0.152637,0.097643,0.133558
