In [37]:
#출처:https://doorbw.tistory.com/172
#pandas
import pandas as pd
import numpy as np

# 1. Pandas 자료구조
## 1-1. Series
## 1-2. DataFrame

## 1-1. Series

In [11]:
# Define a Series from a plain list; with no index given, pandas
# labels the entries with a default integer index starting at 0.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [12]:
# Inspect only the values of the Series (returned as a NumPy array).
# Note: the pandas docs recommend Series.to_numpy() / Series.array over .values.
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [13]:
# Inspect only the index of the Series.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
# Check the data type of the Series' elements.
obj.dtype

dtype('int64')

In [15]:
# Use custom labels instead of the default integer index.
obj2 = pd.Series([4, 7, -5, 3], index=list('abcd'))
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [16]:
# A Python dict converts directly into a Series:
# the dict keys become the Series index, the dict values become the data.
sdata = {
    'apple': 3000,
    'banana': 5000,
    'strawberry': 6000,
    'orange': 2000,
}
obj3 = pd.Series(data=sdata)
obj3

apple         3000
banana        5000
strawberry    6000
orange        2000
dtype: int64

In [17]:
# Name the Series itself and its index; both names appear in the repr below.
obj3.name = 'Fruits'
obj3.index.name="Names"
obj3

Names
apple         3000
banana        5000
strawberry    6000
orange        2000
Name: Fruits, dtype: int64

In [18]:
# Change the index by assigning a new list of labels.
# The previously set index name is not carried over (see the output).
obj3.index = ['A','B','C','D']
obj3

A    3000
B    5000
C    6000
D    2000
Name: Fruits, dtype: int64

## 1-2. DataFrame

In [19]:
# Define a DataFrame.
# The data that goes into it has to be prepared first, either as a
# Python dict (column name -> list of values) or as a NumPy array.
data = {
    'name': ['apple', 'banana', 'orange', 'orange', 'strawberry'],
    'price': [2000, 4000, 3000, 3000, 5000],
    'points': [1.5, 2.8, 3.3, 4.5, 1.0],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,price,points
0,apple,2000,1.5
1,banana,4000,2.8
2,orange,3000,3.3
3,orange,3000,4.5
4,strawberry,5000,1.0


In [20]:
# Rows & columns

In [21]:
# Row-direction index (the row labels).
df.index

RangeIndex(start=0, stop=5, step=1)

In [22]:
# Column-direction index (the column labels).
df.columns

Index(['name', 'price', 'points'], dtype='object')

In [23]:
# Get the values as a NumPy array (dtype=object here, as the output shows).
# Note: the pandas docs recommend DataFrame.to_numpy() over .values.
df.values

array([['apple', 2000, 1.5],
       ['banana', 4000, 2.8],
       ['orange', 3000, 3.3],
       ['orange', 3000, 4.5],
       ['strawberry', 5000, 1.0]], dtype=object)

In [24]:
# Set a name for the row index and for the column index.
df.index.name = 'Number'
df.columns.name = 'Info'
df

Info,name,price,points
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,apple,2000,1.5
1,banana,4000,2.8
2,orange,3000,3.3
3,orange,3000,4.5
4,strawberry,5000,1.0


In [25]:
# `columns` and `index` can be set while the DataFrame is being created.
# Requested columns that are not keys of `data` ('year', 'penalty') come out
# empty (NaN), and keys of `data` that are not requested are left out.
df2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'name', 'points', 'penalty'],
)
df2

Unnamed: 0,year,name,points,penalty
one,,apple,1.5,
two,,banana,2.8,
three,,orange,3.3,
four,,orange,4.5,
five,,strawberry,1.0,


In [26]:
# describe() shows a set of summary statistics (count, mean, std, min,
# quartiles, max) for the columns of the DataFrame that can be computed on.
df2.describe()

Unnamed: 0,points
count,5.0
mean,2.62
std,1.406058
min,1.0
25%,1.5
50%,2.8
75%,3.3
max,4.5


# 2. DataFrame Indexing

In [27]:
# Sample data for the indexing examples. 'year' is requested as a column
# but is not a key of `data`, so it is created empty (NaN).
data = {
    "names": ["Melon", "Apple", "Orange", "Strawberry", "WaterMelon"],
    "price": [5000, 2000, 3000, 4000, 5000],
    "points": [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = pd.DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["names", "points", "price", "year"],
)
df

Unnamed: 0,names,points,price,year
one,Melon,1.5,5000,
two,Apple,1.7,2000,
three,Orange,3.6,3000,
four,Strawberry,2.4,4000,
five,WaterMelon,2.9,5000,


## 2.1 DataFrame에서 열 선택 & 조작

In [28]:
# Select a single column by name; the result is a Series.
df['year']

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
Name: year, dtype: object

In [29]:
# Select several columns by passing a list of names; the result is a
# DataFrame with the columns in the listed order.
df[['year','names']]

Unnamed: 0,year,names
one,,Melon
two,,Apple
three,,Orange
four,,Strawberry
five,,WaterMelon


In [32]:
# A column selected as above can also be assigned to.
# A scalar is written to every row of the column ...
df['points'] = 0.5

# ... or a Python list / NumPy array can supply one value per row.
df['points'] = [0.1, 0.2, 0.3, 0.4, 0.5]

# The output recorded for this cell (and for every later cell) contains a
# `penalty` column that no remaining cell creates; it came from a cell that
# was removed. Create it here so the notebook reproduces its outputs from a
# fresh kernel. Assigning to a column name that does not exist yet appends it.
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]

df

Unnamed: 0,names,points,price,year,penalty
one,Melon,0.1,5000,,0.1
two,Apple,0.2,2000,,0.2
three,Orange,0.3,3000,,0.3
four,Strawberry,0.4,4000,,0.4
five,WaterMelon,0.5,5000,,0.5


In [33]:
# Add a new column.
# NOTE: despite the column name, np.arange(5) yields 0, 1, 2, 3, 4 - not zeros.
df['zeros'] = np.arange(5)
df

Unnamed: 0,names,points,price,year,penalty,zeros
one,Melon,0.1,5000,,0.1,0
two,Apple,0.2,2000,,0.2,1
three,Orange,0.3,3000,,0.3,2
four,Strawberry,0.4,4000,,0.4,3
five,WaterMelon,0.5,5000,,0.5,4


In [35]:
# Add a Series as a new column.
# The values are matched to rows by index label, so rows whose label is
# missing from `val` ('one' and 'five' here) are left empty (NaN).
val = pd.Series(['Red','Orange','Red'], index=['two','three','four'])
df['color']=val
df

Unnamed: 0,names,points,price,year,penalty,zeros,color
one,Melon,0.1,5000,,0.1,0,
two,Apple,0.2,2000,,0.2,1,Red
three,Orange,0.3,3000,,0.3,2,Orange
four,Strawberry,0.4,4000,,0.4,3,Red
five,WaterMelon,0.5,5000,,0.5,4,


In [36]:
# Delete a column.
# This modifies df in place, so re-running this cell fails once the
# column is gone.
del df['zeros']
df

Unnamed: 0,names,points,price,year,penalty,color
one,Melon,0.1,5000,,0.1,
two,Apple,0.2,2000,,0.2,Red
three,Orange,0.3,3000,,0.3,Orange
four,Strawberry,0.4,4000,,0.4,Red
five,WaterMelon,0.5,5000,,0.5,


## 2.2 DataFrame에서 행 선택 & 조작