# 데이터 프레임

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a DataFrame from a list of rows. The last row has only two values,
# so its third cell is left as a missing value.
rows = [
    ["a", "b", "c"],
    ["a", "a", "g"],
    ["a", "a"],
]
df = pd.DataFrame(data=rows)
df

Unnamed: 0,0,1,2
0,a,b,c
1,a,a,g
2,a,a,


In [4]:
# Column data is usually written as a dict: each key is a column label and
# each value holds that column's entries. `index` supplies the row labels.
scores = {
    "A": [90, 80, 70],
    "B": [85, 98, 75],
    "C": [88, 99, 77],
    "D": [87, 89, 86],
}
df1 = pd.DataFrame(scores, index=[1, 2, 3])
df1

Unnamed: 0,A,B,C,D
1,90,85,88,87
2,80,98,99,89
3,70,75,77,86


csv 데이터로부터 Dataframe 생성

In [5]:
# Load ./data/train.csv with the default parsing options, then preview the
# first five rows. With no index_col given, rows get a default 0..n-1 index.
train_data = pd.read_csv('./data/train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


read_csv 함수 파라미터
- sep : 각 데이터 값을 구별하기 위한 구분자 설정
- header : header로 사용할 행 번호 설정 (파일에 header 행이 없을 경우 None 설정)
- index_col : index로 사용할 column 설정
- usecols : 실제로 dataframe에 로딩할 columns만 설정

In [6]:
# Reload the CSV keeping only the listed columns (usecols) and using
# PassengerId as the row index (index_col) instead of a regular column.
train_data = pd.read_csv(
    './data/train.csv',
    usecols=['PassengerId', 'Survived', 'Name', 'Sex', 'Age'],
    index_col='PassengerId',
)

In [7]:
# PassengerId became the index, so it is not listed among the columns.
train_data.columns

Index(['Survived', 'Name', 'Sex', 'Age'], dtype='object')

In [8]:
# Preview the first five rows; the row labels are now PassengerId values.
train_data.head()

Unnamed: 0_level_0,Survived,Name,Sex,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,"Braund, Mr. Owen Harris",male,22.0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
3,1,"Heikkinen, Miss. Laina",female,26.0
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0
5,0,"Allen, Mr. William Henry",male,35.0


In [9]:
# size is the total number of cells (rows x columns); here 891 x 4 = 3564.
print(train_data.size)

3564


In [10]:
# Summarize the index range, per-column non-null counts and dtypes, and
# memory usage. Age has 714 non-null values out of 891 rows, so it contains
# missing data.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 34.8+ KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max). Only the
# numeric columns (Survived, Age) appear in the result.
train_data.describe()

Unnamed: 0,Survived,Age
count,891.0,714.0
mean,0.383838,29.699118
std,0.486592,14.526497
min,0.0,0.42
25%,0.0,20.125
50%,0.0,28.0
75%,1.0,38.0
max,1.0,80.0


In [12]:
# Population figures per census year, plus region and growth-rate columns.
# Each dict key becomes a column, in insertion order; rows get a default
# 0..3 integer index.
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2000002],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ['수도권', '경상권', '수도권', '경상권'],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141],
}

df3 = pd.DataFrame(data=data)
df3

Unnamed: 0,2015,2010,2005,2000,지역,2010-2015 증가율
0,9904312,9631482,9762546,9853972,수도권,0.0283
1,3448737,3393191,3512547,3655437,경상권,0.0163
2,2890451,2632035,2517680,2466338,수도권,0.0982
3,2466052,2000002,2456016,2473990,경상권,0.0141


In [13]:
# Rebuild df3 from the same `data` dict with explicit row labels and an
# explicit column order: `columns` selects and orders the dict keys, and
# `index` labels the four rows.
index = ['서울', '부산', '인천', '대구']
columns = ['지역', '2000', '2005', '2010', '2015', '2010-2015 증가율']
df3 = pd.DataFrame(data=data, index=index, columns=columns)
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,0.0283
부산,경상권,3655437,3512547,3393191,3448737,0.0163
인천,수도권,2466338,2517680,2632035,2890451,0.0982
대구,경상권,2473990,2456016,2000002,2466052,0.0141


In [14]:
# Element-wise arithmetic on a column returns a new Series (ratio -> percent);
# the result is not assigned, so df3 itself is left unchanged here.
df3['2010-2015 증가율']*100

서울    2.83
부산    1.63
인천    9.82
대구    1.41
Name: 2010-2015 증가율, dtype: float64

In [15]:
# Write the percent-scaled values back, overwriting the existing column.
growth_col = '2010-2015 증가율'
df3[growth_col] = df3[growth_col] * 100

In [16]:
# Display df3 to confirm the growth-rate column now holds percent values.
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83
부산,경상권,3655437,3512547,3393191,3448737,1.63
인천,수도권,2466338,2517680,2632035,2890451,9.82
대구,경상권,2473990,2456016,2000002,2466052,1.41


In [18]:
# Add a column: assigning a list to a new column label appends that column,
# one value per row in row order.
remarks = ['특별시', '광역시', '특례시', '특례시']
df3['비고'] = remarks
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율,비고
서울,수도권,9853972,9762546,9631482,9904312,2.83,특별시
부산,경상권,3655437,3512547,3393191,3448737,1.63,광역시
인천,수도권,2466338,2517680,2632035,2890451,9.82,특례시
대구,경상권,2473990,2456016,2000002,2466052,1.41,특례시


In [19]:
# Remove the '비고' column in place with `del`, then display the result.
del df3['비고']
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83
부산,경상권,3655437,3512547,3393191,3448737,1.63
인천,수도권,2466338,2517680,2632035,2890451,9.82
대구,경상권,2473990,2456016,2000002,2466052,1.41


In [24]:
# Add a derived column: percent change in population from 2005 to 2015,
# computed element-wise from the existing columns and rounded to 2 decimals.
pop_2005 = df3['2005']
pop_2015 = df3['2015']
df3['2005-2015 증가율'] = ((pop_2015 - pop_2005) / pop_2005 * 100).round(2)

In [26]:
# Display df3 with the newly derived '2005-2015 증가율' column.
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율,2005-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83,1.45
부산,경상권,3655437,3512547,3393191,3448737,1.63,-1.82
인천,수도권,2466338,2517680,2632035,2890451,9.82,14.81
대구,경상권,2473990,2456016,2000002,2466052,1.41,0.41


In [27]:
# Remove the derived column again in place, then display the result.
del df3['2005-2015 증가율']
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83
부산,경상권,3655437,3512547,3393191,3448737,1.63
인천,수도권,2466338,2517680,2632035,2890451,9.82
대구,경상권,2473990,2456016,2000002,2466052,1.41


In [28]:
# Use the .loc indexer to add a row: assigning to a label that does not yet
# exist appends a new row, with values given in column order.
# NOTE(review): the 2015 value 246000 is an order of magnitude below the
# other years for this row -- possibly a typo for 2460000; confirm.
gwangju_row = ['호남권', 2470000, 2456000, 2453000, 246000, 1.00]
df3.loc['광주'] = gwangju_row
df3

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83
부산,경상권,3655437,3512547,3393191,3448737,1.63
인천,수도권,2466338,2517680,2632035,2890451,9.82
대구,경상권,2473990,2456016,2000002,2466052,1.41
광주,호남권,2470000,2456000,2453000,246000,1.0


In [29]:
# Label-based slice on the row index: the end label is included, so this
# returns every row up to and including '서울' (here, just the first row).
df3[:'서울']

Unnamed: 0,지역,2000,2005,2010,2015,2010-2015 증가율
서울,수도권,9853972,9762546,9631482,9904312,2.83
