# 데이터 랭글링 : 데이터 전처리의 한 단계

In [11]:
import pandas as pd

In [18]:
url='https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [19]:
dataframe=pd.read_csv(url)

In [20]:
dataframe.head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


## 데이터프레임 만들기

In [21]:
import pandas as pd

In [39]:
dataframe=pd.DataFrame()

In [40]:
# 열 추가하기

dataframe['Name']=['Jacky Jackson', 'Steven Stevenson']
dataframe['Age']=[38,25]
dataframe['Driver']=[True,False]

In [45]:
dataframe

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [46]:
# Add a new row to an existing DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is wrapped in a one-row DataFrame and joined with pd.concat instead.

new_person=pd.Series(['Yoo Naul',40,True], index=['Name','Age','Driver'])

# ignore_index=True renumbers the rows 0..n-1, so the new row gets index 2.
pd.concat([dataframe, pd.DataFrame([new_person])], ignore_index=True)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False
2,Yoo Naul,40,True


In [47]:
import numpy as np

In [48]:
# 넘파이 배열을 통하여 데이터를 전달하는 방법

data=[['Kim minsoo', 24,True],['Lee Chul soo',18,False]]

data

[['Kim minsoo', 24, True], ['Lee Chul soo', 18, False]]

In [49]:
matrix=np.array(data)
matrix

array([['Kim minsoo', '24', 'True'],
       ['Lee Chul soo', '18', 'False']], dtype='<U12')

In [53]:
# Build a DataFrame from the NumPy array.
# NOTE: `matrix` holds only strings (dtype '<U12' in the output above), so the
# Age and Driver columns arrive as text rather than int/bool.

pd.DataFrame(matrix, columns=['Name','Age','Driver'])

Unnamed: 0,Name,Age,Driver
0,Kim minsoo,24,True
1,Lee Chul soo,18,False


In [54]:
# 원본 데이터를 사용하여 데이터프레임을 만드는 방법

pd.DataFrame(data,columns=['Name','Age','Driver'])

Unnamed: 0,Name,Age,Driver
0,Kim minsoo,24,True
1,Lee Chul soo,18,False


In [55]:
# 딕셔너리를 통해 데이터프레임을 만드는 방법

data={'Name':['Kim minsoo','Lee Chul soo'],
     'Age':[38,25],
     'Driver':[True, False]}

pd.DataFrame(data)

Unnamed: 0,Name,Age,Driver
0,Kim minsoo,38,True
1,Lee Chul soo,25,False


In [56]:
# 딕셔너리를 샘플마다 구성한 경우에도 데이터프레임을 만들 수 있음

data=[{'Name':'Kim minsoo','Age':38,'Driver':True},
     {'Name':'Lee chul soo','Age':13,'Driver':False}]

pd.DataFrame(data)

Unnamed: 0,Name,Age,Driver
0,Kim minsoo,38,True
1,Lee chul soo,13,False


In [57]:
# index 를 매개변수로 지정할 수 있음

pd.DataFrame(data, index=['Row 1', 'Row 2'])

Unnamed: 0,Name,Age,Driver
Row 1,Kim minsoo,38,True
Row 2,Lee chul soo,13,False


## 데이터 설명하기

In [58]:
url='https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [59]:
dataframe=pd.read_csv(url)

In [60]:
dataframe.head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [62]:
# Check the dimensions as (rows, columns)

dataframe.shape

(1313, 6)

In [64]:
# Summary statistics for the numeric columns:
# count, mean, standard deviation, min, the 25%/50%/75% quartiles, max

dataframe.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


## 데이터프레임 탐색하기

In [65]:
# 첫번째 행 선택

dataframe.iloc[0]

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object

In [67]:
# 원하는 행 선택

dataframe.iloc[1:4]  # 2~4 번째 행 선택 (인덱스 넘버로는 1~3 반환)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [68]:
# Use the passenger names as the row index.
# drop=False keeps 'Name' as a regular column as well, exactly as passing
# the column itself to set_index does.

dataframe = dataframe.set_index('Name', drop=False)

In [70]:
# 인덱스가 레이블일때 확인

dataframe.loc['Allen, Miss Elisabeth Walton']

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: Allen, Miss Elisabeth Walton, dtype: object

#### loc  : 인덱스 레이블(문자열, 정수 등 값 자체)을 기준으로 선택. 슬라이싱하면 끝 레이블도 포함됨
#### iloc : 위치(0부터 시작하는 정수)를 기준으로 선택 ( ex: iloc[0]은 인덱스가 정수든 문자열이든 상관없이 첫번째 행 반환, 슬라이싱하면 끝 위치는 제외됨)

In [71]:
# Rows and columns can both be selected with label slices.
# Label slices include their end point: this returns every row up to AND
# including 'Allison, Miss Helen Loraine', and the columns 'Age' through 'Sex'.

last_name = 'Allison, Miss Helen Loraine'
dataframe.loc[:last_name, 'Age':'Sex']

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


In [72]:
dataframe[:'Allison, Miss Helen Loraine']

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Allen, Miss Elisabeth Walton","Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
"Allison, Miss Helen Loraine","Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [73]:
dataframe[['Age','Sex']].head(2)

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Allen, Miss Elisabeth Walton",29.0,female
"Allison, Miss Helen Loraine",2.0,female


## 조건에 따라 행 선택하기

In [74]:
# 'sex' 열이 'female' 인 행 중 처음 두 개 출력

dataframe[dataframe['Sex']=='female'].head(2)

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Allen, Miss Elisabeth Walton","Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
"Allison, Miss Helen Loraine","Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [77]:
# Several conditions can be combined with & (element-wise AND).
# Here: women aged 65 or older.

is_senior = dataframe['Age'] >= 65
is_female = dataframe['Sex'] == 'female'
dataframe[is_senior & is_female]

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Crosby, Mrs Edward Gifford (Catherine Elizabeth Halstead)","Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


## 값 치환하기

In [80]:
url='https://raw.githubusercontent.com/chrisalbon/sim_data/master/titanic.csv'

In [86]:
dataframe=pd.read_csv(url)

In [82]:
# female 을 woman 으로 치환

dataframe['Sex'].replace('female','Woman').head(2)

0    Woman
1    Woman
Name: Sex, dtype: object

In [88]:
# 동시에 여러 개의 값도 바꿀 수 있음

dataframe=pd.read_csv(url)
dataframe['Sex'].replace(['female','male'],['Woman','Man']).head(6)

0    Woman
1    Woman
2      Man
3    Woman
4      Man
5      Man
Name: Sex, dtype: object

In [89]:
# 전체 데이터프레임 객체에서 값을 찾아 바꿀 수도 있음

dataframe=pd.read_csv(url)
dataframe.replace(1,'One').head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29,female,One,One
1,"Allison, Miss Helen Loraine",1st,2,female,0,One
2,"Allison, Mr Hudson Joshua Creighton",1st,30,male,0,0


In [90]:
# replace 함수는 정규표현식도 인식함

dataframe=pd.read_csv(url)
dataframe.replace(r'1st','First', regex=True).head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",First,29.0,female,1,1
1,"Allison, Miss Helen Loraine",First,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",First,30.0,male,0,0


In [91]:
# 한번에 여러개의 값을 동일하게 바꿀 수도 있음

dataframe=pd.read_csv(url)
dataframe.replace(['female','male'],'person').head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,person,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,person,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,person,0,0


In [92]:
# 딕셔너리로 바꿀 값을 매핑하여 전달할 수 있음

dataframe=pd.read_csv(url)
dataframe.replace({'female':1,'male':0}).head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,1,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,1,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,0,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,1,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,0,1,0


## 열 이름 바꾸기

In [93]:
dataframe.rename(columns={'PClass':'Passenger Class'}).head(3)

Unnamed: 0,Name,Passenger Class,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [98]:
# 딕셔너리를 통하여 바꾸려는 열을 여러개 지정 가능

dataframe=pd.read_csv(url)
dataframe.rename(columns={'PClass':'Passenger Class','Sex':'Gender'}).head(5)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [99]:
# 전체 열의 이름을 동시에 바꿀때 딕셔너리를 활용하는것이 용이

import collections

column_names=collections.defaultdict(str)

In [106]:
# Fill the mapping with one key per column, numbered from 1.
# dataframe.columns is a pandas Index; iterating it yields the column names.
# enumerate replaces the hand-maintained counter.

for position, name in enumerate(dataframe.columns, start=1):
    column_names[name] = position

In [107]:
column_names

defaultdict(str,
            {'Name': 1,
             'PClass': 2,
             'Age': 3,
             'Sex': 4,
             'Survived': 5,
             'SexCode': 6})

In [108]:
# index 매개변수를 사용하여 인덱스를 바꿀 수 있음

dataframe.rename(index={0:-1}).head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
-1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [110]:
# rename also accepts a function: passing it as columns= applies it to every
# column label (index= would apply it to the row labels instead).

dataframe.rename(columns=str.lower).head(3)

Unnamed: 0,name,pclass,age,sex,survived,sexcode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


## 최소값, 최대값, 합, 평균 계산 및 개수 세기

In [111]:
# pandas ships descriptive-statistics methods on Series; report a few for Age.

age = dataframe['Age']
for label, value in (
    ('최대값: ', age.max()),
    ('최소값: ', age.min()),
    ('평균: ', age.mean()),
    ('합: ', age.sum()),
    ('카운트: ', age.count()),
):
    print(label, value)

최대값:  71.0
최소값:  0.17
평균:  30.397989417989415
합:  22980.88
카운트:  756


In [119]:
# 메소드는 전체 데이터프레임에 적용 가능

dataframe.count()

Name        1313
PClass      1313
Age          756
Sex         1313
Survived    1313
SexCode     1313
dtype: int64

#### 분산(var), 표준편차(std), 첨도(kurt), 왜도-비대칭도(skew), 평균의 표준오차(sem), 중간값(median), 공분산(cov), 상관계수(corr) 도 있음

첨도가 3에 가까우면 정규분포와 비슷, 3보다 작을 경우 정규분포보다 납작, 3보다 클 경우 더 뾰족 (단, pandas의 kurt()는 3을 뺀 초과첨도를 반환하므로 정규분포일 때 0에 가까움)

왜도가 음수일 경우 왼쪽으로 꼬리가 길고 데이터가 오른쪽에 몰려 있음 (평균 < 중앙값 < 최빈값)
       양수일 경우 오른쪽으로 꼬리가 길고 데이터가 왼쪽에 몰려 있음 (최빈값 < 중앙값 < 평균)
          
평균의 표준오차 : 표본평균의 표준편차 (표본 표준편차를 표본 크기의 제곱근으로 나눈 값)

In [122]:
# Covariance is only defined for numeric data; restrict to the numeric columns
# explicitly, because pandas >= 2.0 no longer drops text columns silently and
# raises on them instead.
dataframe.select_dtypes(include='number').cov()

Unnamed: 0,Age,Survived,SexCode
Age,203.32047,-0.430491,-0.382054
Survived,-0.430491,0.225437,0.11407
SexCode,-0.382054,0.11407,0.22823


In [123]:
# Correlation is only defined for numeric data; restrict to the numeric columns
# explicitly, because pandas >= 2.0 no longer drops text columns silently and
# raises on them instead.
dataframe.select_dtypes(include='number').corr()

Unnamed: 0,Age,Survived,SexCode
Age,1.0,-0.061254,-0.055138
Survived,-0.061254,1.0,0.502891
SexCode,-0.055138,0.502891,1.0
