# Chapter 3 Data Wrangling

- 작성자: 박하람
- 파트: 3.0 소개 ~ 3.10 열 삭제하기
- 데이터 랭글링(Data Wrangling): 원본 데이터를 정제하고 사용 가능한 형태로 구성하기 위한 변환 과정을 광범위하게 의미하는 비공식적 용어
- Pandas를 활용한 전처리 작업 목적

In [3]:
import pandas as pd 
# Location of the Titanic sample CSV; read it straight into a DataFrame.
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'
df = pd.read_csv(url)
# Show the first five rows.
df.head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


## 데이터프레임 만들기

In [4]:
# Build a DataFrame by feeding a NumPy array to the constructor.
import numpy as np

data = [['Haram Park', 26, True], ['Young-joo Lee', 21, False]]
# dtype=object keeps each cell's original Python type. Without it NumPy
# coerces the mixed str/int/bool rows to one string dtype, so Age and
# Driver would silently become text.
matrix = np.array(data, dtype=object)
pd.DataFrame(matrix, columns=['Name', 'Age', 'Driver'])

Unnamed: 0,Name,Age,Driver
0,Haram Park,26,True
1,Young-joo Lee,21,False


## 데이터 특징 파악하기

In [5]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(1313, 6)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


## 데이터프레임 탐색하기

In [7]:
# Select rows by integer position: positions 1 through 3 (the end position 4 is excluded).
df.iloc[1:4]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [8]:
# Select several columns at once by passing a list of column names.
df[['Age','Sex']].head()

Unnamed: 0,Age,Sex
0,29.0,female
1,2.0,female
2,30.0,male
3,25.0,female
4,0.92,male


## 조건에 따라 행 선택하기

In [9]:
# Boolean-mask filter: keep rows where Sex is 'female', then show the first two.
df[df['Sex'] == 'female'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [10]:
# Combine conditions with & (each one parenthesised): female passengers aged 65 or older.
df[(df['Sex'] == 'female') & (df['Age'] >= 65)]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


## 값 치환하기 

In [11]:
# Swap the gender labels: 'female' -> 'Woman', 'male' -> 'Man'.
df['Sex'].replace(['female','male'], ['Woman','Man']).head(5)

0    Woman
1    Woman
2      Man
3    Woman
4      Man
Name: Sex, dtype: object

In [12]:
# replace() also understands regular expressions when regex=True.
df.replace(r'1st', 'First', regex=True).head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",First,29.0,female,1,1
1,"Allison, Miss Helen Loraine",First,2.0,female,0,1


In [13]:
# Several different values can be replaced with the same word.
df.replace(['female','male'],'person').head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,person,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,person,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,person,0,0


In [14]:
# Map values with a dict: 'female' -> 1, 'male' -> 0.
df.replace({'female':1, 'male':0}).head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,1,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,1,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,0,0,0


## 열 이름 바꾸기 

In [16]:
# Rename selected columns by passing an {old_name: new_name} dict.
df.rename(columns={'PClass': 'Passenger Class', 'Sex':'Gender'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [19]:
# To rename every column at once, start from a dict keyed by the current
# column names; each value is an empty placeholder for the new name.
import collections

column_names = collections.defaultdict(str, dict.fromkeys(df.columns, ''))

column_names

defaultdict(str,
            {'Name': '',
             'PClass': '',
             'Age': '',
             'Sex': '',
             'Survived': '',
             'SexCode': ''})

In [20]:
# Rename index label 0 to -1.
df.rename(index={0:-1}).head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
-1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [21]:
# Lower-case every column name by passing a function instead of a dict.
df.rename(str.lower, axis='columns').head(2)

Unnamed: 0,name,pclass,age,sex,survived,sexcode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


## 기술통계값 계산하기

In [23]:
# Basic descriptive statistics of the Age column.
print(df['Age'].max())  # largest value
print(df['Age'].min())  # smallest value
print(df['Age'].mean())  # arithmetic mean
print(df['Age'].sum())  # total
print(df['Age'].count())  # number of non-missing values

71.0
0.17
30.397989417989418
22980.88
756


In [24]:
# Pairwise correlation between the numeric columns.
# Select the numeric columns explicitly: recent pandas releases no longer
# drop text columns such as Name automatically, and corr() fails on them.
df.select_dtypes(include='number').corr()

Unnamed: 0,Age,Survived,SexCode
Age,1.0,-0.061254,-0.055138
Survived,-0.061254,1.0,0.502891
SexCode,-0.055138,0.502891,1.0


## 고유한 값 찾기

In [25]:
# Count how many times each distinct value occurs.
df['Sex'].value_counts()

male      851
female    462
Name: Sex, dtype: int64

In [28]:
# Number of distinct values in the column.
df['PClass'].nunique()

4

In [27]:
# Number of distinct values in every column.
df.nunique()

Name        1310
PClass         4
Age           75
Sex            2
Survived       2
SexCode        2
dtype: int64

## 누락된 값 다루기

In [29]:
# Rows whose Age is missing; show the first two.
df[df['Age'].isnull()].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [30]:
# Use np.nan to turn a value into a missing value: every 'male' becomes NaN.
# The result is assigned back, so the Sex column of df is overwritten.
df['Sex'] = df['Sex'].replace('male', np.nan)

In [32]:
# Treat only specific values as missing instead of the usual NaN markers:
# keep_default_na=False turns the defaults off and na_values lists the
# custom markers ('female' here).
df = pd.read_csv(url, na_values=['female'], keep_default_na=False)
# Rows 12 and 13.
df[12:14]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


In [33]:
# Declare the values to be parsed as missing: NaN, the string 'NONE', and -999.
df = pd.read_csv(url, na_values=[np.nan, 'NONE', -999])

## 열 삭제하기 

In [34]:
# Drop several columns by name (axis=1 targets columns); df itself is left unchanged.
df.drop(['Age','Sex'], axis=1).head(2)

Unnamed: 0,Name,PClass,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,1,1
1,"Allison, Miss Helen Loraine",1st,0,1


In [35]:
# Drop a column by position: df.columns[1] is the second column (PClass).
df.drop(df.columns[1], axis=1).head(2)

Unnamed: 0,Name,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",29.0,female,1,1
1,"Allison, Miss Helen Loraine",2.0,female,0,1


## 행 삭제하기

In [36]:
# Drop a row with a boolean mask: keep every row whose index label is not 0.
df[df.index != 0].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


## 중복된 행 삭제하기

In [37]:
# Check duplicates on a subset of columns only: the first row seen for each Sex value is kept.
df.drop_duplicates(subset=['Sex'])

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


In [38]:
# Drop duplicated rows, keeping the last occurrence of each Sex value instead of the first.
df.drop_duplicates(subset=['Sex'], keep='last')

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1307,"Zabour, Miss Tamini",3rd,,female,0,1
1312,"Zimmerman, Leo",3rd,29.0,male,0,0


In [40]:
# Build a demo time series: 100,000 timestamps spaced 30 seconds apart,
# starting 2017-06-06, each with a random sale amount.
# Lower-case 's' is the seconds alias; the upper-case 'S' spelling is
# deprecated in recent pandas releases.
time_index = pd.date_range('06/06/2017', periods=100000, freq='30s')
df = pd.DataFrame(index=time_index)
# randint's upper bound is exclusive, so amounts fall in 1..9.
df['Sale_Amount'] = np.random.randint(1, 10, 100000)
df.head(3)

Unnamed: 0,Sale_Amount
2017-06-06 00:00:00,4
2017-06-06 00:00:30,3
2017-06-06 00:01:00,1


In [41]:
# Group rows into weekly bins, then sum each bin.
df.resample('W').sum()

Unnamed: 0,Sale_Amount
2017-06-11,86377
2017-06-18,101426
2017-06-25,101166
2017-07-02,101474
2017-07-09,100675
2017-07-16,10446


In [42]:
# Two-week bins, averaging each bin.
df.resample('2W').mean()

Unnamed: 0,Sale_Amount
2017-06-11,4.998669
2017-06-25,5.024603
2017-07-09,5.013616
2017-07-23,5.022115


In [43]:
# Monthly bins, labelled by the last day of the month; count the rows in each.
# NOTE(review): newer pandas releases may prefer the 'ME' spelling for
# month-end — confirm against the installed version.
df.resample('M').count()

Unnamed: 0,Sale_Amount
2017-06-30,72000
2017-07-31,28000


In [44]:
# 'MS' labels each monthly bin by the first day of the month instead.
df.resample('MS').count()

Unnamed: 0,Sale_Amount
2017-06-01,72000
2017-07-01,28000


## 열 원소 순회하기

In [45]:
import pandas as pd 
# Reload the Titanic data; df was last assigned to the sales time series.
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/titanic.csv'
df = pd.read_csv(url)
# Show the first five rows.
df.head(5)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [46]:
# Iterate over the first two names with a list comprehension, upper-casing each.
[name.upper() for name in df['Name'][0:2]]

['ALLEN, MISS ELISABETH WALTON', 'ALLISON, MISS HELEN LORAINE']

## 모든 열 원소에 함수 적용하기

In [47]:
# map() accepts a dict: each value is looked up and replaced (1 -> 'Live', 0 -> 'Dead').
df['Survived'].map({1:'Live', 0:'Dead'})[:5]

0    Live
1    Dead
2    Dead
3    Dead
4    Live
Name: Survived, dtype: object

In [48]:
# apply() forwards extra keyword arguments (age=30) to the function:
# the result is True where Age is below 30.
df['Age'].apply(lambda x, age: x < age, age=30)[:5]

0     True
1     True
2    False
3     True
4     True
Name: Age, dtype: bool

In [50]:
# apply() works on the DataFrame as a whole (the function receives one
# column at a time); applymap() works on each individual element.
print(df.apply(lambda x: max(x)))

def truncate_string(x):
    """Return the first 20 characters of *x* if it is a string.

    Non-string values yield ``None``, so applying this element-wise blanks
    out numeric cells rather than passing them through.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if isinstance(x, str):
        return x[:20]
    return None

# Slice first so only the five displayed rows are processed, not the whole frame.
print(df[:5].applymap(truncate_string))

Name        del Carlo, Mrs Sebastiano (Argenia Genovese)
PClass                                               3rd
Age                                                   71
Sex                                                 male
Survived                                               1
SexCode                                                1
dtype: object
                   Name PClass   Age     Sex Survived SexCode
0  Allen, Miss Elisabet    1st  None  female     None    None
1  Allison, Miss Helen     1st  None  female     None    None
2  Allison, Mr Hudson J    1st  None    male     None    None
3  Allison, Mrs Hudson     1st  None  female     None    None
4  Allison, Master Huds    1st  None    male     None    None
