## 원하는 형태로 Data 가공하기
| 함수          | 기능              |
|---------------|-------------------|
| query()       | 행 추출           |
| df[]          | 열(변수) 추출     |
| sort_values() | 정렬              |
| groupby()     | 집단별로 나누기   |
| agg()         | 통계치 구하기     |
| merge()       | 데이터 합치기(열) |
| concat()      | 데이터 합치기(행) |

## 조건에 맞는 Data 추출

In [1]:
# Load the exam-score CSV into a DataFrame (relative path 'Data/Exam.csv').
import pandas as pd
exam = pd.read_csv('Data/Exam.csv')
# A bare name on the last line of a cell displays the DataFrame.
exam

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45


In [2]:
# Ignore warning messages so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Keep only the rows whose nclass equals 1.
exam.query('nclass == 1')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58


In [4]:
# Element-wise comparison: yields a boolean Series, True for rows whose nclass is 1.
exam['nclass'] == 1

0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: nclass, dtype: bool

In [5]:
# Keep every row whose nclass is anything other than 1.
exam.query('nclass != 1')

Unnamed: 0,id,nclass,math,english,science
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32
12,13,4,46,98,65
13,14,4,48,87,12


In [6]:
# Rows with a math score strictly greater than 50 (a score of exactly 50 is excluded).
exam.query('math > 50')

Unnamed: 0,id,nclass,math,english,science
1,2,1,60,97,60
6,7,2,80,90,45
7,8,2,90,78,25
10,11,3,65,65,65
14,15,4,75,56,78
15,16,4,58,98,65
16,17,5,65,68,98
17,18,5,80,78,90
18,19,5,89,68,87
19,20,5,78,83,58


In [10]:
# `&` requires both conditions: class 1 AND a math score of at least 50.
exam.query('nclass == 1 & math >= 50')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60


In [11]:
# `|` requires either condition: math of at least 90 OR english of at least 90.
exam.query('math >= 90 | english >= 90')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
3,4,1,30,98,58
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45
12,13,4,46,98,65
15,16,4,58,98,65


In [12]:
# Rows from class 1, 3 or 5, spelled out as a chain of `|` comparisons.
exam.query('nclass == 1 | nclass ==3 | nclass == 5')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32
16,17,5,65,68,98
17,18,5,80,78,90


In [15]:
# Same selection as the `|` chain, written more compactly with `in` and a list.
exam.query('nclass in [1,3,5]')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32
16,17,5,65,68,98
17,18,5,80,78,90


In [16]:
# Store the class 1 and class 2 rows in their own variables for reuse below.
nclass1 = exam.query('nclass == 1')
nclass2 = exam.query('nclass == 2')

In [19]:
# Mean math score of class 1, reaching the column through attribute access.
nclass1.math.mean()

46.25

In [20]:
# Mean math score of class 2, reaching the column through attribute access.
nclass2.math.mean()

61.25

In [21]:
# Mean math score of class 1 again, this time selecting the column with brackets.
nclass1['math'].mean()

46.25

In [22]:
# Mean math score of class 2 again, this time selecting the column with brackets.
nclass2['math'].mean()

61.25

In [23]:
# Class 1 mean without an intermediate variable: filter rows, pick the column, average.
exam.query('nclass == 1')['math'].mean()

46.25

In [24]:
# Class 2 mean without an intermediate variable: filter rows, pick the column, average.
exam.query('nclass == 2')['math'].mean()

61.25

In [25]:
# Small demo frame with text columns, built from one (gender, country) record per row.
df = pd.DataFrame(
    [('F', 'Korea'), ('M', 'China'), ('F', 'Japan'), ('M', 'USA')],
    columns=['gender', 'country'],
)
df

Unnamed: 0,gender,country
0,F,Korea
1,M,China
2,F,Japan
3,M,USA


In [33]:
# Text values must be quoted inside the query; the query itself uses double
# quotes so the values can use single quotes.
df.query( "gender  == 'F' & country == 'Korea' ")

Unnamed: 0,gender,country
0,F,Korea


## 변수 값을 이용해서 Query 조건 만들기

In [35]:
# Prefixing a name with `@` inside the query string refers to the Python
# variable of that name, so this selects the rows of class 3.
var = 3
exam.query('nclass == @var')

Unnamed: 0,id,nclass,math,english,science
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32


In [37]:
# Ask the user which class to show. int() converts the typed text to a number
# (and raises ValueError if the text is not a valid integer) before it is
# referenced from the query through `@a`.
a = int(input('몇반 학생의 Data ?'))
exam.query('nclass == @a')   

몇반 학생의 Data ? 3


Unnamed: 0,id,nclass,math,english,science
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32


In [38]:
## Extract only the columns you need
# Indexing with a single column label returns that column as a Series.

exam['math']

0     50
1     60
2     45
3     30
4     25
5     50
6     80
7     90
8     20
9     50
10    65
11    45
12    46
13    48
14    75
15    58
16    65
17    80
18    89
19    78
Name: math, dtype: int64

In [39]:
# Indexing with a list of labels returns a DataFrame, even for a single column.
exam[['math']]

Unnamed: 0,math
0,50
1,60
2,45
3,30
4,25
5,50
6,80
7,90
8,20
9,50


In [42]:
# Select several columns at once; they come back in the order listed.
exam[['nclass','math','english']]

Unnamed: 0,nclass,math,english
0,1,50,98
1,1,60,97
2,1,45,86
3,1,30,98
4,2,25,80
5,2,50,89
6,2,80,90
7,2,90,78
8,3,20,98
9,3,50,98


In [44]:
# Show exam without the math column. The result is not assigned, so exam keeps
# the column (later cells still read exam['math']).
exam.drop(columns = 'math')  # pass inplace=True to remove the column from exam itself


Unnamed: 0,id,nclass,english,science
0,1,1,98,50
1,2,1,97,60
2,3,1,86,78
3,4,1,98,58
4,5,2,80,65
5,6,2,89,98
6,7,2,90,45
7,8,2,78,25
8,9,3,98,15
9,10,3,98,45


In [45]:
# Pass a list to leave out several columns at once.
exam.drop(columns = [ 'math', 'english'])

Unnamed: 0,id,nclass,science
0,1,1,50
1,2,1,60
2,3,1,78
3,4,1,58
4,5,2,65
5,6,2,98
6,7,2,45
7,8,2,25
8,9,3,15
9,10,3,45


## query() 와 [] 조합

In [46]:
# Filter rows first, then pick a column: the english scores of class 1 as a Series.
exam.query('nclass == 1')['english']

0    98
1    97
2    86
3    98
Name: english, dtype: int64

In [47]:
# id and math columns of the students whose math score is at least 50.
exam.query('math >= 50')[['id','math']]

Unnamed: 0,id,math
0,1,50
1,2,60
5,6,50
6,7,80
7,8,90
9,10,50
10,11,65
14,15,75
15,16,58
16,17,65


In [48]:
# Same selection, limited by head() to the first five matching rows.
exam.query('math >= 50')[['id','math']].head()

Unnamed: 0,id,math
0,1,50
1,2,60
5,6,50
6,7,80
7,8,90


In [50]:
# Sort rows by math score; with no `ascending` argument the order is lowest first.
exam.sort_values('math')

Unnamed: 0,id,nclass,math,english,science
8,9,3,20,98,15
4,5,2,25,80,65
3,4,1,30,98,58
2,3,1,45,86,78
11,12,3,45,85,32
12,13,4,46,98,65
13,14,4,48,87,12
0,1,1,50,98,50
9,10,3,50,98,45
5,6,2,50,89,98


In [51]:
# Same ordering with the direction spelled out; use ascending=False for highest first.
exam.sort_values('math', ascending = True)

Unnamed: 0,id,nclass,math,english,science
8,9,3,20,98,15
4,5,2,25,80,65
3,4,1,30,98,58
2,3,1,45,86,78
11,12,3,45,85,32
12,13,4,46,98,65
13,14,4,48,87,12
0,1,1,50,98,50
9,10,3,50,98,45
5,6,2,50,89,98


In [57]:
# Sort by class first, then by math score within each class, both ascending.
exam.sort_values(['nclass', 'math'], ascending = True)

Unnamed: 0,id,nclass,math,english,science
3,4,1,30,98,58
2,3,1,45,86,78
0,1,1,50,98,50
1,2,1,60,97,60
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
11,12,3,45,85,32


In [58]:
# One direction per sort key: class ascending, math descending within each class.
exam.sort_values(['nclass', 'math'], ascending = [True, False])

Unnamed: 0,id,nclass,math,english,science
1,2,1,60,97,60
0,1,1,50,98,50
2,3,1,45,86,78
3,4,1,30,98,58
7,8,2,90,78,25
6,7,2,80,90,45
5,6,2,50,89,98
4,5,2,25,80,65
10,11,3,65,65,65
9,10,3,50,98,45


In [59]:
# Add a derived column to exam itself: the sum of the three subject scores.
exam['total'] = exam['math'] +  exam['english'] + exam['science']
# Preview the first five rows to check the new column.
exam.head()

Unnamed: 0,id,nclass,math,english,science,total
0,1,1,50,98,50,198
1,2,1,60,97,60,217
2,3,1,45,86,78,209
3,4,1,30,98,58,186
4,5,2,25,80,65,170


In [61]:
# assign() is a pandas DataFrame method that returns a new DataFrame with a
# column added, or with an existing column of the same name replaced.
# Here it sets `total` to the sum of the three subject scores.
exam.assign(total = exam['math'] +  exam['english'] + exam['science'])

Unnamed: 0,id,nclass,math,english,science,total
0,1,1,50,98,50,198
1,2,1,60,97,60,217
2,3,1,45,86,78,209
3,4,1,30,98,58,186
4,5,2,25,80,65,170
5,6,2,50,89,98,237
6,7,2,80,90,45,215
7,8,2,90,78,25,193
8,9,3,20,98,15,133
9,10,3,50,98,45,193


In [63]:
# Add two derived columns in a single assign() call. Keyword arguments are
# applied in order, so the callable for `mean` receives the frame that already
# holds the freshly computed `total`; the three-score sum is written only once.
exam.assign(total = exam['math'] + exam['english'] + exam['science'],
            mean = lambda scored: scored['total'] / 3)

Unnamed: 0,id,nclass,math,english,science,total,mean
0,1,1,50,98,50,198,66.0
1,2,1,60,97,60,217,72.333333
2,3,1,45,86,78,209,69.666667
3,4,1,30,98,58,186,62.0
4,5,2,25,80,65,170,56.666667
5,6,2,50,89,98,237,79.0
6,7,2,80,90,45,215,71.666667
7,8,2,90,78,25,193,64.333333
8,9,3,20,98,15,133,44.333333
9,10,3,50,98,45,193,64.333333


## assign() 에 np.where() 적용 / 조건절

In [64]:
import numpy as np

In [65]:
# np.where(condition, x, y) yields x where condition is True and y where it is
# False: a science score of 60 or more is labelled 'pass', anything lower 'fail'.
exam.assign(test = np.where (exam['science'] >= 60, 'pass','fail'))

Unnamed: 0,id,nclass,math,english,science,total,test
0,1,1,50,98,50,198,fail
1,2,1,60,97,60,217,pass
2,3,1,45,86,78,209,pass
3,4,1,30,98,58,186,fail
4,5,2,25,80,65,170,pass
5,6,2,50,89,98,237,pass
6,7,2,80,90,45,215,fail
7,8,2,90,78,25,193,fail
8,9,3,20,98,15,133,fail
9,10,3,50,98,45,193,fail


In [66]:
# Chain the steps: compute the total column, then sort by it, lowest total first.
exam.assign(total = exam['math'] + exam['english'] + exam['science']).sort_values('total')

Unnamed: 0,id,nclass,math,english,science,total
8,9,3,20,98,15,133
13,14,4,48,87,12,147
11,12,3,45,85,32,162
4,5,2,25,80,65,170
3,4,1,30,98,58,186
9,10,3,50,98,45,193
7,8,2,90,78,25,193
10,11,3,65,65,65,195
0,1,1,50,98,50,198
2,3,1,45,86,78,209


## groupby() 와 agg()

In [67]:
## Pattern: DataFrame.groupby(key column).agg(new_name = (column, function), ...)
# Here: the mean math score of each class, in a result column named mean_math.

exam.groupby('nclass').agg(mean_math = ('math', 'mean'))

Unnamed: 0_level_0,mean_math
nclass,Unnamed: 1_level_1
1,46.25
2,61.25
3,45.0
4,56.75
5,78.0


In [68]:
# Several summary statistics per class in one agg() call; each keyword names
# an output column and gives the (source column, function) to compute it from.
exam.groupby('nclass').agg(mean_math = ('math', 'mean'), # mean math score
                          sum_math = ('math', 'sum'), # sum of math scores
                          median_math = ('math', 'median'), # median math score
                          n = ('nclass', 'count')) # number of students in the class (frequency)

Unnamed: 0_level_0,mean_math,sum_math,median_math,n
nclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,46.25,185,47.5,4
2,61.25,245,65.0,4
3,45.0,180,47.5,4
4,56.75,227,53.0,4
5,78.0,312,79.0,4


In [69]:
# Mean of every remaining column per class (all columns here are numeric).
exam.groupby('nclass').mean()

Unnamed: 0_level_0,id,math,english,science,total
nclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.5,46.25,94.75,61.5,202.5
2,6.5,61.25,84.25,58.25,203.75
3,10.5,45.0,86.5,39.25,170.75
4,14.5,56.75,84.75,55.0,196.5
5,18.5,78.0,74.25,83.25,235.5
