# 라이브러리 불러오기

In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl

# Report the installed version of each library imported above.
for library in (np, pd, sns, mpl):
    print(library.__version__)

2.2.4
2.2.3
0.13.2
3.10.1


# 샘플 데이터 가져오기

In [54]:
# Load seaborn's "iris" sample dataset and show only its first row.
iris = sns.load_dataset("iris")
iris.head(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [55]:
# Load seaborn's "tips" sample dataset and show its last three rows.
tips = sns.load_dataset("tips")
tips.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


# 결측치 확인
- 데이터가 비어 있는지?

In [56]:
# Number of missing values in each column (isna is the same check as isnull).
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [57]:
# Count the samples: shape is a (rows, columns) tuple.
iris.shape

(150, 5)

In [58]:
# Row and column counts of the tips data, as a (rows, columns) tuple.
tips.shape

(244, 7)

In [59]:
# Selecting a single column by name yields a Series.
a = tips['day']
type(a)

pandas.core.series.Series

In [60]:
# The whole table is a DataFrame.
type(tips)

pandas.core.frame.DataFrame

In [61]:
# Count how many rows fall on each day, most frequent first.
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# 상위 5개만 보는 메서드 
- nlargest

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html
- 관련 메서드...
    + DataFrame.nsmallest
    + DataFrame.sort_values
    + DataFrame.head

In [62]:
# nlargest orders the rows by the given numeric column, largest first,
# and keeps only the top 5.
iris.nlargest(5, "sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


# 필터링
- Numpy와 문법 동일

In [63]:
# Compute the average tip, then keep only the rows whose tip exceeds it.
mean_tip = tips['tip'].mean()

# Boolean-mask filtering, the same idea as NumPy's a[a > 12].
tips[tips['tip'].gt(mean_tip)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
232,11.61,3.39,Male,No,Sat,Dinner,2
234,15.53,3.00,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3


In [64]:
# Same filter written inline, without storing the mean in a variable first.
tips[tips['tip'] > tips['tip'].mean()].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


- smoker가 No인 것만 조회

In [65]:
# Keep only the rows where smoker is 'No'.
tips[tips['smoker'] == 'No']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
235,10.07,1.25,Male,No,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
242,17.82,1.75,Male,No,Sat,Dinner,2


- day가 Sat인 것만 조회

In [66]:
# Keep only the rows where day is 'Sat'.
tips[tips['day'] == 'Sat']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
19,20.65,3.35,Male,No,Sat,Dinner,3
20,17.92,4.08,Male,No,Sat,Dinner,2
21,20.29,2.75,Female,No,Sat,Dinner,2
22,15.77,2.23,Female,No,Sat,Dinner,2
23,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


- time이 Dinner인 것만 조회

In [67]:
# Keep only the rows where time is 'Dinner'.
tips[tips['time'] == 'Dinner']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [68]:
tips['day'].value_counts

<bound method IndexOpsMixin.value_counts of 0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: category
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']>

In [69]:
# The filtered frame's shape gives the number of Saturday rows.
tips[tips['day'] == 'Sat'].shape

(87, 7)

In [70]:
# Keep only the Saturday rows and renumber the index from 0;
# drop=True discards the old index instead of adding it as a column.
tips[tips['day'] == 'Sat'].reset_index(drop=True)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,20.65,3.35,Male,No,Sat,Dinner,3
1,17.92,4.08,Male,No,Sat,Dinner,2
2,20.29,2.75,Female,No,Sat,Dinner,2
3,15.77,2.23,Female,No,Sat,Dinner,2
4,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
82,35.83,4.67,Female,No,Sat,Dinner,3
83,29.03,5.92,Male,No,Sat,Dinner,3
84,27.18,2.00,Female,Yes,Sat,Dinner,2
85,22.67,2.00,Male,Yes,Sat,Dinner,2


# loc와 iloc 비교

## loc
- 특정한 값을 읽을 때

In [71]:
# Rows can also be chosen with a boolean mask, e.g. tips.loc[tips['day'] == 'Sat']
# Syntax: tips.loc[rows, columns]
# Label slices include the end label, so 0:1 returns rows 0 and 1.
tips.loc[0:1, ['total_bill', 'tip', 'day']]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


In [72]:
# Fetch everything: ':' selects all rows and all columns.
tips.loc[:, :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [73]:
# Keep only the rows whose total_bill is 11 or less, then show the first three.
tips.loc[tips['total_bill'] <= 11, :].head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2


In [79]:
# Dinner rows with a total_bill of 11 or less, done in two steps:
# first narrow down to Dinner, then filter that result by the bill.
dinner_only = tips['time'] == 'Dinner'
result = tips.loc[dinner_only, :]
small_bill = result['total_bill'] <= 11
result.loc[small_bill, :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
51,10.29,2.6,Female,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
67,3.07,1.0,Female,Yes,Sat,Dinner,1
75,10.51,1.25,Male,No,Sat,Dinner,2


In [80]:
# Two conditions at once: tips.loc[(condition1) & (condition2)]
is_dinner = tips['time'] == 'Dinner'
is_small_bill = tips['total_bill'] <= 11
tips.loc[is_dinner & is_small_bill, :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
51,10.29,2.6,Female,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
67,3.07,1.0,Female,Yes,Sat,Dinner,1
75,10.51,1.25,Male,No,Sat,Dinner,2


In [35]:
# Saturday rows, restricted to three columns, with the index renumbered from 0.
saturday_rows = tips['day'] == 'Sat'
wanted_columns = ['total_bill', 'tip', 'day']
tips.loc[saturday_rows, wanted_columns].reset_index(drop=True)

Unnamed: 0,total_bill,tip,day
0,20.65,3.35,Sat
1,17.92,4.08,Sat
2,20.29,2.75,Sat
3,15.77,2.23,Sat
4,39.42,7.58,Sat
...,...,...,...
82,35.83,4.67,Sat
83,29.03,5.92,Sat
84,27.18,2.00,Sat
85,22.67,2.00,Sat


In [34]:
# iloc selects by integer position rather than by label.
# Positional slices exclude the end, so 0:1 returns only row 0;
# columns 0, 1 and 4 are total_bill, tip and day.
tips.iloc[0:1, [0,1,4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun


In [83]:
# iris: keep rows where the species is virginica OR sepal_length >= 5,
# returning only the sepal_length, petal_length and species columns (loc-based).
is_virginica = iris['species'] == 'virginica'
is_long_sepal = iris['sepal_length'] >= 5
iris.loc[is_virginica | is_long_sepal, ['sepal_length', 'petal_length', 'species']]

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,setosa
4,5.0,1.4,setosa
5,5.4,1.7,setosa
7,5.0,1.5,setosa
10,5.4,1.5,setosa
...,...,...,...
145,6.7,5.2,virginica
146,6.3,5.0,virginica
147,6.5,5.2,virginica
148,6.2,5.4,virginica


# 파일 입출력
- csv
- excel

In [85]:
# Build the two-column iris subset shown below; a bare `c` is not defined
# anywhere in the visible notebook.
c = iris.loc[:, ['sepal_length', 'species']]
# The CSV export cell writes `result`, so bind it to this subset as well;
# otherwise `result` would still be the earlier tips filter.
result = c
c

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


## CSV

In [88]:
# Export into a sub-folder. to_csv does not create missing directories,
# so make sure the target folder exists first.
from pathlib import Path

Path("dataset").mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the file.
result.to_csv("dataset/iris_result.csv", index=False)

In [89]:
# Read the exported file back into a DataFrame and display it.
iris_df = pd.read_csv("dataset/iris_result.csv")
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica
