# Pandas 활용하기
* 데이터 분석에 특화된 데이터 구조를 제공하는 라이브러리(테이블 형태의 DataFrame, 리스트 형태의 Series)
* 다양한 데이터 분석 함수 제공
* 데이터베이스에서 쉽게 자료를 가져올 수 있음
* json, csv, excel, html의 table 요소 등 다양한 데이터를 불러올 수 있음
* numpy를 내부적으로 사용, numpy와 pandas 간 변환이 쉬움 https://pandas.pydata.org/docs/index.html

# pandas 에서 다루는 데이터 타입 2가지
## 1) DataFreame: 2차원 형태의 표형식 데이터
* python의 dict를 기반으로 만들어진 데이터 하나의 key에 리스트 자료가 value로 들어간 형태
* {key1: [value1, value2, value3], key2: [value1, value2, value3]
* key는 데이터프레임의 컬럼명이 됨, value는 컬럼의 자료
* 데이터프레임은 python의 dict 사용법과 거의 비슷

## 2) Series: 1차원 형태의 벡터형식 데이터
* python의 list, tuple, ndarray의 1차원 상태와 비슷
* 이름이 있는 list
* list 사용법과 비슷, 인덱싱, 슬라이싱 가능, 반복문 사용 가능

In [1]:
import pandas as pd

# 데이터 프레임 만들기
* pd.DataFrame({key1: [value1, value2, value3], key2: [value1, value2, value3]})

In [5]:
df = pd.DataFrame(
    {'이름' : ['홍길동', '둘리', '또치', '도우너'],
    '주소' : ['서울', '의정부', '고양시', '성남시'],
    '취미' : ['음악감상', '놀기', '달리기', '바이올린']}
)

In [6]:
df

Unnamed: 0,이름,주소,취미
0,홍길동,서울,음악감상
1,둘리,의정부,놀기
2,또치,고양시,달리기
3,도우너,성남시,바이올린


# 넘파이 기반으로 만들어졌기 때문에 넘파이 메서드 사용 가능

In [7]:
df.shape

(4, 3)

In [8]:
df.ndim

2

In [10]:
arr = df.to_numpy()
arr

array([['홍길동', '서울', '음악감상'],
       ['둘리', '의정부', '놀기'],
       ['또치', '고양시', '달리기'],
       ['도우너', '성남시', '바이올린']], dtype=object)

In [11]:
pd.DataFrame(arr)

Unnamed: 0,0,1,2
0,홍길동,서울,음악감상
1,둘리,의정부,놀기
2,또치,고양시,달리기
3,도우너,성남시,바이올린


In [12]:
df.columns

Index(['이름', '주소', '취미'], dtype='object')

In [13]:
pd.DataFrame(arr, columns=df.columns)

Unnamed: 0,이름,주소,취미
0,홍길동,서울,음악감상
1,둘리,의정부,놀기
2,또치,고양시,달리기
3,도우너,성남시,바이올린


# 데이터 프레임 슬라이싱, .loc(인덱스명, 컬럼이름), .iloc(행인덱스, 열인덱스)

In [20]:
df.iloc[:2, :2]

Unnamed: 0,이름,주소
0,홍길동,서울
1,둘리,의정부


# 데이터프레임에서 컬럼 1개만 출력할 때
변수명[컬럼명]

In [27]:
list(df['이름'])

['홍길동', '둘리', '또치', '도우너']

In [22]:
d = {'이름' : ['홍길동', '둘리', '또치', '도우너'],
    '주소' : ['서울', '의정부', '고양시', '성남시'],
    '취미' : ['음악감상', '놀기', '달리기', '바이올린']}

In [23]:
d

{'이름': ['홍길동', '둘리', '또치', '도우너'],
 '주소': ['서울', '의정부', '고양시', '성남시'],
 '취미': ['음악감상', '놀기', '달리기', '바이올린']}

In [24]:
d['이름']

['홍길동', '둘리', '또치', '도우너']

In [28]:
for name in df['이름']:
    print(name)

홍길동
둘리
또치
도우너


In [29]:
df['이름'][1]

'둘리'

# 컬럼을 2개 이상 잘라 오고 싶을 때

In [30]:
df[['이름', '취미']]

Unnamed: 0,이름,취미
0,홍길동,음악감상
1,둘리,놀기
2,또치,달리기
3,도우너,바이올린


# 컬럼 순서를 바꾸고 싶을 때

In [31]:
df

Unnamed: 0,이름,주소,취미
0,홍길동,서울,음악감상
1,둘리,의정부,놀기
2,또치,고양시,달리기
3,도우너,성남시,바이올린


In [32]:
df[['취미', '이름', '주소']]

Unnamed: 0,취미,이름,주소
0,음악감상,홍길동,서울
1,놀기,둘리,의정부
2,달리기,또치,고양시
3,바이올린,도우너,성남시


# Series 만들기
* pd.Series([리스트/튜플], name="컬럼명"
* 이름을 가지고 있는 list/벡터
* DataFrame에서 컬럼 1개를 잘라온 형태

In [34]:
ages = pd.Series([22, 33, 53], name="나이")
ages

0    22
1    33
2    53
Name: 나이, dtype: int64

In [35]:
ages.dtype

dtype('int64')

In [36]:
ages.shape

(3,)

In [37]:
ages.ndim

1

In [38]:
ages[1]

np.int64(33)

In [39]:
ages[1:3]

1    33
2    53
Name: 나이, dtype: int64

In [40]:
ages[::-1]

2    53
1    33
0    22
Name: 나이, dtype: int64

In [41]:
ages.min()

np.int64(22)

In [42]:
ages.max()

np.int64(53)

In [43]:
ages.sum()

np.int64(108)

In [44]:
ages.cumsum()

0     22
1     55
2    108
Name: 나이, dtype: int64

In [45]:
ages.mean()

np.float64(36.0)

In [46]:
ages.median()

np.float64(33.0)

# 판다스에서 자료 불러오기, 저장하기
* csv, tsv, excel, json, html. sql
* pd.read_확장자명(파일경로, 옵션)

# excel 파일 불러오기
* xls, xlsx 두가지 파일 형식이 있음
* excel 파일을 불러올 때는 openpyxl 라이브러리를 따로 설치

In [63]:
# !pip install openpyxl

In [72]:
%%time
df_excel = pd.read_excel("./data/Online Retail.xlsx")
df_excel

FileNotFoundError: [Errno 2] No such file or directory: './data/Online Retail.xlsx'

In [75]:
df_excel = pd.read_excel("./data/아파트(매매)_실거래가_20240806113828.xlsx",header=12)
df_excel

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,NO,시군구,번지,본번,부번,단지명,전용면적(㎡),계약년월,계약일,거래금액(만원),...,층,매수자,매도자,건축년도,도로명,해제사유발생일,거래유형,중개사소재지,등기일자,주택유형
0,1,서울특별시 영등포구 당산동5가,42,42,0,당산삼성래미안4차,115.7800,202407,31,139000,...,4,개인,개인,2003,당산로 214,-,직거래,-,-,아파트
1,2,강원특별자치도 원주시 지정면 가곡리,1512,1512,0,원주롯데캐슬골드파크1차(5단지),84.9644,202407,31,38900,...,10,개인,개인,2019,신지정로 250,20240802,중개거래,강원 원주시,-,아파트
2,3,대구광역시 달성군 화원읍 구라리,1650,1650,0,대곡역래미안,84.5958,202407,31,29500,...,11,개인,개인,2007,비슬로539길 35,-,중개거래,대구 달성군,-,아파트
3,4,광주광역시 남구 진월동,312-7,312,7,광명,70.6250,202407,31,16900,...,1,개인,개인,1998,광복마을길 63,-,중개거래,광주 남구,-,아파트
4,5,광주광역시 남구 진월동,331-93,331,93,중흥,84.8400,202407,31,17500,...,13,개인,개인,1992,서문대로749번마길 30,-,직거래,-,24.07.31,아파트
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37575,37576,서울특별시 송파구 장지동,849,849,0,송파파인타운3단지,84.9800,202407,1,117500,...,14,개인,개인,2008,충민로 152,-,중개거래,서울 송파구,-,아파트
37576,37577,부산광역시 사상구 모라동,552,552,0,백양그린,41.3000,202407,1,7000,...,8,개인,개인,1992,모라로110번길 88,-,중개거래,부산 사상구,24.07.10,아파트
37577,37578,광주광역시 광산구 비아동,152-5,152,5,호반,59.8800,202407,1,15400,...,5,개인,개인,1998,비아로 185,-,중개거래,광주 광산구,24.07.26,아파트
37578,37579,부산광역시 사상구 모라동,552,552,0,백양그린,41.3000,202407,1,7500,...,10,개인,개인,1992,모라로110번길 88,-,중개거래,부산 사상구,24.07.19,아파트


# csv 파일 읽어오기 (, 로 구분된 자료가 있는 파일)

In [67]:
%%time
df_csv = pd.read_csv("./data/Online Retail.csv")
df_csv

CPU times: total: 766 ms
Wall time: 790 ms


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [77]:
df_csv2 = pd.read_csv("./data/아파트(매매)_실거래가_20240806113828.csv", header=12, usecols=[1, 5, 6, 7, 8, 9])
df_csv2

Unnamed: 0,시군구,단지명,전용면적(㎡),계약년월,계약일,거래금액(만원)
0,서울특별시 영등포구 당산동5가,당산삼성래미안4차,115.7800,202407,31,139000
1,강원특별자치도 원주시 지정면 가곡리,원주롯데캐슬골드파크1차(5단지),84.9644,202407,31,38900
2,대구광역시 달성군 화원읍 구라리,대곡역래미안,84.5958,202407,31,29500
3,광주광역시 남구 진월동,광명,70.6250,202407,31,16900
4,광주광역시 남구 진월동,중흥,84.8400,202407,31,17500
...,...,...,...,...,...,...
37575,서울특별시 송파구 장지동,송파파인타운3단지,84.9800,202407,1,117500
37576,부산광역시 사상구 모라동,백양그린,41.3000,202407,1,7000
37577,광주광역시 광산구 비아동,호반,59.8800,202407,1,15400
37578,부산광역시 사상구 모라동,백양그린,41.3000,202407,1,7500


# json 파일 읽어오기

In [78]:
df_json = pd.read_json("./data/서울특별시_관광지입장정보_2011_2016.json")
df_json

Unnamed: 0,ForNum,NatNum,addrCd,gungu,resNm,rnum,sido,yyyymm
0,44722,75991,1111,종로구,창덕궁,1,서울특별시,201112
1,0,11017,1111,종로구,운현궁,2,서울특별시,201112
2,132399,237330,1111,종로구,경복궁,3,서울특별시,201112
3,3133,21267,1111,종로구,창경궁,4,서울특별시,201112
4,18226,24223,1111,종로구,종묘,5,서울특별시,201112
...,...,...,...,...,...,...,...,...
62,5588,44232,1141,서대문구,서대문형무소역사관,8,서울특별시,201512
63,0,22587,1141,서대문구,서대문자연사박물관,9,서울특별시,201512
64,29520,8822,1144,마포구,트릭아이미술관,10,서울특별시,201512
65,3,1647,1165,서초구,헌릉ㆍ인릉,11,서울특별시,201512


# HTML에서 table 데이터 읽어오기

In [80]:
df_html = pd.read_html("./data/corpList.htm")
df_html[0]

  df_html = pd.read_html("./data/corpList.htm")


ImportError: Missing optional dependency 'html5lib'.  Use pip or conda to install html5lib.

# DB에서 자료 읽어오기

In [81]:
from sqlalchemy import create_engine, text
import pymysql
pymysql.install_as_MySQLdb()

In [84]:
# !pip install cryptography

In [85]:
engine = create_engine("mysql+pymysql://root1234@localhost3306/ex_rate")
conn = engine.connect()
df_sql = pd.read_sql("ex_rate", con=conn)
conn.close()
df_sql

OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'localhost3306' ([Errno 11001] getaddrinfo failed)")
(Background on this error at: https://sqlalche.me/e/20/e3q8)

# 타이타닉 데이터셋으로 pandas 기능 익히기

In [86]:
df = pd.read_csv("./data/Titanic_train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 데이터의 일부만 보기 head(), tail()

In [87]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [88]:
df.head(50)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [89]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [90]:
df.tail(61)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
830,831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.00,1,0,2659,14.4542,,C
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S
832,833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C
833,834,0,3,"Augustsson, Mr. Albert",male,23.00,0,0,347468,7.8542,,S
834,835,0,3,"Allum, Mr. Owen George",male,18.00,0,0,2223,8.3000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.0000,C148,C


# 각 컬럼의 데이터 타입.dtypes

In [91]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# 컬럼명, non이 아닌 데이터 수, 데이터 타입 .info()

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# 숫자로 된 자료의 기초 통계량을 보고 싶을 때 .describe()

In [94]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [95]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [96]:
describe_result = df.describe(include='all')

In [97]:
describe_result[['Name', 'Embarked']]

Unnamed: 0,Name,Embarked
count,891,889
unique,891,3
top,"Dooley, Mr. Patrick",S
freq,1,644
mean,,
std,,
min,,
25%,,
50%,,
75%,,


In [99]:
describe_result[['Name', 'Embarked']].loc["count", 'Name']

np.int64(891)

In [100]:
describe_result.loc["25%", :]

PassengerId     223.5
Survived          0.0
Pclass            2.0
Name              NaN
Sex               NaN
Age            20.125
SibSp             0.0
Parch             0.0
Ticket            NaN
Fare           7.9104
Cabin             NaN
Embarked          NaN
Name: 25%, dtype: object

# 데이터 프레임에서 일부 컬럼만 가져오기

# 1개 컬럼만 가져오는 경우

In [101]:
df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

# 2개 이상의 컬럼을 가져오는 경우

In [102]:
df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
...,...,...
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0


# 컬럼명 추출하기 .columns

In [103]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [104]:
df[['Name', 'Age', 'Pclass', 'Survived']]

Unnamed: 0,Name,Age,Pclass,Survived
0,"Braund, Mr. Owen Harris",22.0,3,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,1
2,"Heikkinen, Miss. Laina",26.0,3,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,1
4,"Allen, Mr. William Henry",35.0,3,0
...,...,...,...,...
886,"Montvila, Rev. Juozas",27.0,2,0
887,"Graham, Miss. Margaret Edith",19.0,1,1
888,"Johnston, Miss. Catherine Helen ""Carrie""",,3,0
889,"Behr, Mr. Karl Howell",26.0,1,1


# loc, iloc를 사용해서 데이터 프레임 잘라오기
* loc: 데이터프레임의 인덱스명, 컬럼명으로 데이터의 일부를 추출
    * df.loc[시작row인덱스명 : 끝row인덱스명, 시작cloumn의 컬럼명 : 끝 column의 컬럼명]
* iloc: 데이터프레임의 인덱스 번호, 컬럼의 인덱스 번호로 데이터의 일부를 추출
    * df.iloc[시작인덱스번호 : 끝인덱스번호+1 : step, 시작column인덱스번호 : 끝 column 인덱스 번호 +1 : step]

In [105]:
df.loc[0:10, ['Name', 'Age', ' Pclass', 'Survived']]

KeyError: "[' Pclass'] not in index"

In [106]:
df.iloc[0:10, [3,5,2,1]]

Unnamed: 0,Name,Age,Pclass,Survived
0,"Braund, Mr. Owen Harris",22.0,3,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,1
2,"Heikkinen, Miss. Laina",26.0,3,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,1
4,"Allen, Mr. William Henry",35.0,3,0
5,"Moran, Mr. James",,3,0
6,"McCarthy, Mr. Timothy J",54.0,1,0
7,"Palsson, Master. Gosta Leonard",2.0,3,0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,3,1
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,2,1


In [108]:
df.iloc[::-1, ::-1]

Unnamed: 0,Embarked,Cabin,Fare,Ticket,Parch,SibSp,Age,Sex,Name,Pclass,Survived,PassengerId
890,Q,,7.7500,370376,0,0,32.0,male,"Dooley, Mr. Patrick",3,0,891
889,C,C148,30.0000,111369,0,0,26.0,male,"Behr, Mr. Karl Howell",1,1,890
888,S,,23.4500,W./C. 6607,2,1,,female,"Johnston, Miss. Catherine Helen ""Carrie""",3,0,889
887,S,B42,30.0000,112053,0,0,19.0,female,"Graham, Miss. Margaret Edith",1,1,888
886,S,,13.0000,211536,0,0,27.0,male,"Montvila, Rev. Juozas",2,0,887
...,...,...,...,...,...,...,...,...,...,...,...,...
4,S,,8.0500,373450,0,0,35.0,male,"Allen, Mr. William Henry",3,0,5
3,S,C123,53.1000,113803,0,1,35.0,female,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,1,4
2,S,,7.9250,STON/O2. 3101282,0,0,26.0,female,"Heikkinen, Miss. Laina",3,1,3
1,C,C85,71.2833,PC 17599,0,1,38.0,female,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,1,2


# 컬럼에서 유일값 찾기 unique(), 유일값의 개수 unique()

In [None]:
df['Pclass'].unique()

In [None]:
df['Pclass'].nunique()

유일값의 개수를 셀 때 value counts()

In [None]:
df['Pclass'].value_counts()

In [None]:
df['Pclass'].value_counts(sort=False)