In [1]:
import pandas as pd
import numpy as np

ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [6]:
# Compare label-based (.loc) and position-based (.iloc) selection on a Series with the default integer index.
ser.loc[1] # select by row label
ser.iloc[2] # select by integer position
# With the default integer index, label n and position n refer to the same row
ser.iloc[-1] # positional indexing, so negative positions are allowed
# ser.loc[-1] # raises an error: no row is labelled -1

2.0

In [10]:
# Build a Series with string labels and take its first two elements in two ways.
ser2 = pd.Series(np.arange(3.), index=["a", "b", "c"])
ser2[:2]
ser2.iloc[:2] # same result as the line above

a    0.0
b    1.0
dtype: float64

In [12]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [15]:
# data의 one열에 모두 1을 저장
data['one'] = 1
data

Unnamed: 0,one,two,three,four
Ohio,1,1,2,3
Colorado,1,5,6,7
Utah,1,9,10,11
New York,1,13,14,15


In [18]:
data.loc[:, 'one'] = 2
data

Unnamed: 0,one,two,three,four
Ohio,2,1,2,3
Colorado,2,5,6,7
Utah,2,9,10,11
New York,2,13,14,15


In [20]:
# Select the rows whose 'four' value is greater than 5
data[data['four']>5] # boolean-mask filtering; a very common pattern worth getting used to

Unnamed: 0,one,two,three,four
Colorado,2,5,6,7
Utah,2,9,10,11
New York,2,13,14,15


In [32]:
data.loc['Utah'] = 5
data

Unnamed: 0,one,two,three,four
Ohio,2,1,2,3
Colorado,2,5,6,7
Utah,5,5,5,5
New York,2,13,14,15


In [33]:
# three 열이 5인 자료 추출
data[data['three'] == 5]

Unnamed: 0,one,two,three,four
Utah,5,5,5,5


In [43]:
# Extract the 'two' value(s) of the rows where 'three' equals 5
data['two'][data['three'] == 5]
data[data['three'] == 5]['two'] # same result; only this last expression is displayed

Utah    5
Name: two, dtype: int32

In [49]:
# Set 'two' to 10 on the rows where 'three' equals 5, using a single .loc call (row mask, column label).
data.loc[data['three'] == 5,'two']=10
data.loc[data.three == 5,'two']=10 # same effect as the line above, using attribute access for the column

In [50]:
data

Unnamed: 0,one,two,three,four
Ohio,2,1,2,3
Colorado,2,5,6,7
Utah,5,10,5,5
New York,2,13,14,15


In [47]:
data.loc[data['three'] == 5, 'two']

Utah    10
Name: two, dtype: int32

In [48]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [51]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [52]:
s1+s2 # labels are aligned first; a label missing from either Series gives NaN in the result

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [57]:
# df1 = pd.DataFrame(np.arange(9.).reshape(3,3), columns=['b','c','d']) # 컬럼명을 바꿀땐 리스트구조로 입력해야함
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), columns=list('bcd'), index=["Ohio", "Texas", "Colorado"])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [58]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),
                   index=["Utah", "Ohio", "Texas", "Oregon"])
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [60]:
df1+df2 # only rows Ohio/Texas and columns b/d exist in both frames, so only those cells are computed; every other cell is NaN
# i.e. a value whose counterpart is missing (e.g. 4 + NaN) evaluates to NaN

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [61]:
df1 = pd.DataFrame({"A": [1, 2]})
df2 = pd.DataFrame({"B": [3, 4]})
df1

Unnamed: 0,A
0,1
1,2


In [62]:
df2

Unnamed: 0,B
0,3
1,4


In [63]:
# ...결과는?
df1+df2

Unnamed: 0,A,B
0,,
1,,


In [64]:
# 스타벅스 최적 입지 선정 프로젝트
# - 인구, 스타벅스 매장 정보, 유동인구, 회사(관공서) 개수

In [66]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [67]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [68]:
# 결과는??
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [82]:
# Extract the value 6 from df2 (row label 1, column 'b')
df2.b[1]
# Alternative spellings of the same lookup:
# df2['b'][1]
# df2.loc[1, 'b']
# df2.loc[1,"b"]
# df2.iloc[1]['b']

6.0

In [84]:
# Mark the value at row label 1, column 'b' as missing (NaN).
# A single .loc call replaces the chained form `df2.b[1] = ...`, which assigns through an
# intermediate Series and can trigger SettingWithCopyWarning or leave df2 unchanged.
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [85]:
df1.add(df2, fill_value=0) # a value that is missing from only one of the two frames is treated as 0
# Use this to avoid NaN results where just one operand is missing; a cell missing from both frames still yields NaN
# The missing cells are filled first so both frames line up on the same shape, then the addition is performed

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [88]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [89]:
1/df1 # 1/0 evaluates to inf: the first value of column a is 0.0

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [91]:
# df1.div(2) # df1/2
df1.rdiv(1) # same result as 1/df1
# The r in rdiv stands for "reverse", so df1.rdiv(2) computes 2/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [103]:
df1.reindex(columns=df2.columns, fill_value=0) # reshape df1 to have df2's columns; columns new to df1 are filled with 0

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [105]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [107]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [114]:
ser = pd.Series(np.arange(3.), index=list('bde'))
ser

b    0.0
d    1.0
e    2.0
dtype: float64

In [113]:
frame - ser

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [115]:
series2 = pd.Series(np.arange(3), index=["b", "e", "f"])
series2

b    0
e    1
f    2
dtype: int32

In [116]:
frame - series2

Unnamed: 0,b,d,e,f
Utah,0.0,,1.0,
Ohio,3.0,,4.0,
Texas,6.0,,7.0,
Oregon,9.0,,10.0,


In [120]:
frame.sub(series2) # 위와 결과는 동일

Unnamed: 0,b,d,e,f
Utah,0.0,,1.0,
Ohio,3.0,,4.0,
Texas,6.0,,7.0,
Oregon,9.0,,10.0,


In [121]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)),
                     columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,-0.926514,-0.794436,-1.084018
Ohio,-1.040381,-0.291428,-1.444613
Texas,-1.548116,0.577173,-0.856581
Oregon,0.911626,0.470724,-0.092252


In [123]:
np.abs(frame) # 절댓값 구하기

Unnamed: 0,b,d,e
Utah,0.926514,0.794436,1.084018
Ohio,1.040381,0.291428,1.444613
Texas,1.548116,0.577173,0.856581
Oregon,0.911626,0.470724,0.092252


In [126]:
frame

Unnamed: 0,b,d,e
Utah,-0.926514,-0.794436,-1.084018
Ohio,-1.040381,-0.291428,-1.444613
Texas,-1.548116,0.577173,-0.856581
Oregon,0.911626,0.470724,-0.092252


In [129]:
# 열 단위로 최대값, 최소값 구해서 차를 출력
frame['b'].max()
frame['b'].min()
frame['b'].max() - frame['b'].min()

2.4597416898839084

In [142]:
def f1(x):
    """Return the spread (maximum minus minimum) of the values in ``x``.

    When passed to ``DataFrame.apply``, ``x`` is one column of the frame
    (or one row when ``axis=1`` is given).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest
pd.DataFrame(frame.apply(f1), columns=['diff']).T # apply f1 to each column of frame, then show the result as a single 'diff' row

Unnamed: 0,b,d,e
diff,2.459742,1.371609,1.352361


In [145]:
frame.apply(f1, axis=1) # apply f1 to each row; the default, axis=0, applies it to each column

Utah      0.289582
Ohio      1.153185
Texas     2.125289
Oregon    1.003878
dtype: float64

In [148]:
def f2(x):
    """Return the largest and smallest value of ``x`` as a ``(max, min)`` tuple."""
    largest, smallest = x.max(), x.min()
    return largest, smallest
frame.apply(f2, axis=0) # the returned values become the result's rows, in the order they are returned

Unnamed: 0,b,d,e
0,0.911626,0.577173,-0.092252
1,-1.548116,-0.794436,-1.444613


In [149]:
# NOTE: this redefinition shadows the tuple-returning f2 from the earlier cell.
def f2(x):
    """Return the max and min of ``x`` as a Series labelled 'max' and 'min'."""
    labels = ['max', 'min']
    extremes = [x.max(), x.min()]
    return pd.Series(extremes, index=labels)
frame.apply(f2, axis=0)

Unnamed: 0,b,d,e
max,0.911626,0.577173,-0.092252
min,-1.548116,-0.794436,-1.444613


In [None]:
print(f"{1.161038:.2f}")

In [157]:
def my_format(x):
    """Format ``x`` as a string with exactly two digits after the decimal point."""
    return format(x, ".2f")
frame.applymap(my_format) # applymap applies the function to every individual element of the DataFrame
# applymap is a DataFrame method; it cannot be used on a Series
# NOTE(review): newer pandas (2.1+) deprecates DataFrame.applymap in favour of DataFrame.map — confirm the installed version before switching

Unnamed: 0,b,d,e
Utah,-0.93,-0.79,-1.08
Ohio,-1.04,-0.29,-1.44
Texas,-1.55,0.58,-0.86
Oregon,0.91,0.47,-0.09


In [159]:
# NOTE: identical in behavior to the my_format defined in the previous cell.
def my_format(x):
    """Render ``x`` as text rounded to two decimal places."""
    template = "{:.2f}"
    return template.format(x)
frame['b'].map(my_format) # on a Series, use map to transform each element
# This works much like Python's built-in map function

Utah      -0.93
Ohio      -1.04
Texas     -1.55
Oregon     0.91
Name: b, dtype: object

In [162]:
import seaborn as sns
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [167]:
tips['tip']

0      1.01
1      1.66
2      3.50
3      3.31
4      3.61
       ... 
239    5.92
240    2.00
241    2.00
242    1.75
243    3.00
Name: tip, Length: 244, dtype: float64

In [197]:
def mm(x):
    """Return ``(max, min)`` of the values in ``x``."""
    top = x.max()
    bottom = x.min()
    return top, bottom
pd.DataFrame(tips['tip']).apply(mm)

Unnamed: 0,tip
0,10.0
1,1.0


In [191]:
def mm(x):
    """Return ``(max, min)`` for the column named 'tip'; return None for any other column."""
    # Guard clause: every column except 'tip' is skipped.
    if x.name != 'tip':
        return None
    return x.max(), x.min()
tips.apply(mm)['tip']

(10.0, 1.0)

In [198]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                     columns=["d", "a", "b", "c"])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [201]:
frame.sort_index(axis='columns', ascending=False) # 정렬기능

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [202]:
obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [205]:
obj.sort_values(ascending=False)

1    7
0    4
3    2
2   -3
dtype: int64

In [206]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2]) # nan이 포함된 데이터
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [212]:
obj.sort_values() # NaN values are placed at the end
obj.sort_values(na_position='first') # puts NaN first instead; the default is 'last'

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [213]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [215]:
# series에서 가능한 함수라고 Dataframe에서 사용가능하다고 생각하면 안됨. 
# 즉 외워야 하는 요소까지는 아님. 
# 다만 다양한 코드를 다룰 줄은 알아야 하므로 많은 연습이 필요함.
# 특히 논리적 사용법을 익혀야함

In [217]:
frame.sort_values(['a', 'b']) # sort by 'a' first, then break ties using 'b'
# With only 'a' given, rows that tie on 'a' are not ordered by 'b'.
# NOTE(review): the default sort kind is not guaranteed to be stable; pass kind='stable' if ties must keep their original order

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [218]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [224]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [221]:
obj.rank() # ascending ranks: the smallest value gets rank 1; with the default method='average', tied values share the mean of the ranks they occupy (hence 6.5, 4.5)
obj.rank(method='first') # ties are ranked in the order they appear in the data
obj.rank(ascending=False) # the largest value gets rank 1; only this last expression is displayed

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [223]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],
                      "c": [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [230]:
frame.rank() # same as frame.rank(axis=0): ranks the values within each column
frame.rank(axis=1) # same as frame.rank(axis='columns'): ranks the values within each row (the displayed result)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [231]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [233]:
obj.unique() # 해당 객체의 데이터중 유일한 데이터

array([0, 1, 2, 3, 4])

In [235]:
obj.index.is_unique # 인덱스에 a와 b가 중복되므로 False

False

In [237]:
obj['a'] # 중복되므로 출력도 중복되는 데이터를 전부 출력

a    0
a    1
dtype: int32

In [238]:
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                  index=["a", "a", "b", "b", "c"])
df

Unnamed: 0,0,1,2
a,-0.274018,-1.006773,-0.20083
a,-1.927918,0.427839,0.217648
b,0.534902,-0.055104,-1.892504
b,-0.179964,0.266583,0.35238
c,0.683776,1.234125,0.929753


In [241]:
df.loc['b'] # b만 출력하면 중복이 있으므로 중복되는 인덱스는 전부 출력

Unnamed: 0,0,1,2
b,0.534902,-0.055104,-1.892504
b,-0.179964,0.266583,0.35238


In [242]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [250]:
df.sum(axis=0) # same as df.sum(axis='index'): adds down the rows, giving one total per column
df.sum(axis=1) # same as df.sum(axis='columns'): adds across the columns, giving one total per row (the displayed result)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [253]:
df.sum(axis='index', skipna=False) # NaN values are skipped by default; with skipna=False a single NaN in a column makes that column's total NaN

one   NaN
two   NaN
dtype: float64

In [254]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [256]:
df.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [259]:
df.idxmax() # 최댓값을 가진 인덱스를 추출

one    b
two    d
dtype: object

In [264]:
df.cumsum() # 기본적으로 skipna가 True로 되어 있음

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [277]:
df.info() # prints the frame's structure: index, columns, non-null counts and dtypes
df.describe() # returns descriptive statistics for the data

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 268.0+ bytes


Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [299]:
# Load the Titanic training set from a path relative to the notebook.
df = pd.read_csv('data/datasets/titanic/train.csv')
# df = pd.read_csv('../../data/datasets/titanic/train.csv') # how to reach a file through parent directories
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [307]:
df = pd.read_csv('pew.csv')
df.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [308]:
df.describe()

Unnamed: 0,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,107.222222,154.5,186.5,183.444444,171.388889,288.055556,221.666667,177.611111,144.888889,340.055556
std,168.931784,255.172433,309.891869,291.470354,271.144446,458.442436,345.078849,275.679724,205.224952,530.523878
min,1.0,2.0,3.0,4.0,2.0,7.0,3.0,4.0,4.0,8.0
25%,12.25,14.75,17.0,15.75,15.0,34.25,25.25,22.5,23.75,41.25
50%,20.0,27.0,33.5,40.0,34.0,66.5,65.5,48.5,53.5,74.5
75%,170.0,193.0,192.0,198.75,166.75,201.5,128.75,103.5,134.25,294.75
max,575.0,869.0,1064.0,982.0,881.0,1486.0,949.0,792.0,634.0,1529.0


In [309]:
df = pd.read_csv('tesla_stock_quandl.csv')
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,ExDividend,SplitRatio,AdjOpen,AdjHigh,AdjLow,AdjClose,AdjVolume
0,2018-03-27,304.00,304.2700,277.18,279.18,13696168.0,0.0,1.0,304.00,304.2700,277.18,279.18,13696168.0
1,2018-03-26,307.34,307.5900,291.36,304.18,8324639.0,0.0,1.0,307.34,307.5900,291.36,304.18,8324639.0
2,2018-03-23,311.25,311.6100,300.45,301.54,6600538.0,0.0,1.0,311.25,311.6100,300.45,301.54,6600538.0
3,2018-03-22,313.89,318.8200,308.18,309.10,4914307.0,0.0,1.0,313.89,318.8200,308.18,309.10,4914307.0
4,2018-03-21,310.25,322.4400,310.19,316.53,5927881.0,0.0,1.0,310.25,322.4400,310.19,316.53,5927881.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944,2010-07-06,20.00,20.0000,15.83,16.11,6866900.0,0.0,1.0,20.00,20.0000,15.83,16.11,6866900.0
1945,2010-07-02,23.00,23.1000,18.71,19.20,5139800.0,0.0,1.0,23.00,23.1000,18.71,19.20,5139800.0
1946,2010-07-01,25.00,25.9200,20.27,21.96,8218800.0,0.0,1.0,25.00,25.9200,20.27,21.96,8218800.0
1947,2010-06-30,25.79,30.4192,23.30,23.83,17187100.0,0.0,1.0,25.79,30.4192,23.30,23.83,17187100.0


In [318]:
df = pd.read_csv('gapminder.tsv', sep='\t',) # without sep, reading this file fails with a parsing error
# Passing sep (the separator) tells read_csv which character splits the fields
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [325]:
# Quick inspection of the loaded frame.
df.head()
df.tail()
df.info() # string columns are reported with Dtype object
# df.columns
# df.dtypes # column names and dtypes can both be read from info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [328]:
# Extract only the rows for Zimbabwe
df[df.country == 'Zimbabwe']
# Observation: lifeExp (life expectancy) rises and then falls, while the population keeps growing and gdpPercap fluctuates

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1692,Zimbabwe,Africa,1952,48.451,3080907,406.884115
1693,Zimbabwe,Africa,1957,50.469,3646340,518.764268
1694,Zimbabwe,Africa,1962,52.358,4277736,527.272182
1695,Zimbabwe,Africa,1967,53.995,4995432,569.795071
1696,Zimbabwe,Africa,1972,55.635,5861135,799.362176
1697,Zimbabwe,Africa,1977,57.674,6642107,685.587682
1698,Zimbabwe,Africa,1982,60.363,7636524,788.855041
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996


In [332]:
df[(df.continent == 'Asia') & (df.country == 'Japan')]


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
792,Japan,Asia,1952,63.03,86459025,3216.956347
793,Japan,Asia,1957,65.5,91563009,4317.694365
794,Japan,Asia,1962,68.73,95831757,6576.649461
795,Japan,Asia,1967,71.43,100825279,9847.788607
796,Japan,Asia,1972,73.42,107188273,14778.78636
797,Japan,Asia,1977,75.38,113872473,16610.37701
798,Japan,Asia,1982,77.11,118454974,19384.10571
799,Japan,Asia,1987,78.67,122091325,22375.94189
800,Japan,Asia,1992,79.36,124329269,26824.89511
801,Japan,Asia,1997,80.69,125956499,28816.58499


In [333]:
# 연습문제

In [338]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [385]:
# 1. 기대수명 데이터셋으로 아래의 작업을 수행하시오.
import pandas as pd
import numpy as np
df = pd.read_csv('gapminder.tsv', sep='\t')

# 1) Extract a row with the iloc accessor (by integer position)
df.iloc[1]
# 2) Extract rows with the loc accessor (by row label)
df.loc[2]
df.loc[2:4]
# 3) Extract data column by column
df.country
df['year']
# 4) Extract the 'year' and 'pop' columns
df[['year', 'pop']]
df.loc[:, ['year', 'pop']]
# 5) Extract the country, continent and year columns
df[['country', 'continent', 'year']]
df.loc[:, ['country', 'continent', 'year']]
# 6) Extract the country, year and pop columns
df[['country', 'year', 'pop']]
# 7) Use loc and iloc freely to reproduce the outputs below
# Expected output)
#          country  lifeExp    gdpPercap
# 0    Afghanistan   28.801   779.445314
# 99    Bangladesh   43.453   721.186086
# 999     Mongolia   51.253  1226.041130

df.loc[[0,99,999], ['country', 'lifeExp', 'gdpPercap']]

# Expected output)
#         country  lifeExp    gdpPercap
# 10  Afghanistan   42.129   726.734055
# 11  Afghanistan   43.828   974.580338
# 12      Albania   55.230  1601.056136
# 13      Albania   59.280  1942.284244

df.iloc[10:14, [0,3,-1]]

Unnamed: 0,country,lifeExp,gdpPercap
10,Afghanistan,42.129,726.734055
11,Afghanistan,43.828,974.580338
12,Albania,55.23,1601.056136
13,Albania,59.28,1942.284244


In [521]:
# 2. Build a DataFrame of at least 5 x 5 with labels on both rows and columns.
# - Select specific rows and columns in at least 10 different ways.

# Scenario: one student's report card for a single semester
# Weights => attendance: 10, midterm and final exam: 35 each, performance assessment: 15, bonus points: 5
man = pd.DataFrame({'국어': [5, 8, 51, 70, 5],
                    '수학': [8, 7,  8, 38, 1],
                    '사회': [9, 1, 45, 59, 1],
                    '과학': [6, 13, 76, 45, 0],
                    '영어': [5, 2, 65, 52, 0],
                    '정보': [10, 15, '수', '우', 3],
              }, index=['출석','수행평가','중간고사','기말고사','가산점'])
man

Unnamed: 0,국어,수학,사회,과학,영어,정보
출석,5,8,9,6,5,10
수행평가,8,7,1,13,2,15
중간고사,51,8,45,76,65,수
기말고사,70,38,59,45,52,우
가산점,5,1,1,0,0,3


In [546]:
# 1.  각 과목 합계
def total(x):
    """Return the sum of an integer column, or None for any other column.

    Intended for ``DataFrame.apply``, which passes each column in as ``x``.

    Parameters
    ----------
    x : pandas.Series
        One column of the frame.

    Returns
    -------
    int or None
        The sum of the values when ``x`` has an integer dtype, otherwise None.
    """
    # Accept every integer width: an exact comparison against 'int64' silently
    # skips int32 columns, which appear elsewhere in this notebook's outputs.
    if not pd.api.types.is_integer_dtype(x):
        return None
    return sum(x)
man.apply(total)

# 2. 비율에 맞춰서 보는 합계
def total2(x):
    """Return the weighted total of an integer column, or None for any other column.

    The midterm ('중간고사') and final ('기말고사') scores are each scaled by 0.35;
    attendance ('출석'), performance assessment ('수행평가') and bonus points
    ('가산점') are added as they are.

    Parameters
    ----------
    x : pandas.Series
        One column of the frame, indexed by the five row labels above.

    Returns
    -------
    float or None
        The weighted total when ``x`` has an integer dtype, otherwise None.
    """
    # Accept every integer width: an exact comparison against 'int64' silently
    # skips int32 columns, which appear elsewhere in this notebook's outputs.
    if not pd.api.types.is_integer_dtype(x):
        return None
    exam_weight = 0.35
    return sum([x['출석'], x['수행평가'], x['중간고사'] * exam_weight, x['기말고사'] * exam_weight, x['가산점']])
# Apply the weighted version defined for this step; the original line called
# total again, so the weighted total was never computed.
man.apply(total2)

# 3. Select the midterm and final exam rows
man.loc[['중간고사','기말고사']]

# 4. Select the rows where the 사회 (social studies) value is above 50
man[man.사회 > 50]

# 5. Select the rows where the 정보 (informatics) value is the grade '수'
man[man.정보 == '수']

# 6. Show the 정보 (informatics) column
man.정보

# 7. 수: 35, 우: 30, 미: 25, 양: 20, 가: 10 로 변환
# Grade letter -> points on the 35-point exam scale.
scores = dict(zip(list('수우미양가'), [35,30,25,20,10]))
def ko_to_num(x):
    """Convert a Korean grade letter to a 0-100 score; return other values unchanged.

    A grade letter is looked up in ``scores`` (points out of 35) and rescaled
    by ``100 / 35``.

    Parameters
    ----------
    x : object
        One cell value: a number, or one of the grade letters in ``scores``.

    Returns
    -------
    object
        The rescaled score for a grade letter, otherwise ``x`` itself.

    Raises
    ------
    KeyError
        If ``x`` is a string that is not a known grade letter.
    """
    # Only strings are grades. The former `type(x) != int` test also sent floats
    # and NumPy integers to the lookup, which failed with an opaque TypeError.
    if isinstance(x, str):
        return scores[x] * (100 / 35)
    return x
man2 = man.applymap(ko_to_num)
man2

# 8. 위의 데이터를 이용해서 각 과목의 합계를 다시 구하라
def total3(x):
    """Return the weighted total of one subject column.

    The midterm ('중간고사') and final ('기말고사') scores are each scaled by 0.35;
    attendance ('출석'), performance assessment ('수행평가') and bonus points
    ('가산점') are added as they are.
    """
    exam_weight = 0.35
    parts = (
        x['출석'],
        x['수행평가'],
        x['중간고사'] * exam_weight,
        x['기말고사'] * exam_weight,
        x['가산점'],
    )
    return sum(parts)
total_score = man2.apply(total3)
total_score
# 9. Add the per-subject totals as a new row labelled '전체점수'
man2.loc['전체점수']= total_score
man2

# 10. Rank the subjects by their total score
man2.loc['전체점수'].rank()

국어    4.0
수학    1.0
사회    2.0
과학    5.0
영어    3.0
정보    6.0
Name: 전체점수, dtype: float64