In [2]:
import numpy as np
import pandas as pd

np.random.seed(10)
# 1부터 5까지 범위에서 균일 간격으로 나뉜 4개 숫자를 가진 NumPy 배열 생성
df = pd.DataFrame({'A': np.linspace(1, 5, 4)}) 
# 표준 정규 분포(평균 0, 표준 편차 1)를 따르는 3x3 난수 행렬 생성
# 생성된 난수 행렬에 **'B', 'C', 'D'**라는 열 이름을 붙여 새로운 데이터프레임 생성
# 기존 df에 생성된 데이터프레임을 열 방향으로 결합
df = pd.concat([df, pd.DataFrame(np.random.randn(3, 3), columns=list('BCD'))], axis = 1)
df.iloc[1, 2] = np.nan # [1, 2] 위치를 nan으로 설정
df.style # 보기좋게 스타일을 넣어 출력

Unnamed: 0,A,B,C,D
0,1.0,1.331587,0.715279,-1.5454
1,2.333333,-0.008384,,-0.720086
2,3.666667,0.265512,0.108549,0.004291
3,5.0,,,


In [3]:
# Apply the same CSS properties to every cell of the styled frame.
df.style.set_properties(**{
    'background-color': 'black', 
    'color': 'lawngreen', 
    'border-color': 'black'
})

Unnamed: 0,A,B,C,D
0,1.0,1.331587,0.715279,-1.5454
1,2.333333,-0.008384,,-0.720086
2,3.666667,0.265512,0.108549,0.004291
3,5.0,,,


In [4]:
# Build a Series from a plain list; no index is given, so pandas supplies 0..3.
ysobject = pd.Series(data=[4, 7, -5, 3])
ysobject

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# Show the three basic parts of the Series, one per line: raw values, index, dtype.
print(ysobject.values, ysobject.index, ysobject.dtypes, sep='\n')

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
int64


In [6]:
# Same values as before, but labelled with explicit string labels
# instead of the default integer index.
ysobject2 = pd.Series([4, 7, -5, 3], index=list('dbac'))
ysobject2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
# A dict can be used directly as Series data: keys become the index labels
# and values become the data.
ysSeriesData = dict(lew=57000, lee=32000, Choi=49000)
ysobject3 = pd.Series(data=ysSeriesData)
ysobject3

lew     57000
lee     32000
Choi    49000
dtype: int64

In [8]:
ysobject3.name = 'Salary'  # name of the Series itself (shown as "Name:" in the output)
ysobject3.index.name = 'first names' # name for the index (shown above the labels)
ysobject3

first names
lew     57000
lee     32000
Choi    49000
Name: Salary, dtype: int64

In [9]:
# Column-oriented input: every key is a column name and every list holds that
# column's values, one entry per row.
ysdata = dict(
    name=['lew', 'lee', 'Park'],
    year=[2012, 2015, 2018],
    trial=[6, 2, 7],
    points=[3.6, 4.4, 3.9],
)

# Columns follow the dict's key order; rows get the default 0..2 index.
df = pd.DataFrame(data=ysdata)
df

Unnamed: 0,name,year,trial,points
0,lew,2012,6,3.6
1,lee,2015,2,4.4
2,Park,2018,7,3.9


In [10]:
# Show the row index, the column index and the underlying value array, one after another.
print(df.index, df.columns, df.values, sep='\n')

RangeIndex(start=0, stop=3, step=1)
Index(['name', 'year', 'trial', 'points'], dtype='object')
[['lew' 2012 6 3.6]
 ['lee' 2015 2 4.4]
 ['Park' 2018 7 3.9]]


In [11]:
# Give a name to the row index and to the column index; both appear in the rendered table.
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,name,year,trial,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,lew,2012,6,3.6
1,lee,2015,2,4.4
2,Park,2018,7,3.9


In [12]:
# Build a second frame from the same dict, choosing the column order explicitly
# and replacing the default integer index with string labels.
df2 = pd.DataFrame(
    ysdata, 
    columns=['year', 'name', 'points', 'trial'], 
    index=['one', 'two', 'three'])

df2

Unnamed: 0,year,name,points,trial
one,2012,lew,3.6,6
two,2015,lee,4.4,2
three,2018,Park,3.9,7


In [13]:
# Summary statistics; the result covers the numeric columns only ('name' is left out).
df2.describe()

Unnamed: 0,year,points,trial
count,3.0,3.0,3.0
mean,2015.0,3.966667,5.0
std,3.0,0.404145,2.645751
min,2012.0,3.6,2.0
25%,2013.5,3.75,4.0
50%,2015.0,3.9,6.0
75%,2016.5,4.15,6.5
max,2018.0,4.4,7.0


In [14]:
# NOTE: plain assignment does not copy. dftest and df are two names for the
# same DataFrame, so changes made through dftest are also visible through df.
dftest = df
print(dftest['year'])  # column access by key
print(dftest.year)  # the same column via attribute access
dftest[['year', 'points']]  # a list of names selects several columns as a DataFrame

Num
0    2012
1    2015
2    2018
Name: year, dtype: int64
Num
0    2012
1    2015
2    2018
Name: year, dtype: int64


Info,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2012,3.6
1,2015,4.4
2,2018,3.9


In [15]:
dftest['points'] = 0.5  # a scalar is written to every row of the column
dftest

Info,name,year,trial,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,lew,2012,6,0.5
1,lee,2015,2,0.5
2,Park,2018,7,0.5


In [16]:
dftest['trial'] = [900, 700, 400]  # a list replaces the column row by row
dftest

Info,name,year,trial,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,lew,2012,900,0.5
1,lee,2015,700,0.5
2,Park,2018,400,0.5


In [17]:
dftest['zeros'] = np.arange(3) # add a new column (it holds 0, 1, 2 despite its name)
dftest

Info,name,year,trial,points,zeros
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,lew,2012,900,0.5,0
1,lee,2015,700,0.5,1
2,Park,2018,400,0.5,2


In [18]:
# The Series is matched to the frame by index label, not by position:
# label 0 receives 1300, label 1 receives 900 and label 2 receives 200.
val = pd.Series([200, 1300, 900], index=[2, 0, 1])
dftest['debt'] = val # add a new column from a Series
dftest

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,lew,2012,900,0.5,0,1300
1,lee,2015,700,0.5,1,900
2,Park,2018,400,0.5,2,200


In [19]:
# Derive new columns from existing ones: an element-wise difference and a boolean flag.
dftest['H_points'] = dftest['points'] - dftest['trial']
dftest['L_points'] = dftest['H_points'] < -500
dftest

Info,name,year,trial,points,zeros,debt,H_points,L_points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,lew,2012,900,0.5,0,1300,-899.5,True
1,lee,2015,700,0.5,1,900,-699.5,True
2,Park,2018,400,0.5,2,200,-399.5,False


In [20]:
# Remove the two derived columns again.
del dftest['H_points']
del dftest['L_points']
dftest

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,lew,2012,900,0.5,0,1300
1,lee,2015,700,0.5,1,900
2,Park,2018,400,0.5,2,200


In [21]:
dftest[1:2]  # a slice inside [] selects rows; here only the row at position 1

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,lee,2015,700,0.5,1,900


In [22]:
dftest.loc[1] # fetch the row whose index label is 1

Info
name       lee
year      2015
trial      700
points     0.5
zeros        1
debt       900
Name: 1, dtype: object

In [23]:
# Assigning to an index label that does not exist yet appends a new row (label 7).
# NOTE(review): the values are assigned positionally to
# (name, year, trial, points, zeros, debt), so 4.0 ends up in 'year' and 2013 in
# 'points' -- this looks swapped; confirm the intended order.
dftest.loc[7,:] = ['Jun', 4.0, 0.1, 2013, 2.3, 100]
print(dftest)
print(dftest.loc[2:7])  # label slices with .loc include both end labels
print(dftest.loc[1:2, 'trial'])  # labels 1 through 2 of a single column
print(dftest.loc[:, 'year'])  # every row of a single column

Info  name    year  trial  points  zeros    debt
Num                                             
0      lew  2012.0  900.0     0.5    0.0  1300.0
1      lee  2015.0  700.0     0.5    1.0   900.0
2     Park  2018.0  400.0     0.5    2.0   200.0
7      Jun     4.0    0.1  2013.0    2.3   100.0
Info  name    year  trial  points  zeros   debt
Num                                            
2     Park  2018.0  400.0     0.5    2.0  200.0
7      Jun     4.0    0.1  2013.0    2.3  100.0
Num
1    700.0
2    400.0
Name: trial, dtype: float64
Num
0    2012.0
1    2015.0
2    2018.0
7       4.0
Name: year, dtype: float64


In [24]:
dftest.iloc[3]  # row at position 3, which is the row labelled 7

Info
name         Jun
year         4.0
trial        0.1
points    2013.0
zeros        2.3
debt       100.0
Name: 7, dtype: object

In [25]:
# Position-based selection with .iloc; the results are printed with one blank
# line between them. The row slice 3:5 runs past the last row and simply
# returns the rows that exist.
print(
    dftest,
    dftest.iloc[1, 1],        # single cell: second row, second column
    dftest.iloc[3:5, 0:2],    # rows from position 3, first two columns
    dftest.iloc[3:5, 3:6],    # rows from position 3, columns at positions 3..5
    sep='\n\n',
)

Info  name    year  trial  points  zeros    debt
Num                                             
0      lew  2012.0  900.0     0.5    0.0  1300.0
1      lee  2015.0  700.0     0.5    1.0   900.0
2     Park  2018.0  400.0     0.5    2.0   200.0
7      Jun     4.0    0.1  2013.0    2.3   100.0

2015.0

Info name  year
Num            
7     Jun   4.0

Info  points  zeros   debt
Num                       
7     2013.0    2.3  100.0


In [26]:
print(dftest)
print()
print(dftest['year'] > 2014)  # boolean mask with one entry per row
print()
# Build the mask from the frame that is being filtered. The previous version
# used df['year'], which only worked because dftest and df were the same
# object; with a separate frame the mask's index would not match and .loc
# would fail to align it.
print(dftest.loc[dftest['year'] > 2014, :])

Info  name    year  trial  points  zeros    debt
Num                                             
0      lew  2012.0  900.0     0.5    0.0  1300.0
1      lee  2015.0  700.0     0.5    1.0   900.0
2     Park  2018.0  400.0     0.5    2.0   200.0
7      Jun     4.0    0.1  2013.0    2.3   100.0

Num
0    False
1     True
2     True
7    False
Name: year, dtype: bool

Info  name    year  trial  points  zeros   debt
Num                                            
1      lee  2015.0  700.0     0.5    1.0  900.0
2     Park  2018.0  400.0     0.5    2.0  200.0


In [27]:
# dftest still refers to the same object as df here, so these two assignments
# also put NaN into df.
dftest.iloc[0, 2] = np.nan
dftest.iloc[2, 2] = np.nan
print(dftest)
dftest = dftest.dropna(how='any') # drop every row containing at least one NaN and rebind dftest to the result
print()
dftest
# dftest.dropna(how='all') # drop a row only when all of its values are NaN

Info  name    year  trial  points  zeros    debt
Num                                             
0      lew  2012.0    NaN     0.5    0.0  1300.0
1      lee  2015.0  700.0     0.5    1.0   900.0
2     Park  2018.0    NaN     0.5    2.0   200.0
7      Jun     4.0    0.1  2013.0    2.3   100.0



Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,lee,2015.0,700.0,0.5,1.0,900.0
7,Jun,4.0,0.1,2013.0,2.3,100.0


In [28]:
# Rebind dftest to df (no copy is made) and set two cells of the column at
# position 2 ('trial') to NaN.
dftest = df
dftest.iloc[0, 2] = np.nan
dftest.iloc[2, 2] = np.nan
dftest

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,lew,2012.0,,0.5,0.0,1300.0
1,lee,2015.0,700.0,0.5,1.0,900.0
2,Park,2018.0,,0.5,2.0,200.0
7,Jun,4.0,0.1,2013.0,2.3,100.0


In [29]:
dftest = dftest.fillna(value=0.77777) # replace NaN with the given value and rebind dftest to the result
dftest

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,lew,2012.0,0.77777,0.5,0.0,1300.0
1,lee,2015.0,700.0,0.5,1.0,900.0
2,Park,2018.0,0.77777,0.5,2.0,200.0
7,Jun,4.0,0.1,2013.0,2.3,100.0


In [30]:
dftest = df  # same object as df, not a copy
dftest.iloc[0, 2] = np.nan
dftest.iloc[2, 2] = np.nan
dftest.isnull() # True wherever a value is NaN

Info,name,year,trial,points,zeros,debt
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,False,False,True,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
7,False,False,False,False,False,False


In [31]:
# Row-oriented input this time: each inner list is one row of (one, two),
# with NaN marking the missing entries.
ysdata = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
dftest = pd.DataFrame(data=ysdata, index=list("abcd"), columns=["one", "two"])
dftest

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [32]:
dftest.sum(axis=0)  # one sum per column; NaN entries are skipped

one    9.25
two   -5.80
dtype: float64

In [33]:
dftest.sum(axis=1)  # one sum per row; the all-NaN row 'c' sums to 0

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [34]:
dftest['one'].sum()  # sum of a single column, returned as a scalar

np.float64(9.25)

In [35]:
# 3x4 frame of standard-normal draws with columns A-D, indexed by three
# consecutive days starting on 2016-07-01.
dftest2 = pd.DataFrame(
    data=np.random.randn(3, 4),
    index=pd.date_range(start="20160701", periods=3),
    columns=list("ABCD"),
)
dftest2

Unnamed: 0,A,B,C,D
2016-07-01,-0.1746,0.433026,1.203037,-0.965066
2016-07-02,1.028274,0.22863,0.445138,-1.136602
2016-07-03,0.135137,1.484537,-1.079805,-1.977728


In [36]:
dftest2.sort_index(axis=0, ascending=False) # sort the rows by index label, descending

Unnamed: 0,A,B,C,D
2016-07-03,0.135137,1.484537,-1.079805,-1.977728
2016-07-02,1.028274,0.22863,0.445138,-1.136602
2016-07-01,-0.1746,0.433026,1.203037,-0.965066


In [37]:
dftest2.sort_index(axis=1, ascending=False) # sort the columns by column label, descending

Unnamed: 0,D,C,B,A
2016-07-01,-0.965066,1.203037,0.433026,-0.1746
2016-07-02,-1.136602,0.445138,0.22863,1.028274
2016-07-03,-1.977728,-1.079805,1.484537,0.135137
