In [1]:
import pandas as pd
import numpy as np


In [4]:
# Build a month-end DatetimeIndex of 8 dates starting from January 2020.
# The frequency is passed as a MonthEnd offset object instead of the 'M'
# string alias: the alias is deprecated in recent pandas releases (renamed
# to 'ME'), whereas the offset object is accepted by old and new versions.
t_idx = pd.date_range('2020-01-01', periods=8, freq=pd.offsets.MonthEnd())
t_idx

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31'],
              dtype='datetime64[ns]', freq='M')

In [5]:
# Build a series of 8 standard-normal draws indexed by the month-end dates.
# A seeded generator is used so that re-running the notebook reproduces the
# same values; the seed value itself is arbitrary.
rng = np.random.default_rng(42)
s = pd.Series(rng.standard_normal(8), index=t_idx)
s

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [6]:
# Build a two-column frame on the same month-end index:
# 'One' counts up from 1 to 8 and 'Two' counts down from 8 to 1.
df = pd.DataFrame(
    {
        'One': list(range(1, 9)),
        'Two': list(range(8, 0, -1)),
    },
    index=t_idx,
)
df

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4
2020-06-30,6,3
2020-07-31,7,2
2020-08-31,8,1


In [7]:
# Preview the start of the series (head() with no argument returns the first 5 rows)
s.head()

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
Freq: M, dtype: float64

In [8]:
# Show the last 6 rows of the series
s.tail(6)

2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [10]:
# Shape of the series (a one-element tuple holding its length)
s.shape

(8,)

In [11]:
# Shape of the dataframe as (rows, columns)
df.shape

(8, 2)

In [12]:
# Shape of the index (a one-element tuple holding its length)
t_idx.shape

(8,)

In [15]:
# Show the first three rows (positions 0, 1 and 2; the slice end 3 is exclusive)
df[:3]

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6


In [16]:
# Show the rows at positions 2, 3 and 4 (the slice end 5 is exclusive)
df[2:5]

Unnamed: 0,One,Two
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4


In [17]:
# Values held by the series, exposed as a pandas array via the .array attribute
s.array

<PandasArray>
[ -0.9478040550309271,   0.6210189401447063,   0.5268690969841191,
   1.2081027538889946, -0.11250048407157869,  -0.7390759558578995,
   1.4198001438584995,   -2.781922728141299]
Length: 8, dtype: float64

In [18]:
# Values of the series' (datetime) index, exposed as a pandas array
s.index.array

<DatetimeArray>
['2020-01-31 00:00:00', '2020-02-29 00:00:00', '2020-03-31 00:00:00',
 '2020-04-30 00:00:00', '2020-05-31 00:00:00', '2020-06-30 00:00:00',
 '2020-07-31 00:00:00', '2020-08-31 00:00:00']
Length: 8, dtype: datetime64[ns]

In [20]:
# Convert the series values to a NumPy ndarray with the to_numpy method
s.to_numpy()

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [21]:
# Same conversion using the np.asarray function
np.asarray(s)

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [24]:
# Build a timezone-aware datetime series: two consecutive dates
# starting at the beginning of 2020, localized to CET.
t_s = pd.Series(
    pd.date_range(start='2020', periods=2, tz='CET')
)

In [25]:
# Display the timezone-aware series
t_s

0   2020-01-01 00:00:00+01:00
1   2020-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [28]:
# Convert with dtype=object: each element stays a tz-aware Timestamp object
t_s.to_numpy(dtype=object)

array([Timestamp('2020-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2020-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [29]:
# Convert with dtype='datetime64[ns]': as the output shows, the timezone is
# dropped and the values are expressed in UTC (midnight CET -> 23:00 the day before)
t_s.to_numpy(dtype='datetime64[ns]')

array(['2019-12-31T23:00:00.000000000', '2020-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
# Convert the whole frame to a 2-D NumPy array (one row per index entry, one column per frame column)
df.to_numpy()

array([[1, 8],
       [2, 7],
       [3, 6],
       [4, 5],
       [5, 4],
       [6, 3],
       [7, 2],
       [8, 1]])

In [34]:
# Build a frame whose column 'A' mixes integers, a missing value (NaN, a float)
# and a string, next to an all-integer column 'B'.
df2 = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 'a'],
        'B': list(range(3, 7)),
    }
)

In [35]:
# Display the mixed-type frame
df2

Unnamed: 0,A,B
0,1,3
1,2,4
2,,5
3,a,6


In [36]:
# The columns share no common numeric type, so the result is an object-dtype array (see output)
df2.to_numpy()

array([[1, 3],
       [2, 4],
       [nan, 5],
       ['a', 6]], dtype=object)

In [37]:
# Build a frame made of integers plus one missing value (NaN, which is a float)
# in column 'A', next to an all-integer column 'B'.
df3 = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 3],
        'B': list(range(4, 8)),
    }
)

In [38]:
# Display the frame; column 'A' is shown as floats because of the NaN
df3

Unnamed: 0,A,B
0,1.0,4
1,2.0,5
2,,6
3,3.0,7


In [44]:
# Both columns are numeric, so the result is a single float array with NaN kept (see output)
df3.to_numpy()

array([[ 1.,  4.],
       [ 2.,  5.],
       [nan,  6.],
       [ 3.,  7.]])

In [4]:
# Load the Premier League match data from a CSV file at a path relative to the notebook.
# NOTE(review): this rebinds `df`, which earlier cells bound to the small demo frame.
df = pd.read_csv('../Pandas/premier_league.csv')

In [5]:
# Display the loaded frame (the rendered output elides the middle rows)
df

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,TottenhamHotspur,ManchesterCity,0,0,D,2010-2011
1,AstonVilla,WestHamUnited,3,0,H,2010-2011
2,BlackburnRovers,Everton,1,0,H,2010-2011
3,BoltonWanderers,Fulham,0,0,D,2010-2011
4,Sunderland,BirminghamCity,2,2,D,2010-2011
...,...,...,...,...,...,...
3663,Liverpool,Southampton,4,0,H,
3664,NewcastleUnited,NorwichCity,0,0,D,
3665,Watford,Everton,2,3,A,
3666,WestHamUnited,BrightonandHoveAlbion,3,3,D,


In [11]:
import bottleneck as bn
import time 

In [12]:
# Time a single bottleneck nanmean call over the 'home_goals' column.
# perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time, which can be coarse or adjusted mid-run.
# A single-shot measurement is still noisy, so treat the number as indicative only.
start = time.perf_counter()  # start of the timed region
print(bn.nanmean(df['home_goals']), time.perf_counter() - start)

1.5517993456924755 0.0002181529998779297


In [15]:
# Time a single NumPy nanmean call over the 'home_goals' column.
# perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time, which can be coarse or adjusted mid-run.
# A single-shot measurement is still noisy, so treat the number as indicative only.
start = time.perf_counter()  # start of the timed region
print(np.nanmean(df['home_goals']), time.perf_counter() - start)

1.5517993456924755 0.00026988983154296875
