In [1]:
import pandas as pd
import numpy as np


In [4]:
# Build the time-series index: eight consecutive month-end dates starting in January 2020.
# A MonthEnd offset object is used instead of the 'M' string alias, which is
# deprecated in recent pandas releases (renamed to 'ME'); the offset object
# produces the same dates on both old and new versions.
t_idx = pd.date_range('2020-01-01', periods=8, freq=pd.offsets.MonthEnd())
t_idx

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31'],
              dtype='datetime64[ns]', freq='M')

In [5]:
# Create a Series of eight standard-normal draws labelled by the month-end index.
# The generator is not seeded, so the values differ on every run.
s = pd.Series(data=np.random.randn(8), index=t_idx)
s

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [6]:
# Create a DataFrame on the same index: 'One' counts up 1..8, 'Two' counts down 8..1.
df = pd.DataFrame(
    {
        'One': list(range(1, 9)),
        'Two': list(range(8, 0, -1)),
    },
    index=t_idx,
)
df

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4
2020-06-30,6,3
2020-07-31,7,2
2020-08-31,8,1


In [7]:
s.head()

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
Freq: M, dtype: float64

In [8]:
s.tail(6)

2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [10]:
# Shape of the Series: a 1-tuple holding its length.
s.shape

(8,)

In [11]:
# Shape of the DataFrame: (number of rows, number of columns).
df.shape

(8, 2)

In [12]:
# Shape of the index: a 1-tuple holding its length.
t_idx.shape

(8,)

In [15]:
# Show the first three rows (positions 0-2); the slice stop, 3, is excluded.
df[:3]

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6


In [16]:
# Show the rows at positions 2-4; the slice stop, 5, is excluded.
df[2:5]

Unnamed: 0,One,Two
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4


In [17]:
# Values held inside the Series, exposed as a pandas array object.
s.array

<PandasArray>
[ -0.9478040550309271,   0.6210189401447063,   0.5268690969841191,
   1.2081027538889946, -0.11250048407157869,  -0.7390759558578995,
   1.4198001438584995,   -2.781922728141299]
Length: 8, dtype: float64

In [18]:
# Values of the Series' (datetime) index, exposed as a pandas array object.
s.index.array

<DatetimeArray>
['2020-01-31 00:00:00', '2020-02-29 00:00:00', '2020-03-31 00:00:00',
 '2020-04-30 00:00:00', '2020-05-31 00:00:00', '2020-06-30 00:00:00',
 '2020-07-31 00:00:00', '2020-08-31 00:00:00']
Length: 8, dtype: datetime64[ns]

In [20]:
# Convert the Series values to a NumPy array with the to_numpy method.
s.to_numpy()

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [21]:
# Convert the Series values to a NumPy array with the np.asarray function.
np.asarray(s)

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [24]:
# Create a timezone-aware datetime Series: two consecutive days from 2020-01-01 in the CET zone.
t_s = pd.Series(
    pd.date_range(start='2020', periods=2, tz='CET')
)

In [25]:
t_s

0   2020-01-01 00:00:00+01:00
1   2020-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [28]:
# Convert with dtype=object: each element stays a timezone-aware Timestamp.
t_s.to_numpy(dtype=object)

array([Timestamp('2020-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2020-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [29]:
# Convert with dtype datetime64[ns]: the result carries no timezone, and the
# displayed values are one hour earlier than the CET wall-clock times
# (2020-01-01 00:00+01:00 is shown as 2019-12-31T23:00).
t_s.to_numpy(dtype='datetime64[ns]')

array(['2019-12-31T23:00:00.000000000', '2020-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
df.to_numpy()

array([[1, 8],
       [2, 7],
       [3, 6],
       [4, 5],
       [5, 4],
       [6, 3],
       [7, 2],
       [8, 1]])

In [34]:
# Create a DataFrame whose column 'A' mixes integers, a missing value (float NaN) and a string.
df2 = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 'a'],
        'B': [3, 4, 5, 6],
    }
)

In [35]:
df2

Unnamed: 0,A,B
0,1,3
1,2,4
2,,5
3,a,6


In [36]:
df2.to_numpy()

array([[1, 3],
       [2, 4],
       [nan, 5],
       ['a', 6]], dtype=object)

In [37]:
# Create a DataFrame of integers plus one missing value; the NaN makes column 'A' floating point.
df3 = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 3],
        'B': [4, 5, 6, 7],
    }
)

In [38]:
df3

Unnamed: 0,A,B
0,1.0,4
1,2.0,5
2,,6
3,3.0,7


In [44]:
df3.to_numpy()

array([[ 1.,  4.],
       [ 2.,  5.],
       [nan,  6.],
       [ 3.,  7.]])

In [4]:
# Load the Premier League match results from a CSV file; the path is relative
# to the working directory.
# NOTE(review): this rebinds `df`, which earlier in the notebook names the
# small time-series frame; the cells that follow operate on the match data.
df = pd.read_csv('../Pandas/premier_league.csv')

In [5]:
df

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,TottenhamHotspur,ManchesterCity,0,0,D,2010-2011
1,AstonVilla,WestHamUnited,3,0,H,2010-2011
2,BlackburnRovers,Everton,1,0,H,2010-2011
3,BoltonWanderers,Fulham,0,0,D,2010-2011
4,Sunderland,BirminghamCity,2,2,D,2010-2011
...,...,...,...,...,...,...
3663,Liverpool,Southampton,4,0,H,
3664,NewcastleUnited,NorwichCity,0,0,D,
3665,Watford,Everton,2,3,A,
3666,WestHamUnited,BrightonandHoveAlbion,3,3,D,


In [11]:
import bottleneck as bn
import time 

In [12]:
# Time bottleneck's NaN-aware mean of the home_goals column.
# perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time() reads the wall clock, which can be adjusted mid-run
# and may be too coarse for sub-millisecond measurements.
start = time.perf_counter()  # start of the measured interval
# print's arguments are evaluated left to right, so the elapsed time covers the nanmean call.
print(bn.nanmean(df['home_goals']), time.perf_counter() - start)

1.5517993456924755 0.0002181529998779297


In [15]:
# Time NumPy's NaN-aware mean of the home_goals column.
# perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time() reads the wall clock, which can be adjusted mid-run
# and may be too coarse for sub-millisecond measurements.
start = time.perf_counter()  # start of the measured interval
# print's arguments are evaluated left to right, so the elapsed time covers the nanmean call.
print(np.nanmean(df['home_goals']), time.perf_counter() - start)

1.5517993456924755 0.00026988983154296875


In [2]:
# Build a frame from three Series with different label sets; labels absent from
# a Series become NaN in its column. Values are unseeded standard-normal draws.
df = pd.DataFrame({
    'One': pd.Series(np.random.randn(4), index=list('abcd')),
    'Two': pd.Series(np.random.randn(3), index=list('acd')),
    'Three': pd.Series(np.random.randn(3), index=list('bcd')),
})

In [17]:
df

Unnamed: 0,One,Two,Three
a,1.008765,-0.175352,
b,1.748897,,0.624603
c,1.078679,-0.866894,-0.950405
d,-1.57386,0.170444,0.322723


In [18]:
# Series to use as the operand: extract one row (position 1) from the DataFrame.
row = df.iloc[1]

In [19]:
row

One      1.748897
Two           NaN
Three    0.624603
Name: b, dtype: float64

In [20]:
# Subtract `row` from every row of df; axis='columns' matches row's labels
# against df's column labels. 'Two' comes out entirely NaN because row['Two'] is NaN.
df.sub(row, axis='columns')

Unnamed: 0,One,Two,Three
a,-0.740132,,
b,0.0,,0.0
c,-0.670218,,-1.575008
d,-3.322757,,-0.301881


In [22]:
# Subtract `row` from every row of df; axis=1 is the integer spelling of axis='columns'.
df.sub(row, axis=1)

Unnamed: 0,One,Two,Three
a,-0.740132,,
b,0.0,,0.0
c,-0.670218,,-1.575008
d,-3.322757,,-0.301881


In [7]:
# Series to use as the operand: extract one column ('One') from the DataFrame.
column = df['One']

In [8]:
column

a    2.371463
b   -2.182147
c   -0.555885
d   -1.374381
Name: One, dtype: float64

In [25]:
# Subtract `column` from every column of df; axis=0 matches column's labels
# against df's row index, so the 'One' column becomes all zeros.
df.sub(column, axis=0)

Unnamed: 0,One,Two,Three
a,0.0,-1.184117,
b,0.0,,-1.124294
c,0.0,-1.945573,-2.029084
d,0.0,1.744304,1.896582


In [26]:
# Subtract `column` from every column of df; axis='index' is the string spelling of axis=0.
df.sub(column, axis='index')

Unnamed: 0,One,Two,Three
a,0.0,-1.184117,
b,0.0,,-1.124294
c,0.0,-1.945573,-2.029084
d,0.0,1.744304,1.896582


In [3]:
# Make a copy of the DataFrame so the index can be replaced without touching df.
m_df = df.copy()

In [4]:
# Replace the copy's index with a two-level MultiIndex whose levels are named 'first' and 'second'.
m_df.index = pd.MultiIndex.from_tuples(
    [
        (1, 'a'),
        (1, 'b'),
        (1, 'c'),
        (2, 'a'),
    ],
    names=['first', 'second'],
)
m_df

Unnamed: 0_level_0,Unnamed: 1_level_0,One,Two,Three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,2.371463,0.820267,
1,b,-2.182147,,1.225875
1,c,-0.555885,-1.185652,-0.985316
2,a,-1.374381,-0.908853,-1.045511


In [5]:
print(m_df.to_markdown())

|          |       One |        Two |      Three |
|:---------|----------:|-----------:|-----------:|
| (1, 'a') |  2.37146  |   0.820267 | nan        |
| (1, 'b') | -2.18215  | nan        |   1.22587  |
| (1, 'c') | -0.555885 |  -1.18565  |  -0.985316 |
| (2, 'a') | -1.37438  |  -0.908853 |  -1.04551  |


In [11]:
# Subtract `column` from every column of m_df, matching column's labels against
# the 'second' level of the MultiIndex; the value for 'a' is therefore applied
# to both (1, 'a') and (2, 'a').
m_dfs = m_df.sub(column, axis=0, level='second')
# An assignment renders no output; end the cell with the name so the result is displayed.
m_dfs

Unnamed: 0_level_0,Unnamed: 1_level_0,One,Two,Three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.0,-1.551196,
1,b,0.0,,3.408022
1,c,0.0,-0.629766,-0.42943
2,a,-3.745844,-3.280316,-3.416973


In [10]:
print(m_dfs.to_markdown())

|          |      One |        Two |     Three |
|:---------|---------:|-----------:|----------:|
| (1, 'a') |  0       |  -1.5512   | nan       |
| (1, 'b') |  0       | nan        |   3.40802 |
| (1, 'c') |  0       |  -0.629766 |  -0.42943 |
| (2, 'a') | -3.74584 |  -3.28032  |  -3.41697 |


In [42]:
# Create a Series holding the ten integers 0 through 9 (the stop value 10 is excluded).
s = pd.Series(np.arange(0, 10))

In [43]:
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [44]:
# divmod on a Series with a scalar divisor returns two Series: the element-wise
# floor-division quotient and the element-wise remainder.
div, rem = divmod(s, 3)

In [45]:
# Quotient of s divided by 3.
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [46]:
# Remainder of s divided by 3.
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

In [48]:
# divmod with a list of divisors of the same length as s: the element at each
# position of s is divided by the divisor at the same position in the list.
div, rem = divmod(s, [2,2,2,3,3,3,4,4,4,5])

In [49]:
div

0    0
1    0
2    1
3    1
4    1
5    1
6    1
7    1
8    2
9    1
dtype: int64

In [51]:
rem

0    0
1    1
2    0
3    0
4    1
5    2
6    2
7    3
8    0
9    4
dtype: int64

In [12]:
# Build a frame from per-column {row label: value} mappings; a label missing
# from a column's mapping ('d' in One, 'a' in Three) becomes NaN.
df = pd.DataFrame({
    'One': dict(a=1.394981, b=0.343054, c=0.695246),
    'Two': dict(a=1.772517, b=1.912123, c=1.478369, d=0.279344),
    'Three': dict(b=-0.050390, c=1.227435, d=-0.613172),
})

In [13]:
df

Unnamed: 0,One,Two,Three
a,1.394981,1.772517,
b,0.343054,1.912123,-0.05039
c,0.695246,1.478369,1.227435
d,,0.279344,-0.613172


In [14]:
# Second frame built from per-column {row label: value} mappings; it differs
# from the one above only in that 'Three' has a value (1.0) for row 'a'.
df2 = pd.DataFrame({
    'One': dict(a=1.394981, b=0.343054, c=0.695246),
    'Two': dict(a=1.772517, b=1.912123, c=1.478369, d=0.279344),
    'Three': dict(a=1.000000, b=-0.050390, c=1.227435, d=-0.613172),
})

In [15]:
df2

Unnamed: 0,One,Two,Three
a,1.394981,1.772517,1.0
b,0.343054,1.912123,-0.05039
c,0.695246,1.478369,1.227435
d,,0.279344,-0.613172


In [20]:
print(df2.to_markdown())

|    |        One |      Two |     Three |
|:---|-----------:|---------:|----------:|
| a  |   1.39498  | 1.77252  |  1        |
| b  |   0.343054 | 1.91212  | -0.05039  |
| c  |   0.695246 | 1.47837  |  1.22744  |
| d  | nan        | 0.279344 | -0.613172 |


In [23]:
# Element-wise addition aligned on row and column labels. A cell is NaN when
# the value is missing in either operand: a/Three is NaN because df has no
# value there, even though df2 does.
df + df2

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [26]:
# Addition with fill_value=0: a value missing in only one frame is treated as 0
# before adding (a/Three becomes 0 + 1.0). d/One is missing in both frames, so
# it stays NaN.
df.add(df2, fill_value=0)

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,1.0
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [29]:
# Addition with fill_value=1: a value missing in only one frame is treated as 1
# before adding (a/Three becomes 1 + 1.0). d/One is missing in both frames, so
# it stays NaN.
df.add(df2, fill_value=1)

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,2.0
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344
