In [1]:
import pandas as pd
import numpy as np


In [4]:
# 인덱스 생성(시계열)
t_idx = pd.date_range('2020-01-01', periods=8, freq='M')
t_idx

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31'],
              dtype='datetime64[ns]', freq='M')

In [5]:
# 시리즈 생성
s = pd.Series(np.random.randn(8),index = t_idx)
s

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [6]:
# 데이터프레임 생성
df = pd.DataFrame({'One' : [1,2,3,4,5,6,7,8], 'Two' : [8,7,6,5,4,3,2,1,]}, index=t_idx)
df

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4
2020-06-30,6,3
2020-07-31,7,2
2020-08-31,8,1


In [7]:
s.head()

2020-01-31   -0.947804
2020-02-29    0.621019
2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
Freq: M, dtype: float64

In [8]:
s.tail(6)

2020-03-31    0.526869
2020-04-30    1.208103
2020-05-31   -0.112500
2020-06-30   -0.739076
2020-07-31    1.419800
2020-08-31   -2.781923
Freq: M, dtype: float64

In [10]:
# 시리즈 shape
s.shape

(8,)

In [11]:
# 데이터프레임 shape
df.shape

(8, 2)

In [12]:
# 인덱스 shape
t_idx.shape

(8,)

In [15]:
# Print the first three rows (positions 0-2; the stop index 3 is excluded)
df[:3]

Unnamed: 0,One,Two
2020-01-31,1,8
2020-02-29,2,7
2020-03-31,3,6


In [16]:
# Print the rows at positions 2-4 (the stop index 5 is excluded)
df[2:5]

Unnamed: 0,One,Two
2020-03-31,3,6
2020-04-30,4,5
2020-05-31,5,4


In [17]:
# 시리즈 내부의 값
s.array

<PandasArray>
[ -0.9478040550309271,   0.6210189401447063,   0.5268690969841191,
   1.2081027538889946, -0.11250048407157869,  -0.7390759558578995,
   1.4198001438584995,   -2.781922728141299]
Length: 8, dtype: float64

In [18]:
# 시리즈 인덱스 값(시계열)
s.index.array

<DatetimeArray>
['2020-01-31 00:00:00', '2020-02-29 00:00:00', '2020-03-31 00:00:00',
 '2020-04-30 00:00:00', '2020-05-31 00:00:00', '2020-06-30 00:00:00',
 '2020-07-31 00:00:00', '2020-08-31 00:00:00']
Length: 8, dtype: datetime64[ns]

In [20]:
# to_numpy 메소드 사용
s.to_numpy()

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [21]:
# np.asarray 메소드 사용
np.asarray(s)

array([-0.94780406,  0.62101894,  0.5268691 ,  1.20810275, -0.11250048,
       -0.73907596,  1.41980014, -2.78192273])

In [24]:
# 타임존을 설정한 시계열 시리즈 생성
t_s = pd.Series(pd.date_range('2020', periods=2, tz='CET'))

In [25]:
t_s

0   2020-01-01 00:00:00+01:00
1   2020-01-02 00:00:00+01:00
dtype: datetime64[ns, CET]

In [28]:
# 자료형 Object 
t_s.to_numpy(dtype=object)

array([Timestamp('2020-01-01 00:00:00+0100', tz='CET', freq='D'),
       Timestamp('2020-01-02 00:00:00+0100', tz='CET', freq='D')],
      dtype=object)

In [29]:
# 자료형 datetime64[ns]
t_s.to_numpy(dtype='datetime64[ns]')

array(['2019-12-31T23:00:00.000000000', '2020-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')

In [30]:
df.to_numpy()

array([[1, 8],
       [2, 7],
       [3, 6],
       [4, 5],
       [5, 4],
       [6, 3],
       [7, 2],
       [8, 1]])

In [34]:
# 정수, 소수, 문자형이 섞여있는 데이터프레임 생성
df2 = pd.DataFrame({'A' : [1,2,np.nan,'a'],
                   'B' : [3,4,5,6]})

In [35]:
df2

Unnamed: 0,A,B
0,1,3
1,2,4
2,,5
3,a,6


In [36]:
df2.to_numpy()

array([[1, 3],
       [2, 4],
       [nan, 5],
       ['a', 6]], dtype=object)

In [37]:
# 정수와 누락값(소수)으로 구성되어 있는 데이터프레임 생성
df3 = pd.DataFrame({'A' : [1,2,np.nan,3], 
                   'B' : [4,5,6,7]}) 

In [38]:
df3

Unnamed: 0,A,B
0,1.0,4
1,2.0,5
2,,6
3,3.0,7


In [44]:
df3.to_numpy()

array([[ 1.,  4.],
       [ 2.,  5.],
       [nan,  6.],
       [ 3.,  7.]])

In [4]:
df = pd.read_csv('../Pandas/premier_league.csv')

In [5]:
df

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,TottenhamHotspur,ManchesterCity,0,0,D,2010-2011
1,AstonVilla,WestHamUnited,3,0,H,2010-2011
2,BlackburnRovers,Everton,1,0,H,2010-2011
3,BoltonWanderers,Fulham,0,0,D,2010-2011
4,Sunderland,BirminghamCity,2,2,D,2010-2011
...,...,...,...,...,...,...
3663,Liverpool,Southampton,4,0,H,
3664,NewcastleUnited,NorwichCity,0,0,D,
3665,Watford,Everton,2,3,A,
3666,WestHamUnited,BrightonandHoveAlbion,3,3,D,


In [11]:
import bottleneck as bn
import time 

In [12]:
# Time a single bottleneck nanmean call. perf_counter() is the high-resolution
# clock intended for short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
mean_home_goals = bn.nanmean(df['home_goals'])
elapsed = time.perf_counter() - start
print(mean_home_goals, elapsed)

1.5517993456924755 0.0002181529998779297


In [15]:
# Time a single NumPy nanmean call. perf_counter() is the high-resolution
# clock intended for short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
mean_home_goals = np.nanmean(df['home_goals'])
elapsed = time.perf_counter() - start
print(mean_home_goals, elapsed)

1.5517993456924755 0.00026988983154296875


In [2]:
df = pd.DataFrame({'One' : pd.Series(np.random.randn(4), index=['a','b','c','d']),
                    'Two' : pd.Series(np.random.randn(3), index=['a','c','d']),
                   'Three' : pd.Series(np.random.randn(3), index=['b','c','d'])})

In [17]:
df

Unnamed: 0,One,Two,Three
a,1.008765,-0.175352,
b,1.748897,,0.624603
c,1.078679,-0.866894,-0.950405
d,-1.57386,0.170444,0.322723


In [18]:
# 연산을 할 시리즈 생성(데이터프레임에서 한 행을 추출)
row = df.iloc[1]

In [19]:
row

One      1.748897
Two           NaN
Three    0.624603
Name: b, dtype: float64

In [20]:
df.sub(row, axis='columns')

Unnamed: 0,One,Two,Three
a,-0.740132,,
b,0.0,,0.0
c,-0.670218,,-1.575008
d,-3.322757,,-0.301881


In [22]:
df.sub(row, axis=1)

Unnamed: 0,One,Two,Three
a,-0.740132,,
b,0.0,,0.0
c,-0.670218,,-1.575008
d,-3.322757,,-0.301881


In [7]:
# 연산을 할 시리즈 생성(데이터프레임에서 한 열을 추출)
column = df['One']

In [8]:
column

a    2.371463
b   -2.182147
c   -0.555885
d   -1.374381
Name: One, dtype: float64

In [25]:
df.sub(column, axis=0)

Unnamed: 0,One,Two,Three
a,0.0,-1.184117,
b,0.0,,-1.124294
c,0.0,-1.945573,-2.029084
d,0.0,1.744304,1.896582


In [26]:
df.sub(column, axis='index')

Unnamed: 0,One,Two,Three
a,0.0,-1.184117,
b,0.0,,-1.124294
c,0.0,-1.945573,-2.029084
d,0.0,1.744304,1.896582


In [3]:
# 데이터프레임 사본 생성
m_df = df.copy()

In [4]:
# 멀티인덱스 생성  
m_df.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], names=['first','second'])
m_df

Unnamed: 0_level_0,Unnamed: 1_level_0,One,Two,Three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,2.371463,0.820267,
1,b,-2.182147,,1.225875
1,c,-0.555885,-1.185652,-0.985316
2,a,-1.374381,-0.908853,-1.045511


In [5]:
print(m_df.to_markdown())

|          |       One |        Two |      Three |
|:---------|----------:|-----------:|-----------:|
| (1, 'a') |  2.37146  |   0.820267 | nan        |
| (1, 'b') | -2.18215  | nan        |   1.22587  |
| (1, 'c') | -0.555885 |  -1.18565  |  -0.985316 |
| (2, 'a') | -1.37438  |  -0.908853 |  -1.04551  |


In [11]:
m_dfs = m_df.sub(column, axis=0, level='second')

Unnamed: 0_level_0,Unnamed: 1_level_0,One,Two,Three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.0,-1.551196,
1,b,0.0,,3.408022
1,c,0.0,-0.629766,-0.42943
2,a,-3.745844,-3.280316,-3.416973


In [10]:
print(m_dfs.to_markdown())

|          |      One |        Two |     Three |
|:---------|---------:|-----------:|----------:|
| (1, 'a') |  0       |  -1.5512   | nan       |
| (1, 'b') |  0       | nan        |   3.40802 |
| (1, 'c') |  0       |  -0.629766 |  -0.42943 |
| (2, 'a') | -3.74584 |  -3.28032  |  -3.41697 |


In [42]:
# Create a Series holding the integers 0 through 9
s = pd.Series(np.arange(10))

In [43]:
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [44]:
div, rem = divmod(s, 3)

In [45]:
# s를 3으로 나눈 몫
div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int64

In [46]:
# s를 3으로 나눈 나머지  
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int64

In [48]:
div, rem = divmod(s, [2,2,2,3,3,3,4,4,4,5])

In [49]:
div

0    0
1    0
2    1
3    1
4    1
5    1
6    1
7    1
8    2
9    1
dtype: int64

In [51]:
rem

0    0
1    1
2    0
3    0
4    1
5    2
6    2
7    3
8    0
9    4
dtype: int64

In [15]:
df = pd.DataFrame({'One' : {'a': 1.394981, 'b' : 0.343054, 'c' : 0.695246},
                  'Two' : {'a' : 1.772517, 'b' : 1.912123, 'c' : 1.478369, 'd' : 0.279344},
                  'Three' : {'b' : -0.050390, 'c' : 1.227435, 'd' : -0.613172}})

In [13]:
df

Unnamed: 0,One,Two,Three
a,1.394981,1.772517,
b,0.343054,1.912123,-0.05039
c,0.695246,1.478369,1.227435
d,,0.279344,-0.613172


In [3]:
df2 = pd.DataFrame({'One' : {'a' : 1.394981, 'b' : 0.343054, 'c' : 0.695246},
                   'Two' : {'a' : 1.772517, 'b' : 1.912123, 'c' : 1.478369, 'd' : 0.279344},
                   'Three' : {'a' : 1.000000 , 'b' : -0.050390, 'c' : 1.227435, 'd' : -0.613172}})

In [15]:
df2

Unnamed: 0,One,Two,Three
a,1.394981,1.772517,1.0
b,0.343054,1.912123,-0.05039
c,0.695246,1.478369,1.227435
d,,0.279344,-0.613172


In [20]:
print(df2.to_markdown())

|    |        One |      Two |     Three |
|:---|-----------:|---------:|----------:|
| a  |   1.39498  | 1.77252  |  1        |
| b  |   0.343054 | 1.91212  | -0.05039  |
| c  |   0.695246 | 1.47837  |  1.22744  |
| d  | nan        | 0.279344 | -0.613172 |


In [23]:
df + df2

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [26]:
df.add(df2, fill_value=0)

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,1.0
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [29]:
df.add(df2, fill_value=1)

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,2.0
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [14]:
df.eq(df2)

Unnamed: 0,One,Two,Three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [15]:
df.ne(df2)

Unnamed: 0,One,Two,Three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [18]:
df.lt(df2)

Unnamed: 0,One,Two,Three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [7]:
df.gt(df2)

Unnamed: 0,One,Two,Three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [8]:
df.le(df2)

Unnamed: 0,One,Two,Three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [9]:
df.ge(df2)

Unnamed: 0,One,Two,Three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [19]:
(df > 0).all()

One      False
Two       True
Three    False
dtype: bool

In [20]:
(df > 0).any()

One      True
Two      True
Three    True
dtype: bool

In [22]:
(df > 0).any().any()

True

In [23]:
df.empty

False

In [24]:
pd.DataFrame(columns=list('ABC')).empty

True

In [25]:
pd.Series([True]).bool()

True

In [26]:
pd.Series([False]).bool()

False

In [28]:
pd.DataFrame([[True]]).bool()

True

In [29]:
pd.DataFrame([[False]]).bool()

False

In [43]:
# Truth value of the top-left cell. A single iloc call with (row, column)
# positions avoids chained indexing and the deprecated integer-as-position
# lookup on the label-indexed row Series.
bool(df.iloc[0, 0])

True

In [44]:
df + df

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [45]:
df * 2

Unnamed: 0,One,Two,Three
a,2.789962,3.545034,
b,0.686108,3.824246,-0.10078
c,1.390492,2.956738,2.45487
d,,0.558688,-1.226344


In [49]:
# NaN never compares equal to NaN, so cells that are NaN on both sides come out False.
df + df == df * 2

Unnamed: 0,One,Two,Three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [50]:
(df + df).equals(df * 2)

True

In [51]:
pd.Series(['One', 'Two', 'Three']) == 'Two'

0    False
1     True
2    False
dtype: bool

In [52]:
pd.Index(['Four', 'Five', 'Six']) == 'Five'

array([False,  True, False])

In [53]:
pd.Series(['One', 'Two', 'Three']) == pd.Index(['One', 'Five', 'Six'])

0     True
1    False
2    False
dtype: bool

In [54]:
pd.Series(['One', 'Two', 'Three']) == np.array(['One', 'Five', 'Six'])

0     True
1    False
2    False
dtype: bool

In [56]:
np.array([1,2,3]) == np.array([1])

array([ True, False, False])

In [2]:
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                   'B' : [np.nan, 2., 3., np.nan, 6.]})

In [8]:
print(df1.to_markdown())

|    |   A |   B |
|---:|----:|----:|
|  0 |   1 | nan |
|  1 | nan |   2 |
|  2 |   3 |   3 |
|  3 |   5 | nan |
|  4 | nan |   6 |


In [4]:
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                   'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [9]:
print(df2.to_markdown())

|    |   A |   B |
|---:|----:|----:|
|  0 |   5 | nan |
|  1 |   2 | nan |
|  2 |   4 |   3 |
|  3 | nan |   4 |
|  4 |   3 |   6 |
|  5 |   7 |   8 |


In [10]:
a = df1.combine_first(df2)

In [11]:
print(a.to_markdown())

|    |   A |   B |
|---:|----:|----:|
|  0 |   1 | nan |
|  1 |   2 |   2 |
|  2 |   3 |   3 |
|  3 |   5 |   4 |
|  4 |   3 |   6 |
|  5 |   7 |   8 |


In [12]:
b = df2.combine_first(df1)

In [13]:
print(b.to_markdown())

|    |   A |   B |
|---:|----:|----:|
|  0 |   5 | nan |
|  1 |   2 |   2 |
|  2 |   4 |   3 |
|  3 |   5 |   4 |
|  4 |   3 |   6 |
|  5 |   7 |   8 |


In [17]:
print(df.to_markdown())

|    |        One |      Two |      Three |
|:---|-----------:|---------:|-----------:|
| a  |   1.39498  | 1.77252  | nan        |
| b  |   0.343054 | 1.91212  |  -0.05039  |
| c  |   0.695246 | 1.47837  |   1.22744  |
| d  | nan        | 0.279344 |  -0.613172 |


In [18]:
df.mean(0)

One      0.811094
Two      1.360588
Three    0.187958
dtype: float64

In [19]:
df.mean(1)

a    1.583749
b    0.734929
c    1.133683
d   -0.166914
dtype: float64

In [21]:
df.sum(0, skipna=False)

One           NaN
Two      5.442353
Three         NaN
dtype: float64

In [22]:
df.sum(axis=1, skipna=True)

a    3.167498
b    2.204787
c    3.401050
d   -0.333828
dtype: float64

In [23]:
ts_stand = (df - df.mean(0)) / df.std()

In [24]:
ts_stand.std()

One      1.0
Two      1.0
Three    1.0
dtype: float64

In [25]:
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

In [26]:
xs_stand

Unnamed: 0,One,Two,Three
a,-0.707107,0.707107,
b,-0.377425,1.133785,-0.756361
c,-1.096393,0.86195,0.234443
d,,0.707107,-0.707107


In [29]:
print(xs_stand.to_markdown())

|    |        One |      Two |      Three |
|:---|-----------:|---------:|-----------:|
| a  |  -0.707107 | 0.707107 | nan        |
| b  |  -0.377425 | 1.13379  |  -0.756361 |
| c  |  -1.09639  | 0.86195  |   0.234443 |
| d  | nan        | 0.707107 |  -0.707107 |


In [28]:
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

In [33]:
df.cumsum()

Unnamed: 0,One,Two,Three
a,1.394981,1.772517,
b,1.738035,3.68464,-0.05039
c,2.433281,5.163009,1.177045
d,,5.442353,0.563873


In [36]:
np.mean(df['One'])

0.8110936666666667

In [38]:
np.mean(df['One'].to_numpy())

nan

In [39]:
Series = pd.Series(np.random.randn(500))

In [41]:
Series[20:200] = np.nan

In [42]:
Series[10:20] = 5

In [43]:
Series.nunique()

311

In [44]:
series = pd.Series(np.random.randn(1000))

In [45]:
series[::2] = np.nan

In [47]:
series.describe()

count    500.000000
mean      -0.104046
std        0.973392
min       -3.459899
25%       -0.705669
50%       -0.057655
75%        0.559557
max        3.115283
dtype: float64

In [48]:
df = pd.DataFrame(np.random.randn(1000,5), columns=['a','b','c','d','e'])

In [50]:
df.iloc[::2] = np.nan

In [52]:
a = df.describe()

In [53]:
print(a.to_markdown())

|       |           a |           b |           c |           d |           e |
|:------|------------:|------------:|------------:|------------:|------------:|
| count | 500         | 500         | 500         | 500         | 500         |
| mean  |  -0.0098391 |  -0.0217725 |  -0.0133153 |   0.0329154 |   0.0208793 |
| std   |   1.04484   |   1.02411   |   1.03689   |   0.991465  |   1.01293   |
| min   |  -3.15347   |  -2.56002   |  -2.98677   |  -2.66778   |  -2.99676   |
| 25%   |  -0.736796  |  -0.682348  |  -0.716998  |  -0.616957  |  -0.754231  |
| 50%   |   0.0153571 |  -0.124131  |  -0.0624372 |   0.0335433 |   0.0098666 |
| 75%   |   0.683469  |   0.678965  |   0.686431  |   0.705677  |   0.736733  |
| max   |   3.26103   |   3.05544   |   3.11557   |   3.24525   |   2.79697   |


In [54]:
series.describe(percentiles=[.05, .25, .75, .95])

count    500.000000
mean      -0.104046
std        0.973392
min       -3.459899
5%        -1.804095
25%       -0.705669
50%       -0.057655
75%        0.559557
95%        1.402325
max        3.115283
dtype: float64

In [55]:
s = pd.Series(['a','a','a','b','b',np.nan,'c','d','a'])

In [56]:
s.describe()

count     8
unique    4
top       a
freq      4
dtype: object

In [57]:
df = pd.DataFrame({'a' : ['Yes','Yes','No','No'], 'b' : range(4)})

In [61]:
df.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [68]:
df.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,No
freq,2


In [71]:
df.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [74]:
df.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,No,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [75]:
s1 = pd.Series(np.random.randn(5))

In [76]:
s1

0    0.013833
1    1.375255
2   -0.261117
3   -1.300798
4    0.550127
dtype: float64

In [77]:
s1.idxmin(), s1.idxmax()

(3, 1)

In [78]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A','B','C'])

In [80]:
print(df1.to_markdown())

|    |          A |         B |         C |
|---:|-----------:|----------:|----------:|
|  0 | -0.307946  |  0.96568  |  0.854274 |
|  1 | -0.734701  | -0.763467 |  0.805104 |
|  2 |  0.0317531 |  0.494293 | -0.720657 |
|  3 |  2.08982   |  0.295745 |  1.1382   |
|  4 |  0.260457  |  0.785826 |  0.109045 |


In [81]:
df1.idxmin(axis=0)

A    1
B    1
C    2
dtype: int64

In [82]:
df1.idxmax(axis=1)

0    B
1    C
2    B
3    A
4    B
dtype: object

In [83]:
df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))

In [85]:
print(df3.to_markdown())

|    |   A |
|:---|----:|
| e  |   2 |
| d  |   1 |
| c  |   1 |
| b  |   3 |
| a  | nan |


In [86]:
df3['A'].idxmin()

'd'

In [87]:
data = np.random.randint(0, 7, size=50)

In [88]:
data

array([5, 3, 6, 1, 4, 1, 3, 0, 0, 3, 2, 4, 5, 3, 1, 4, 6, 2, 2, 1, 5, 6,
       6, 0, 0, 4, 6, 5, 1, 4, 6, 3, 1, 3, 4, 6, 0, 4, 3, 6, 3, 6, 2, 2,
       1, 2, 5, 3, 0, 4])

In [89]:
s = pd.Series(data)

In [90]:
s.value_counts()

6    9
3    9
4    8
1    7
2    6
0    6
5    5
dtype: int64

In [92]:
# Count occurrences directly from the NumPy array. The top-level
# pd.value_counts helper is deprecated in favour of the Series method.
pd.Series(data).value_counts()

6    9
3    9
4    8
1    7
2    6
0    6
5    5
dtype: int64

In [93]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

In [94]:
s5.mode()

0    3
1    7
dtype: int64

In [95]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
               "B": np.random.randint(-10, 15, size=50)})

In [98]:
df5.mode()

Unnamed: 0,A,B
0,1,3


In [99]:
arr = np.random.randn(20)

In [100]:
arr

array([-0.99357841,  1.57974431, -0.60909831, -1.36768082, -0.7160791 ,
       -0.02279333, -0.39298072, -1.45827772, -0.02566391,  0.92865122,
        0.82623857,  1.41247274,  1.16325227,  0.92048371, -1.81958929,
        0.73825483, -0.28654464,  1.06215868, -1.01476624,  0.31787862])

In [101]:
factor = pd.cut(arr, 4)

In [102]:
factor

[(-1.823, -0.97], (0.73, 1.58], (-0.97, -0.12], (-1.823, -0.97], (-0.97, -0.12], ..., (0.73, 1.58], (-0.97, -0.12], (0.73, 1.58], (-1.823, -0.97], (-0.12, 0.73]]
Length: 20
Categories (4, interval[float64]): [(-1.823, -0.97] < (-0.97, -0.12] < (-0.12, 0.73] < (0.73, 1.58]]

In [103]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])

In [104]:
factor

[(-1, 0], (1, 5], (-1, 0], (-5, -1], (-1, 0], ..., (0, 1], (-1, 0], (1, 5], (-5, -1], (0, 1]]
Length: 20
Categories (4, interval[int64]): [(-5, -1] < (-1, 0] < (0, 1] < (1, 5]]

In [105]:
arr = np.random.randn(30)

In [106]:
arr

array([ 1.15079381,  0.49939784, -0.15594853, -0.5517254 , -0.23522974,
       -1.69728499, -0.80471605, -0.72416246,  0.79870411,  1.28311701,
       -0.26005877,  0.4855533 ,  0.49194232, -1.89062664, -0.43397157,
        0.77145788, -0.32865781,  0.36810471, -1.76766844, -0.74308291,
       -0.05939578,  1.55420964, -0.82908721,  0.92783387,  0.55435694,
        0.10418023,  0.3697685 , -0.39919188,  1.13068469, -0.15891597])

In [107]:
factor = pd.qcut(arr, [0, .25, .5, .75, 1])

In [108]:
factor

[(0.541, 1.554], (-0.108, 0.541], (-0.522, -0.108], (-1.892, -0.522], (-0.522, -0.108], ..., (-0.108, 0.541], (-0.108, 0.541], (-0.522, -0.108], (0.541, 1.554], (-0.522, -0.108]]
Length: 30
Categories (4, interval[float64]): [(-1.892, -0.522] < (-0.522, -0.108] < (-0.108, 0.541] < (0.541, 1.554]]

In [109]:
# Number of observations per quantile bin, most frequent first. The top-level
# pd.value_counts helper is deprecated in favour of the Series method.
pd.Series(factor).value_counts()

(0.541, 1.554]      8
(-1.892, -0.522]    8
(-0.108, 0.541]     7
(-0.522, -0.108]    7
dtype: int64

In [110]:
arr = np.random.randn(20)

In [111]:
factor = pd.cut(arr, [-np.inf, 0, np.inf])

In [112]:
factor

[(0.0, inf], (0.0, inf], (0.0, inf], (-inf, 0.0], (-inf, 0.0], ..., (-inf, 0.0], (0.0, inf], (0.0, inf], (0.0, inf], (-inf, 0.0]]
Length: 20
Categories (2, interval[float64]): [(-inf, 0.0] < (0.0, inf]]

In [3]:
df = pd.read_csv('premier_league.csv')
df

Unnamed: 0,home_team,away_team,home_goals,away_goals,result,season
0,TottenhamHotspur,ManchesterCity,0,0,D,2010-2011
1,AstonVilla,WestHamUnited,3,0,H,2010-2011
2,BlackburnRovers,Everton,1,0,H,2010-2011
3,BoltonWanderers,Fulham,0,0,D,2010-2011
4,Sunderland,BirminghamCity,2,2,D,2010-2011
...,...,...,...,...,...,...
3663,Liverpool,Southampton,4,0,H,
3664,NewcastleUnited,NorwichCity,0,0,D,
3665,Watford,Everton,2,3,A,
3666,WestHamUnited,BrightonandHoveAlbion,3,3,D,


In [10]:
def extract_city_name(df):
    """Add a ``city_name`` column holding the text before the first comma.

    Each ``city_and_code`` string is split on "," and the first piece is
    kept, so 'Chicago, IL' becomes 'Chicago'. The frame is modified in place
    and the same object is returned, so the call can be chained through
    ``DataFrame.pipe``.
    """
    pieces = df['city_and_code'].str.split(",")
    first_piece = pieces.str.get(0)
    df['city_name'] = first_piece
    return df

def add_country_name(df, country_name=None):
    """Add a ``city_and_country`` column: ``city_name`` followed by the country.

    Parameters
    ----------
    df : DataFrame
        Must already contain a ``city_name`` column.
    country_name : str, optional
        Text appended to every city name. When omitted (``None``) nothing is
        appended, so the new column repeats ``city_name``.

    Returns
    -------
    DataFrame
        The same frame that was passed in; it is modified in place.
    """
    # Adding None to a string column raises, so the default is mapped to an
    # empty suffix before concatenating.
    suffix = '' if country_name is None else country_name
    df['city_and_country'] = df['city_name'] + suffix
    return df

df_p = pd.DataFrame({'city_and_code' : ['Chicago, IL']})

In [14]:
tmp = add_country_name(extract_city_name(df_p), country_name='US')
print(tmp.to_markdown())

|    | city_and_code   | city_name   | city_and_country   |
|---:|:----------------|:------------|:-------------------|
|  0 | Chicago, IL     | Chicago     | ChicagoUS          |


In [15]:
tmp = (df_p.pipe(extract_city_name)
     .pipe(add_country_name, country_name='US'))
print(tmp.to_markdown())

|    | city_and_code   | city_name   | city_and_country   |
|---:|:----------------|:------------|:-------------------|
|  0 | Chicago, IL     | Chicago     | ChicagoUS          |


In [18]:
df['home_goals'].apply(np.mean)

0       0.0
1       3.0
2       1.0
3       0.0
4       2.0
       ... 
3663    4.0
3664    0.0
3665    2.0
3666    3.0
3667    0.0
Name: home_goals, Length: 3668, dtype: float64

In [32]:
df.apply('mean')

home_goals    1.551799
away_goals    1.198201
dtype: float64

In [43]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A','B','C'],
                   index=pd.date_range('1/1/2000', periods=1000))

tsdf.apply(lambda x : x.idxmax())

A   2001-11-28
B   2000-04-24
C   2002-08-15
dtype: datetime64[ns]

In [48]:
def subtract_and_divide(x, sub, divide=1):
    """Return ``x`` minus ``sub``, with that difference divided by ``divide``."""
    difference = x - sub
    return difference / divide

tmp = tsdf.apply(subtract_and_divide, args=(5,), divide=3)
print(tmp.to_markdown())

|                     |         A |         B |         C |
|:--------------------|----------:|----------:|----------:|
| 2000-01-01 00:00:00 | -1.745    | -1.67402  | -1.93762  |
| 2000-01-02 00:00:00 | -1.85958  | -2.01404  | -1.34984  |
| 2000-01-03 00:00:00 | -1.34948  | -1.62634  | -1.2855   |
| 2000-01-04 00:00:00 | -1.8898   | -1.89961  | -1.82673  |
| 2000-01-05 00:00:00 | -0.839412 | -1.496    | -1.11361  |
| 2000-01-06 00:00:00 | -1.9484   | -1.62996  | -1.20595  |
| 2000-01-07 00:00:00 | -1.24549  | -1.12449  | -1.33754  |
| 2000-01-08 00:00:00 | -1.6188   | -1.83663  | -1.88883  |
| 2000-01-09 00:00:00 | -1.56541  | -1.88771  | -1.12292  |
| 2000-01-10 00:00:00 | -1.78074  | -1.80143  | -1.73056  |
| 2000-01-11 00:00:00 | -1.37461  | -1.61463  | -1.79432  |
| 2000-01-12 00:00:00 | -1.82991  | -1.59375  | -2.16066  |
| 2000-01-13 00:00:00 | -1.45812  | -1.41979  | -1.98306  |
| 2000-01-14 00:00:00 | -1.56264  | -1.48197  | -1.71941  |
| 2000-01-15 00:00:00 | -2.12301  | -1.8

In [52]:
print(tsdf.apply(pd.Series.interpolate).to_markdown())

|                     |            A |            B |           C |
|:--------------------|-------------:|-------------:|------------:|
| 2000-01-01 00:00:00 | -0.234998    | -0.0220618   | -0.812864   |
| 2000-01-02 00:00:00 | -0.578754    | -1.04212     |  0.950494   |
| 2000-01-03 00:00:00 |  0.951572    |  0.120993    |  1.14351    |
| 2000-01-04 00:00:00 | -0.669386    | -0.698823    | -0.48018    |
| 2000-01-05 00:00:00 |  2.48176     |  0.512006    |  1.65917    |
| 2000-01-06 00:00:00 | -0.845202    |  0.110124    |  1.38216    |
| 2000-01-07 00:00:00 |  1.26354     |  1.62652     |  0.987382   |
| 2000-01-08 00:00:00 |  0.143601    | -0.509895    | -0.666476   |
| 2000-01-09 00:00:00 |  0.303776    | -0.66313     |  1.63123    |
| 2000-01-10 00:00:00 | -0.342223    | -0.404277    | -0.191693   |
| 2000-01-11 00:00:00 |  0.876158    |  0.156111    | -0.382975   |
| 2000-01-12 00:00:00 | -0.489744    |  0.218755    | -1.48198    |
| 2000-01-13 00:00:00 |  0.625647    |  0.740618