In [1]:
import numpy as np
import pandas as pd

In [2]:
import datetime
from datetime import datetime, date

In [3]:
# Notebook-wide pandas display settings: plain-text reprs, at most 8 columns
# and 10 rows per frame, wrapped at 65 characters.
for _option, _setting in {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 65,
}.items():
    pd.set_option(_option, _setting)

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Seed the legacy global generator so the sample frame is reproducible.
np.random.seed(123456)
# 5 rows x 4 columns of standard-normal draws, columns labelled A-D.
df = pd.DataFrame(
    np.random.standard_normal(size=(5, 4)),
    columns=list('ABCD'),
)
df

          A         B         C         D
0  0.469112 -0.282863 -1.509059 -1.135632
1  1.212112 -0.173215  0.119209 -1.044236
2 -0.861849 -2.104569 -0.494929  1.071804
3  0.721555 -0.706771 -1.039575  0.271860
4 -0.424972  0.567020  0.276232 -1.087401

In [6]:
# Scalar multiplication is applied to every element of the frame.
df * 2

          A         B         C         D
0  0.938225 -0.565727 -3.018117 -2.271265
1  2.424224 -0.346429  0.238417 -2.088472
2 -1.723698 -4.209138 -0.989859  2.143608
3  1.443110 -1.413542 -2.079150  0.543720
4 -0.849945  1.134041  0.552464 -2.174801

In [7]:
# Take the first row by position; it comes back as a Series indexed by the
# column labels A-D.
s = df.iloc[0]
s

A    0.469112
B   -0.282863
C   -1.509059
D   -1.135632
Name: 0, dtype: float64

In [8]:
# DataFrame - Series matches the Series index against the column labels, so
# the first row is subtracted from every row (row 0 becomes all zeros).
diff = df - s
diff

          A         B         C         D
0  0.000000  0.000000  0.000000  0.000000
1  0.743000  0.109649  1.628267  0.091396
2 -1.330961 -1.821706  1.014129  2.207436
3  0.252443 -0.423908  0.469484  1.407492
4 -0.894085  0.849884  1.785291  0.048232

In [9]:
# Same column alignment with the operands swapped: each value is the negation
# of the corresponding df - s value.
diff2 = s - df
diff2

          A         B         C         D
0  0.000000  0.000000  0.000000  0.000000
1 -0.743000 -0.109649 -1.628267 -0.091396
2  1.330961  1.821706 -1.014129 -2.207436
3 -0.252443  0.423908 -0.469484 -1.407492
4  0.894085 -0.849884 -1.785291 -0.048232

In [10]:
# Take the 2nd and 3rd entries (B and C) by position. The explicit .copy()
# makes s2 an independent Series, so the enlargement below cannot be applied
# to (or warn about) a slice of `s`.
s2 = s.iloc[1:3].copy()
# Assigning to a label that does not exist appends it to the Series.
s2['E'] = 0
s2

B   -0.282863
C   -1.509059
E    0.000000
Name: 0, dtype: float64

In [11]:
# Only labels present on both sides (B and C) produce values; A, D and the
# extra label E come out as NaN.
df + s2

    A         B         C   D   E
0 NaN -0.565727 -3.018117 NaN NaN
1 NaN -0.456078 -1.389850 NaN NaN
2 NaN -2.387433 -2.003988 NaN NaN
3 NaN -0.989634 -2.548633 NaN NaN
4 NaN  0.284157 -1.232826 NaN NaN

In [12]:
# Rows at positions 1-3 and columns B, C, selected in a single .loc call
# instead of chaining two [] lookups.
subframe = df.loc[df.index[1:4], ['B', 'C']]
subframe

          B         C
1 -0.173215  0.119209
2 -2.104569 -0.494929
3 -0.706771 -1.039575

In [13]:
# Alignment happens on both axes: only rows 1-3 and columns B, C overlap, so
# those cells are 0 and every other cell is NaN.
df - subframe

    A    B    C   D
0 NaN  NaN  NaN NaN
1 NaN  0.0  0.0 NaN
2 NaN  0.0  0.0 NaN
3 NaN  0.0  0.0 NaN
4 NaN  NaN  NaN NaN

In [14]:
# Select column A as a Series indexed by the row labels.
a_col = df['A']
a_col

0    0.469112
1    1.212112
2   -0.861849
3    0.721555
4   -0.424972
Name: A, dtype: float64

In [15]:
# axis=0 aligns the Series with the row index, so column A is subtracted from
# every column row by row (A itself becomes all zeros).
df.sub(a_col, axis=0)

     A         B         C         D
0  0.0 -0.751976 -1.978171 -1.604745
1  0.0 -1.385327 -1.092903 -2.256348
2  0.0 -1.242720  0.366920  1.933653
3  0.0 -1.428326 -1.761130 -0.449695
4  0.0  0.991993  0.701204 -0.662428

In [16]:
# Small string Series with one missing value, used by the counting examples.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
s = pd.Series(['a', 'a', 'b', 'c', np.nan])
s

0      a
1      a
2      b
3      c
4    NaN
dtype: object

In [17]:
# count() skips the missing value: 4 of the 5 entries are counted.
s.count()

4

In [18]:
# unique() keeps NaN as one of the distinct values.
s.unique()

array(['a', 'b', 'c', nan], dtype=object)

In [19]:
# nunique() leaves NaN out of the distinct-value count by default.
s.nunique()

3

In [20]:
# dropna=False counts NaN as a distinct value as well.
s.nunique(dropna=False)

4

In [21]:
# Frequency of each value; dropna=False adds a row for NaN.
s.value_counts(dropna=False)

a      2
b      1
c      1
NaN    1
dtype: int64

In [22]:
# Load the sample price data; the path is relative to the working directory.
# The frame displayed later in the notebook has Date, MSFT and AAPL columns.
omh = pd.read_csv('Data/omh.csv')

In [23]:
# Column-wise maximum of the two price columns.
omh[['MSFT', 'AAPL']].max()

MSFT     48.84
AAPL    115.93
dtype: float64

In [24]:
# Row label at which each column reaches its minimum.
omh[['MSFT', 'AAPL']].idxmin()

MSFT    11
AAPL    11
dtype: int64

In [25]:
# Row label at which each column reaches its maximum.
omh[['MSFT', 'AAPL']].idxmax()

MSFT    3
AAPL    2
dtype: int64

In [26]:
# The four rows with the smallest MSFT values, then just the MSFT column.
omh.nsmallest(4, ['MSFT'])['MSFT']

11    45.16
12    45.74
21    46.45
10    46.67
Name: MSFT, dtype: float64

In [27]:
# The four rows with the largest MSFT values, then just the MSFT column.
omh.nlargest(4, ['MSFT'])['MSFT']

3     48.84
0     48.62
1     48.46
16    48.45
Name: MSFT, dtype: float64

In [28]:
# Series.nsmallest returns the same four values directly from the column.
omh['MSFT'].nsmallest(4)

11    45.16
12    45.74
21    46.45
10    46.67
Name: MSFT, dtype: float64

In [29]:
# Running product: each element is the product of all values up to that point.
pd.Series([1, 2, 3, 4]).cumprod()

0     1
1     2
2     6
3    24
dtype: int64

In [30]:
# Running sum: each element is the sum of all values up to that point.
pd.Series([1, 2, 3, 4]).cumsum()

0     1
1     3
2     6
3    10
dtype: int64

In [31]:
# Summary statistics per column; only MSFT and AAPL appear in the result.
omh.describe()

            MSFT        AAPL
count  22.000000   22.000000
mean   47.493182  112.411364
std     0.933077    2.388772
min    45.160000  106.750000
25%    46.967500  111.660000
50%    47.625000  112.530000
75%    48.125000  114.087500
max    48.840000  115.930000

In [32]:
# The same summary for a single column, returned as a Series.
omh.MSFT.describe()

count    22.000000
mean     47.493182
std       0.933077
min      45.160000
25%      46.967500
50%      47.625000
75%      48.125000
max      48.840000
Name: MSFT, dtype: float64

In [33]:
# describe() returns a Series, so one statistic can be looked up by its label.
omh.MSFT.describe()['mean']

47.49318181818182

In [34]:
# For non-numeric data describe() reports count, unique, top and freq.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
s = pd.Series(['a', 'a', 'b', 'c', np.nan])
s.describe()

count     4
unique    3
top       a
freq      2
dtype: object

In [38]:
# Display the frame; with display.max_rows set to 10 the middle rows are
# elided from the 22-row output.
omh

          Date   MSFT    AAPL
0   2014-12-01  48.62  115.07
1   2014-12-02  48.46  114.63
2   2014-12-03  48.08  115.93
3   2014-12-04  48.84  115.49
4   2014-12-05  48.42  115.00
..         ...    ...     ...
17  2014-12-24  48.14  112.01
18  2014-12-26  47.88  113.99
19  2014-12-29  47.45  113.91
20  2014-12-30  47.02  112.52
21  2014-12-31  46.45  110.38

[22 rows x 3 columns]

In [37]:
# Mean of each numeric column; Date is not part of the result.
omh.mean(numeric_only=True)

MSFT     47.493182
AAPL    112.411364
dtype: float64

In [41]:
# axis=1 averages across the numeric columns within each row; head() shows the
# first five rows.
omh.mean(axis=1, numeric_only=True).head()

0    81.845
1    81.545
2    82.005
3    82.165
4    81.710
dtype: float64

In [43]:
# Median of each numeric column.
omh.median(numeric_only=True)

MSFT     47.625
AAPL    112.530
dtype: float64

In [45]:
# mode() returns every value tied for the highest frequency; here 1 and 3
# each occur twice, so both are reported.
_observations = [1, 2, 3, 3, 5, 1]
s = pd.Series(_observations)
s.mode()

0    1
1    3
dtype: int64

In [47]:
# Variance of each numeric column.
omh.var(numeric_only=True)

MSFT    0.870632
AAPL    5.706231
dtype: float64

In [48]:
# Standard deviation of each numeric column; the values match the std row of
# omh.describe().
omh.std(numeric_only=True)

MSFT    0.933077
AAPL    2.388772
dtype: float64

In [49]:
# Covariance between the MSFT and AAPL columns, returned as a single float.
omh['MSFT'].cov(omh['AAPL'])

1.9261240259740264

In [50]:
# Correlation between the MSFT and AAPL columns, returned as a single float.
omh.MSFT.corr(omh.AAPL)

0.8641560684381171

In [51]:
# Re-seed so the sample below is reproducible regardless of earlier draws.
np.random.seed(123456)
# 10,000 draws from a normal distribution with mean 0 and standard deviation 1.
dist = np.random.normal(loc=0.0, scale=1.0, size=10_000)
dist

array([ 0.4691123 , -0.28286334, -1.5090585 , ...,  0.26296448,
       -0.83377412, -0.10418135])

In [54]:
# Largest sampled value.
dist.max()

3.6977832344318555

In [55]:
# Sample mean and standard deviation, shown together as a tuple.
dist.mean(), dist.std()

(-0.002863324040906651, 1.008716203199891)

In [56]:
# Split the range of the sample into 5 equal-width intervals and assign each
# value to its interval; the result is an ordered Categorical.
bins = pd.cut(dist, 5)
bins

[(-0.633, 0.81], (-0.633, 0.81], (-2.077, -0.633], (-2.077, -0.633], (0.81, 2.254], ..., (-2.077, -0.633], (-0.633, 0.81], (-0.633, 0.81], (-2.077, -0.633], (-0.633, 0.81]]
Length: 10000
Categories (5, interval[float64, right]): [(-3.528, -2.077] < (-2.077, -0.633] < (-0.633, 0.81] < (0.81, 2.254] < (2.254, 3.698]]

In [57]:
# The intervals that define the bins, as an IntervalIndex.
bins.categories

IntervalIndex([(-3.528, -2.077], (-2.077, -0.633], (-0.633, 0.81], (0.81, 2.254], (2.254, 3.698]], dtype='interval[float64, right]')

In [58]:
# Integer position (into bins.categories) of the bin each value was assigned to.
bins.codes

array([2, 2, 1, ..., 2, 1, 2], dtype=int8)

In [60]:
# Re-seed so the synthetic ages are reproducible.
np.random.seed(123456)
# 50 integer ages drawn from 6 up to (but not including) 45.
ages = np.random.randint(low=6, high=45, size=50)
ages

array([ 7, 33, 38, 29, 42, 14, 16, 16, 18, 17, 26, 28, 44, 40, 20, 12,  8,
       10, 36, 29, 26, 26, 11, 29, 42, 17, 41, 35, 22, 40, 24, 21, 38, 33,
       26, 23, 16, 34, 26, 20, 18, 42, 27, 13, 37, 37, 10,  7, 10, 23])

In [62]:
# Bin edges for pd.cut; each pair of consecutive edges bounds one labelled bin.
ranges = [6, 12, 18, 35, 50]
labels = ['Youth', 'Young Adult', 'Adult', 'Middle Aged']
# Bins are right-closed by default, which would leave an age of exactly 6
# (the smallest value randint(6, 45) can produce) outside every bin.
# include_lowest=True closes the first bin on the left so 6 lands in 'Youth'.
agebins = pd.cut(ages, ranges, labels=labels, include_lowest=True)
# Count and relative frequency of the ages in each labelled bin.
agebins.describe()

             counts  freqs
categories                
Youth             8   0.16
Youn Adult        9   0.18
Adult            21   0.42
Middle Aged      12   0.24

In [63]:
# Quantile-based binning: the edges are chosen so that each of the 5 bins
# receives the same number of samples (2000 each here).
qbin = pd.qcut(dist, 5)
qbin.describe()

                  counts  freqs
categories                     
(-3.522, -0.861]    2000    0.2
(-0.861, -0.241]    2000    0.2
(-0.241, 0.261]     2000    0.2
(0.261, 0.866]      2000    0.2
(0.866, 3.698]      2000    0.2