In [1]:
# %load mysettings.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Display defaults for the whole notebook: compact floats, short tables.
np.set_printoptions(suppress=True, precision=2)
pd.set_option("display.max_rows", 15)
pd.set_option("display.precision", 2)

%matplotlib inline


In [2]:
# One-dimensional NumPy array: values only, no index labels.
arr = np.array([2, 4, -1, 5])
print(arr)

[ 2  4 -1  5]


In [3]:
# The same values as a pandas Series; with no index given, rows are labelled 0..n-1.
ser = pd.Series([2, 4, -1, 5])
ser

0    2
1    4
2   -1
3    5
dtype: int64

In [4]:
# The default index is a RangeIndex.
ser.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Underlying data as a NumPy array.
ser.to_numpy()

array([ 2,  4, -1,  5], dtype=int64)

In [6]:
# Attach explicit string labels; the labels do not have to be sorted.
ser = pd.Series([2, 4, -1, 5], index=["d", "c", "a", "b"])
ser

d    2
c    4
a   -1
b    5
dtype: int64

In [7]:
# String labels are stored in an object-dtype Index.
ser.index

Index(['d', 'c', 'a', 'b'], dtype='object')

In [8]:
ser["a"] # explicit index: look up by label

-1

In [9]:
# Implicit (positional) index: third element, regardless of its label.
# Use .iloc for this; a plain integer key in ser[...] on a label-indexed Series
# relies on a positional fallback that pandas has deprecated.
ser.iloc[2]

-1

In [10]:
# Fix the seed of NumPy's global random state so the draws that follow are repeatable.
np.random.seed(524)

In [11]:
# Ten integers drawn from [1, 1000); the upper bound is exclusive.
np.random.randint(1, 1000, 10)

array([446, 533, 796, 775,  54, 706, 669, 852, 299, 207])

In [12]:
# Re-seeding with the same value replays the identical sequence of draws.
np.random.seed(524)
np.random.randint(1, 1000, 10)

array([446, 533, 796, 775,  54, 706, 669, 852, 299, 207])

In [19]:
# Dict of equal-length lists -> DataFrame; the dict keys become the column names.
pd.DataFrame({
    "col1": [1, 2, 3],
    "col2": ["a", "b", "c"],
    "col3": [True, False, True]
})

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,False
2,3,c,True


In [20]:
# Columns of unequal length cannot be aligned, so pandas rejects the input.
# The error is the point of this cell: catch it and show it, so that
# "Restart Kernel -> Run All" does not stop here.
try:
    pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": ["a", "b", "c"],
        "col3": [True, False]
    })
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: arrays must all be same length

In [18]:
# A scalar value is broadcast to every row of its column.
pd.DataFrame({
    "col1": [1, 2, 3],
    "col2": ["a", "b", "c"],
    "col3": True
})

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,True
2,3,c,True


In [22]:
# Without columns=, the column order follows the order of the dict keys.
pd.DataFrame({
    "col2": ["a", "b", "c"],
    "col1": [1, 2, 3],
    "col3": [True, False, True]
})

Unnamed: 0,col2,col1,col3
0,a,1,True
1,b,2,False
2,c,3,True


In [25]:
# columns= sets the column order explicitly, regardless of the dict order.
pd.DataFrame({
    "col2": ["a", "b", "c"],
    "col1": [1, 2, 3],
    "col3": [True, False, True]
}, columns = ["col1", "col2", "col3"])

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,False
2,3,c,True


In [26]:
# Keys that are not listed in columns= (here col3) are left out of the result.
pd.DataFrame({
    "col2": ["a", "b", "c"],
    "col1": [1, 2, 3],
    "col3": [True, False, True]
}, columns = ["col1", "col2"])

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


In [27]:
# A name in columns= with no matching dict key (col4) produces a column of missing values.
pd.DataFrame({
    "col2": ["a", "b", "c"],
    "col1": [1, 2, 3],
    "col3": [True, False, True]
}, columns = ["col1", "col2", "col4"])

Unnamed: 0,col1,col2,col4
0,1,a,
1,2,b,
2,3,c,


In [32]:
# Three-column frame with default row labels; columns= fixes the display order.
df = pd.DataFrame(
    {"col2": ["a", "b", "c"], "col1": [1, 2, 3], "col3": [True, False, True]},
    columns=["col1", "col2", "col3"],
)
df

Unnamed: 0,col1,col2,col3
0,1,a,True
1,2,b,False
2,3,c,True


In [33]:
# No index was given, so the rows are labelled with a RangeIndex.
df.index

RangeIndex(start=0, stop=3, step=1)

In [29]:
# Same frame as before, this time with explicit row labels supplied via index=.
df = pd.DataFrame(
    {"col2": ["a", "b", "c"], "col1": [1, 2, 3], "col3": [True, False, True]},
    index=["row1", "row2", "row3"],
    columns=["col1", "col2", "col3"],
)
df

Unnamed: 0,col1,col2,col3
row1,1,a,True
row2,2,b,False
row3,3,c,True


In [30]:
# Row labels.
df.index

Index(['row1', 'row2', 'row3'], dtype='object')

In [31]:
# Column labels.
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [43]:
# Series with a non-contiguous integer index: the labels are 0, 2 and 4.
obj3 = pd.Series(["blue", "pupple", "yellow"], index = [0, 2, 4])
obj3

0      blue
2    pupple
4    yellow
dtype: object

In [52]:
# Explicit integer labels are stored in an integer Index rather than a RangeIndex.
obj3.index

Int64Index([0, 2, 4], dtype='int64')

In [51]:
# On an integer index, [] looks up by label: 2 and 4 are labels here, not positions.
obj3[0], obj3[2], obj3[4]

('blue', 'pupple', 'yellow')

In [49]:
# .loc is always label-based.
obj3.loc[0], obj3.loc[2], obj3.loc[4]

('blue', 'pupple', 'yellow')

In [50]:
# .iloc is always position-based (0, 1, 2).
obj3.iloc[0], obj3.iloc[1], obj3.iloc[2]

('blue', 'pupple', 'yellow')

In [53]:
# Series with the default integer index 0..2.
ser = pd.Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int32

In [54]:
# With an integer index, [] looks up the *label* -1, which does not exist.
# The error is the demonstration: catch and show it so the notebook still
# runs from top to bottom.
try:
    ser[-1]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: -1

In [55]:
# .iloc is positional, so -1 means the last element.
ser.iloc[-1]

2

In [56]:
# Same values, now with string labels.
ser = pd.Series(np.arange(3), index = ["a", "b", "c"])
ser

a    0
b    1
c    2
dtype: int32

In [57]:
# Last element by position. On a string-labelled Series, ser[-1] used to fall
# back to positional lookup, but pandas has deprecated that fallback, so use
# .iloc for positions.
ser.iloc[-1]

2

In [58]:
# Back to the default integer index for the slicing comparison.
ser = pd.Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int32

In [62]:
# Slicing with [] and .iloc is positional and excludes the end point;
# .loc slices by label and includes the end label.
print(ser[:1])
print(ser.iloc[:1])
print(ser.loc[:1])

0    0
dtype: int32
0    0
dtype: int32
0    0
1    1
dtype: int32


In [63]:
# NumPy array of lowercase names for the string-method examples.
names = np.array(["steve", "john", "tom"])

In [64]:
"steve".title()

'Steve'

In [65]:
# The same method called through the class, passing the string explicitly.
str.title("steve")

'Steve'

In [66]:
# str methods operate on a single str, not on an array of strings.
# The error is the demonstration: catch and show it so the notebook still
# runs from top to bottom.
try:
    str.title(names)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: descriptor 'title' requires a 'str' object but received a 'numpy.ndarray'

In [68]:
# Apply the string method to each element in turn.
[name.title() for name in names]

['Steve', 'John', 'Tom']

In [69]:
# Series whose index labels are not in alphabetical order.
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
obj

d    0
a    1
b    2
c    3
dtype: int32

In [70]:
# Order the rows by index label.
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [71]:
# Order the rows by value.
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int32

In [72]:
# rank() numbers the values from 1 (smallest); tied values share the average of their ranks.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [73]:
# Load a saved price table and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
price = pd.read_pickle("yahoo_price.pkl")
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.99,313.06,113.3,25.88
2010-01-05,28.04,311.68,111.94,25.89
2010-01-06,27.59,303.83,111.21,25.73
2010-01-07,27.54,296.75,110.82,25.47
2010-01-08,27.72,300.71,111.94,25.64


In [74]:
# Rows are indexed by date (a DatetimeIndex named "Date").
price.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2016-10-10', '2016-10-11', '2016-10-12', '2016-10-13',
               '2016-10-14', '2016-10-17', '2016-10-18', '2016-10-19',
               '2016-10-20', '2016-10-21'],
              dtype='datetime64[ns]', name='Date', length=1714, freq=None)

In [75]:
# One column per ticker symbol.
price.columns

Index(['AAPL', 'GOOG', 'IBM', 'MSFT'], dtype='object')

In [80]:
# Show four decimal places from here on.
pd.set_option("display.precision", 4)

In [82]:
# Fractional change from one row to the next, computed per column.
returns = price.pct_change()

In [83]:
# Pairwise correlation matrix of the return columns.
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.4079,0.3868,0.3897
GOOG,0.4079,1.0,0.4051,0.4659
IBM,0.3868,0.4051,1.0,0.4998
MSFT,0.3897,0.4659,0.4998,1.0


In [88]:
# Correlation of each ticker's returns with IBM's, ascending,
# with IBM's correlation with itself removed.
cor_IBM = (
    returns
    .corrwith(returns["IBM"])
    .sort_values()
    .drop("IBM")
)

In [89]:
# Correlations with IBM, weakest first.
cor_IBM

AAPL    0.3868
GOOG    0.4051
MSFT    0.4998
dtype: float64

In [92]:
# Reproducible 5x4 table of integers drawn from 1..10 (the upper bound 11 is exclusive).
np.random.seed(349)
data = pd.DataFrame(
    np.random.randint(1, 11, 20).reshape(5, -1),
    columns=["COL1", "COL2", "COL3", "COL4"],
)
data

Unnamed: 0,COL1,COL2,COL3,COL4
0,8,4,2,3
1,2,3,1,7
2,6,8,2,9
3,5,3,5,10
4,10,6,9,2


In [94]:
# Distinct values present anywhere in the table, sorted.
np.unique(data.values)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [96]:
# Count how often each value occurs in every column; values absent from a
# column come back as NaN and are replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,COL1,COL2,COL3,COL4
1,0.0,0.0,1.0,0.0
2,1.0,0.0,2.0,1.0
3,0.0,2.0,0.0,1.0
4,0.0,1.0,0.0,0.0
5,1.0,0.0,1.0,0.0
6,1.0,1.0,0.0,0.0
7,0.0,0.0,0.0,1.0
8,1.0,1.0,0.0,0.0
9,0.0,0.0,1.0,1.0
10,1.0,0.0,0.0,1.0
