In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [7]:
obj2['a']

-5

In [8]:
obj2['d'] = 6

In [9]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [10]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [11]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [13]:
import numpy as np
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [14]:
'b' in obj2

True

In [15]:
# State populations keyed by state name; the dict keys become the Series index.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [16]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [17]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [18]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [19]:
obj3 * obj4

California             NaN
Ohio          1.225000e+09
Oregon        2.560000e+08
Texas         5.041000e+09
Utah                   NaN
dtype: float64

<h2>DataFrames</h2>

In [20]:
# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [21]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [22]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [28]:
# Assign one cell through a single .loc[row_label, column_label] call so the
# write lands on frame2 itself (chaining frame2['debt'].loc[...] would go
# through an intermediate Series instead).
frame2.loc['two', 'debt'] = 0.6

In [29]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [30]:
# Nested dict: outer keys become the columns, inner keys become the row index.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [31]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


<h2>5.2 Essential Functionality</h2>

In [32]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [33]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [36]:
obj.reindex(['a', 'b', 'c', 'e'])

a   -5.3
b    7.2
c    3.6
e    NaN
dtype: float64

In [37]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
                    index=['a', 'c', 'd'],
                    columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [39]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [40]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)


Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [44]:
# 'Utah' is not a column of frame2, and .loc raises KeyError when a list of
# labels contains a missing one. reindex accepts missing labels and fills the
# corresponding entries with NaN.
frame2.reindex(index=['a', 'b', 'c', 'd'], columns=states)

In [50]:
frame.reindex(['a', 'b', 'c'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0


In [51]:
# 4x4 grid holding 0..15, rows labelled by place name and columns by ordinal.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [53]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [54]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [55]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [56]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [57]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [58]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [59]:
data.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [60]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [61]:
# Two float-valued frames whose row and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)


In [62]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [63]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [64]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [70]:
np.abs(df1.rsub(1))

Unnamed: 0,b,c,d
Ohio,1.0,0.0,1.0
Texas,2.0,3.0,4.0
Colorado,5.0,6.0,7.0


In [71]:
# Build a 4x3 frame of standard-normal draws labelled by state and column letter.
# NOTE(review): no seed is set in the visible cells before this draw, so the
# values recorded in the outputs that follow will differ on a re-run.
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [72]:
frame

Unnamed: 0,b,d,e
Utah,-1.592478,0.339065,-1.568295
Ohio,0.045507,0.009805,-1.250134
Texas,-0.576656,1.90979,1.764939
Oregon,1.246981,-0.159241,0.265973


In [73]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``.max()`` and ``.min()`` methods (for example a pandas
    Series or a NumPy array).
    """
    return x.max() - x.min()

In [76]:
frame.apply(f, axis='rows')

b    2.839459
d    2.069030
e    3.333234
dtype: float64

In [77]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [78]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.592478,-0.159241,-1.568295
max,1.246981,1.90979,1.764939


In [79]:
frame.describe()

Unnamed: 0,b,d,e
count,4.0,4.0,4.0
mean,-0.219162,0.524855,-0.196879
std,1.187915,0.94619,1.533307
min,-1.592478,-0.159241,-1.568295
25%,-0.830612,-0.032456,-1.329675
50%,-0.265575,0.174435,-0.492081
75%,0.345876,0.731746,0.640715
max,1.246981,1.90979,1.764939


In [80]:
conda install pandas-datareader


Collecting package metadata (current_repodata.json): ...working... done
Note: you may need to restart the kernel to use updated packages.
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\tedbr\anaconda3

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.9.1                |   py38haa95532_0         2.9 MB
    pandas-datareader-0.9.0    |             py_0          72 KB
    ------------------------------------------------------------
                                           Total:         2.9 MB

The following NEW packages will be INSTALLED:

  pandas-datareader  pkgs/main/noarch::pandas-datareader-0.9.0-py_0


The following packages will be UPDATED:

  conda                                        4.8.3-py38_0 --> 4.9.1-py38haa95532_0



Downloading and Extracting Packages

conda-4.9.1 

In [82]:
import pandas_datareader.data as web
# Build a dict mapping each ticker symbol to whatever get_data_yahoo returns
# for it. No start/end is passed, so the date window is the library's default
# at run time; the outputs recorded in this notebook span 2015-11-12 to
# 2020-11-09.
# NOTE(review): presumably a live download, so results change between runs —
# consider passing explicit start/end dates.
all_data = {ticker: web.get_data_yahoo(ticker)
           for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [83]:
# One column per ticker, taken from each table's 'Adj Close' column.
price = pd.DataFrame(
    {symbol: history['Adj Close'] for symbol, history in all_data.items()}
)

In [84]:
# One column per ticker, taken from each table's 'Volume' column.
volume = pd.DataFrame(
    {symbol: history['Volume'] for symbol, history in all_data.items()}
)

In [85]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-03,0.015354,0.011071,0.020264,0.014871
2020-11-04,0.040837,-0.019797,0.048249,0.059944
2020-11-05,0.035494,0.025648,0.031887,0.008141
2020-11-06,-0.001136,-0.006361,0.001926,-0.000919
2020-11-09,-0.019968,0.027756,-0.023824,0.00071


In [86]:
returns.head()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-12,,,,
2015-11-13,-0.029209,-0.009697,-0.009002,-0.01946
2015-11-16,0.016379,0.014877,0.0176,0.016681
2015-11-17,-0.004291,0.000823,-0.008238,-0.005021
2015-11-18,0.031665,0.014945,0.016613,0.020267


<h2>Chapter 7</h2>

<h3>Transforming data</h3>

In [93]:
# Food names appear with mixed capitalisation (e.g. 'bacon' and 'Bacon').
data = pd.DataFrame(dict(
    food=[
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
))

In [94]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [95]:
# Lookup from lower-case food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [96]:
lowercased = data['food'].str.lower()

In [97]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [98]:
data['animal'] = lowercased.map(meat_to_animal)

In [99]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [2]:

import numpy as np
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [4]:
import pandas as pd
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [6]:
# Split on commas, then trim the whitespace around each piece.
val = 'a,b,   cde'
list(map(str.strip, val.split(',')))

['a', 'b', 'cde']

In [8]:
import re
# Raw string so the backslash reaches the regex engine as \s rather than being
# read as an (invalid) Python string escape, which triggers a warning.
regex = re.compile(r'\s+')
# Remove every run of whitespace from val.
regex.sub('', val)

'a,b,cde'