In [70]:
import pandas as pd
import numpy as np

In [3]:
obj = pd.Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj.values

array([ 4,  7, -5,  3])

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [9]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
obj2['d']

3

In [11]:
obj2[['a', 'c']]

a    4
c   -5
dtype: int64

In [12]:
mask1 = obj2 > 0
mask1

a     True
b     True
c    False
d     True
dtype: bool

In [13]:
data_mask1 = obj2[mask1]
data_mask1

a    4
b    7
d    3
dtype: int64

# Construct Series based on dict

In [14]:
sdata={'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [15]:
obj3 = pd.Series(sdata)

In [16]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [18]:
obj4 = pd.Series(sdata, index=states)

In [19]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

# Check if there are some null values

In [20]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [22]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [23]:
obj4.name = 'population'

In [24]:
obj4.index.name = 'state'

In [25]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [26]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# DataFrame

In [33]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
       'year': [2000, 2001, 2002, 2001, 2002, 2003],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [34]:
frame = pd.DataFrame(data)

In [35]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [36]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [37]:
frame2 = pd.DataFrame(data, columns=['pop', 'state', 'year'])

In [38]:
frame2.head()

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


# Retrieving Column(s)

In [39]:
frame2['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [42]:
frame2[['pop', 'state']]

Unnamed: 0,pop,state
0,1.5,Ohio
1,1.7,Ohio
2,3.6,Ohio
3,2.4,Nevada
4,2.9,Nevada
5,3.2,Nevada


# Retrieveing row(s)

In [43]:
frame3 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                     index=['one', 'two', 'three', 'four', 'five', 'six'])

In [44]:
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


Rows can also be retrieved by position or name with the special loc attribute

In [45]:
frame3.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [46]:
frame3['debt'] = 16.5

In [47]:
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


Assigning a column that doesn’t exist will create a new column. The del keyword will delete columns as with a dict.

In [57]:
mask2 = frame3['state'] == 'Ohio'

In [58]:
frame3['eastern'] = mask2

In [59]:
frame3

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,16.5,True
two,2001,Ohio,1.7,16.5,True
three,2002,Ohio,3.6,16.5,True
four,2001,Nevada,2.4,16.5,False
five,2002,Nevada,2.9,16.5,False
six,2003,Nevada,3.2,16.5,False


In [60]:
frame3['eastern'] = frame3[mask2]

In [61]:
frame3

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,16.5,2000.0
two,2001,Ohio,1.7,16.5,2001.0
three,2002,Ohio,3.6,16.5,2002.0
four,2001,Nevada,2.4,16.5,
five,2002,Nevada,2.9,16.5,
six,2003,Nevada,3.2,16.5,


In [62]:
del frame3['eastern']

In [64]:
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


# Reindexing

In [65]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [66]:
obj = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [67]:
obj

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or fill‐ ing of values when reindexing. The method option allows us to do this, using a method such as ffill, which forward-fills the values:

In [68]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [69]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

With DataFrame, reindex can alter either the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result:

In [71]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index=['a', 'c', 'd'],
                    columns=['Ohio', 'Texas', 'California'])

In [72]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


# Reindexing rows

In [73]:
frame5 = frame.reindex(['a', 'b', 'c', 'd'])
frame5

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


# Reindexing columns

The columns can be reindexed with the columns keyword

In [74]:
states = ['Texas', 'Utah', 'California']

In [75]:
frame5.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [79]:
#reidexing both rows and columns
frame.loc[['a', 'b','c', 'd'], states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


# Drop

In [80]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [81]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [82]:
new_obj = obj.drop('c')

In [83]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [84]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [85]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index=['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [86]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [90]:
# Dropping two rows at the same time
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


You can drop values from the columns by passing axis=1 or axis='columns':

In [91]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [92]:
# dropping two columns at the same time
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [93]:
obj.drop('c', inplace=True)

In [94]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

# Indexing, Selection, and Filtering

In [95]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [96]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [97]:
obj['b']

1.0

In [98]:
obj[['a', 'd']]

a    0.0
d    3.0
dtype: float64

In [99]:
obj[obj > 2]

d    3.0
dtype: float64

In [100]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index=['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [101]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [102]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [103]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [104]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


The row selection syntax data[:2] is provided as a convenience. Passing a single ele‐ ment or a list to the [] operator selects columns.

In [105]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection with loc and iloc

For DataFrame label-indexing on the rows.

In [108]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [109]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [110]:
data.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [111]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [119]:
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [120]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [121]:
# df[val]                Select single column or sequence of columns from the DataFrame
# df.loc[val]            Selects single row or subset of rows from the DataFrame by label
# df.loc[:, val]         Selects single column or subset of columns by label
# df.loc[val1, val2]     Select both rows and columns by label

# Function Application and Mapping

In [124]:
frame = pd.DataFrame(np.random.randn(4, 3),
                    columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [125]:
frame

Unnamed: 0,b,d,e
Utah,-1.639991,1.233311,-0.453592
Ohio,-0.874021,-0.824109,-0.531015
Texas,0.219342,-1.090053,-0.769116
Oregon,1.451535,-1.402175,-2.709864


In [126]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.639991,1.233311,0.453592
Ohio,0.874021,0.824109,0.531015
Texas,0.219342,1.090053,0.769116
Oregon,1.451535,1.402175,2.709864


In [127]:
f = lambda x: x.max() - x.min()

# Apply on rows 

In [129]:
frame.apply(f)

b    3.091525
d    2.635485
e    2.256272
dtype: float64

Here the function f, which computes the difference between the maximum and mini‐ mum of a Series, is invoked once on each column in frame. The result is a Series hav‐ ing the columns of frame as its index.

# Apply on columns 

In [132]:
frame.apply(f, axis='columns')

Utah      2.873301
Ohio      0.343006
Texas     1.309396
Oregon    4.161398
dtype: float64

In [133]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [134]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.639991,-1.402175,-2.709864
max,1.451535,1.233311,-0.453592


In [135]:
frame.apply(f, axis='columns')

Unnamed: 0,min,max
Utah,-1.639991,1.233311
Ohio,-0.874021,-0.531015
Texas,-1.090053,0.219342
Oregon,-2.709864,1.451535


# ApplyMap

Suppose you wanted to compute a formatted string from each floating-point value in frame

In [138]:
format = lambda x: '%.2f' % x

In [139]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-1.64,1.23,-0.45
Ohio,-0.87,-0.82,-0.53
Texas,0.22,-1.09,-0.77
Oregon,1.45,-1.4,-2.71


# Sorting and Ranking

In [140]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [141]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [142]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])

In [143]:
# Sorting index based on Rows
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [145]:
# Sorting index based on Columns
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


The data is sorted in ascending order by default, but can be sorted in descending order, too

In [147]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [150]:
frame.sort_values(by=['a', 'c'])

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


# Ranking

Ranking assigns ranks from one through the number of valid data points in an array.

In [153]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

by default rank breaks ties by assigning each group the mean rank

In [154]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [155]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

# Summarizing and Computing Descriptive Statistics

In [156]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [157]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [158]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [161]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [162]:
df.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:

In [165]:
df.idxmax

<bound method DataFrame.idxmax of     one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3>

In [166]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


# Correlation and Covariance

In [167]:
conda install pandas-datareader

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/eric/opt/anaconda3/envs/upp

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.1         |           py36_0         155 KB
    lxml-4.5.0                 |   py36hef8c89e_0         1.2 MB
    pandas-datareader-0.8.1    |             py_0          71 KB
    ------------------------------------------------------------
                                           Total:         1.5 MB

The following NEW packages will be INSTALLED:

  libxml2            pkgs/main/osx-64::libxml2-2.9.9-hf6e021a_1
  libxslt            pkgs/main/osx-64::libxslt-1.1.33-h33a18ac_0
  lxml               pkgs/main/osx-64::lxml-4.5.0-py36hef8c89e_0
  pandas-datareader  pkgs/main/noarch::pandas-datareader-0.8.1-py_0

The 

In [168]:
import pandas_datareader.data as web

In [170]:
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [171]:
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})

In [172]:
volume = pd.DataFrame({ticker: data['Volume']
                     for ticker, data in all_data.items()})

In [173]:
returns = price.pct_change()

In [174]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-01,-0.052617,-0.052195,-0.035508,-0.049183
2020-04-02,0.016687,0.046224,0.020709,0.013766
2020-04-03,-0.014371,-0.033273,-0.00921,-0.020485
2020-04-06,0.087237,0.079744,0.074368,0.081102
2020-04-07,-0.011582,0.001045,-0.01077,-0.000345


In [175]:
returns['MSFT'].corr(returns['IBM'])

0.597494845582567

In [176]:
returns['MSFT'].cov(returns['IBM'])

0.0001597321309294345

In [177]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.535229,0.695334,0.638458
IBM,0.535229,1.0,0.597495,0.52942
MSFT,0.695334,0.597495,1.0,0.744226
GOOG,0.638458,0.52942,0.744226,1.0


In [178]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000323,0.000149,0.000216,0.000193
IBM,0.000149,0.00024,0.00016,0.000138
MSFT,0.000216,0.00016,0.000298,0.000216
GOOG,0.000193,0.000138,0.000216,0.000283


In [179]:
returns.corrwith(volume)

AAPL   -0.141577
IBM    -0.098436
MSFT   -0.037160
GOOG   -0.043206
dtype: float64

# Unique Values, Value Counts, and Membership

In [198]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [200]:
uniques = obj.unique()

In [201]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [202]:
uniques.sort()

In [203]:
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [204]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [205]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [206]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object