# Introduction to pandas Data Structures

### Series

In [1]:
import pandas as pd
import numpy as np

In [147]:
obj = pd.Series([1,2,3,4])

In [148]:
obj

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
strObj = pd.Series(['abcd'])

In [6]:
strObj

0    abcd
dtype: object

In [7]:
# Four integers labelled 'a'..'d' instead of the default 0..3 positions.
objWithCustomIndex = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))

In [8]:
objWithCustomIndex

a    1
b    2
c    3
d    4
dtype: int64

In [9]:
objWithCustomIndex.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
objWithCustomIndex['a']

1

In [11]:
objWithCustomIndex[objWithCustomIndex>2]

c    3
d    4
dtype: int64

In [12]:
1 in objWithCustomIndex

False

In [13]:
'a' in objWithCustomIndex

True

# Dictionary to pandas Series


In [14]:
dict = {'o':1,'h':2,'i':3,'s':5}

In [15]:
dict['o']

1

In [16]:
pandasSeriesFromADictionary =  pd.Series(dict)

In [17]:
pandasSeriesFromADictionary

o    1
h    2
i    3
s    5
dtype: int64

In [18]:
states = ['Assam','Meghalaya','Nagaland','Manipur']

In [19]:
pandasSeriesFromADictionary = pd.Series(dict, index=states)

In [20]:
pandasSeriesFromADictionary

Assam       NaN
Meghalaya   NaN
Nagaland    NaN
Manipur     NaN
dtype: float64

In [21]:
states = ['h','o','i','s']

In [22]:
pandasSeriesFromADictionary = pd.Series(dict, index=states)

In [23]:
pandasSeriesFromADictionary

h    2
o    1
i    3
s    5
dtype: int64

In [24]:
states = ['h','o','s','i']

In [25]:
pandasSeriesFromADictionary = pd.Series(dict, index=states)

In [26]:
pandasSeriesFromADictionary

h    2
o    1
s    5
i    3
dtype: int64

In [27]:
# No parentheses: this evaluates to the bound method object, not a null mask.
pandasSeriesFromADictionary.isnull

<bound method Series.isnull of h    2
o    1
s    5
i    3
dtype: int64>

In [28]:
pandasSeriesFromADictionary.isnull()

h    False
o    False
s    False
i    False
dtype: bool

In [29]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [30]:
obj = pd.Series(sdata)

In [31]:
obj

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
states1 = ['California', 'Ohio', 'Oregon', 'Texas']

In [33]:
obj = pd.Series(sdata, index=states1)

In [34]:
obj

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [35]:
obj.index = ['C','O','R','T']

In [36]:
obj

C        NaN
O    35000.0
R    16000.0
T    71000.0
dtype: float64

# DataFrame
A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row and column index; it can be thought of
as a dict of Series all sharing the same index. Under the hood, the data is stored as one
or more two-dimensional blocks rather than a list, dict, or some other collection of
one-dimensional arrays.

In [37]:
# Column-oriented input: each key becomes a DataFrame column and the three
# lists line up by position (one entry per state).
data = {
    'state': [
        'ArunachalPradesh', 'Assam', 'Meghalaya', 'Manipur',
        'Nagaland', 'Mizoram', 'Tripura',
    ],
    'capital': [
        'Itanagar', 'Guwahati', 'Shillong', 'Imphal',
        'Kohima', 'Aizawl', 'Agartala',
    ],
    'population': [15, 89, 66, 23, 24, 36, 42],
}

In [38]:
stateDataframe = pd.DataFrame(data)

In [39]:
stateDataframe

Unnamed: 0,state,capital,population
0,ArunachalPradesh,Itanagar,15
1,Assam,Guwahati,89
2,Meghalaya,Shillong,66
3,Manipur,Imphal,23
4,Nagaland,Kohima,24
5,Mizoram,Aizawl,36
6,Tripura,Agartala,42


In [40]:
stateDataframe = pd.DataFrame(data, columns=['capital','state','population'])

In [41]:
stateDataframe

Unnamed: 0,capital,state,population
0,Itanagar,ArunachalPradesh,15
1,Guwahati,Assam,89
2,Shillong,Meghalaya,66
3,Imphal,Manipur,23
4,Kohima,Nagaland,24
5,Aizawl,Mizoram,36
6,Agartala,Tripura,42


In [42]:
stateDataframe['capital']

0    Itanagar
1    Guwahati
2    Shillong
3      Imphal
4      Kohima
5      Aizawl
6    Agartala
Name: capital, dtype: object

In [43]:
# Overwrites the existing population column with 8..14 (one value per row).
stateDataframe['population'] = np.arange(8,15)

In [44]:
stateDataframe

Unnamed: 0,capital,state,population
0,Itanagar,ArunachalPradesh,8
1,Guwahati,Assam,9
2,Shillong,Meghalaya,10
3,Imphal,Manipur,11
4,Kohima,Nagaland,12
5,Aizawl,Mizoram,13
6,Agartala,Tripura,14



### Creating a new Column from existing dataframe

In [45]:
stateDataframe['hugePopulation'] = stateDataframe['population']>10

In [46]:
stateDataframe

Unnamed: 0,capital,state,population,hugePopulation
0,Itanagar,ArunachalPradesh,8,False
1,Guwahati,Assam,9,False
2,Shillong,Meghalaya,10,False
3,Imphal,Manipur,11,True
4,Kohima,Nagaland,12,True
5,Aizawl,Mizoram,13,True
6,Agartala,Tripura,14,True


In [47]:
del stateDataframe['hugePopulation']

In [48]:
stateDataframe

Unnamed: 0,capital,state,population
0,Itanagar,ArunachalPradesh,8
1,Guwahati,Assam,9
2,Shillong,Meghalaya,10
3,Imphal,Manipur,11
4,Kohima,Nagaland,12
5,Aizawl,Mizoram,13
6,Agartala,Tripura,14


In [49]:
# Nested dict: outer keys become columns, inner keys (years) become row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [50]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [51]:
newDf = pd.DataFrame(pop)

In [52]:
newDf

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [53]:
newDf.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [54]:
# Positional slices of two columns; both keep the rows for 2000 and 2001.
pdata = {
    'Ohio': newDf['Ohio'].iloc[:-1],     # everything except the last row
    'Nevada': newDf['Nevada'].iloc[:2],  # the first two rows
}

In [55]:
pdata

{'Ohio': 2000    1.5
 2001    1.7
 Name: Ohio, dtype: float64, 'Nevada': 2000    NaN
 2001    2.4
 Name: Nevada, dtype: float64}

In [56]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [57]:
newDf.index.name=['year']

In [58]:
newDf.columns.name=['states']

In [59]:
newDf

[states],Nevada,Ohio
[year],Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [60]:
newDf.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [61]:
obbj = pd.Series(range(3), index=['a','b','c'])

In [62]:
obbj

a    0
b    1
c    2
dtype: int64

In [63]:
pd.DataFrame(obbj)

Unnamed: 0,0
a,0
b,1
c,2


In [64]:
indexes = pd.Index(range(3))

In [65]:
indexes

RangeIndex(start=0, stop=3, step=1)

In [66]:
obj3 = pd.Series(['b','r','caterpillar'], index=[0,2,4])

In [67]:
obj3.reindex(range(6), method='ffill')

0              b
1              b
2              r
3              r
4    caterpillar
5    caterpillar
dtype: object

## Index Objects
pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index.

In [68]:
obj = pd.Series(range(3), index=['a','b','c'])

In [69]:
obj

a    0
b    1
c    2
dtype: int64

In [70]:
inx = obj.index

In [71]:
inx

Index(['a', 'b', 'c'], dtype='object')

In [72]:
labels = pd.Index(range(2))

In [73]:
labels

RangeIndex(start=0, stop=2, step=1)

In [74]:
labels = pd.Index(np.arange(3))

In [75]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [76]:
np.arange(3)

array([0, 1, 2])

In [77]:
range(2)

range(0, 2)

In [78]:
pdata

{'Ohio': 2000    1.5
 2001    1.7
 Name: Ohio, dtype: float64, 'Nevada': 2000    NaN
 2001    2.4
 Name: Nevada, dtype: float64}

# Essential Functionality

## ReIndexing

An important method on pandas objects is reindex, which creates a new
object with the data conformed to a new index.

In [79]:
obj = pd.Series(range(4), index=['a','b','c','d'])

In [80]:
obj

a    0
b    1
c    2
d    3
dtype: int64

In [81]:
obj2 = obj.reindex(['b','c','d','a'])

In [82]:
obj2

b    1
c    2
d    3
a    0
dtype: int64

In [83]:
# 3x3 grid of 0..8 with lowercase row labels and uppercase column labels.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=list('ABC'),
)

In [84]:
frame

Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5
c,6,7,8


In [85]:
frame1 = pd.DataFrame(np.arange(6).reshape((3,2)), index=['a','b','c'], columns=['A','B'])

In [86]:
frame1

Unnamed: 0,A,B
a,0,1
b,2,3
c,4,5


In [87]:
frame2 = pd.DataFrame(np.arange(6).reshape((2,3)), index=['a','b'], columns=['A','B','C'])

In [88]:
frame2


Unnamed: 0,A,B,C
a,0,1,2
b,3,4,5


In [89]:
frame.reindex(['b','c','a'])

Unnamed: 0,A,B,C
b,3,4,5
c,6,7,8
a,0,1,2


# Dropping Entries from an Axis
Dropping one or more entries from an axis is easy if you already have an index array
or list without those entries. As that can require a bit of munging and set logic, the
drop method will return a new object with the indicated value or values deleted from
an axis:

### For Series

In [90]:
obj.drop('a')

b    1
c    2
d    3
dtype: int64

In [91]:
obj

a    0
b    1
c    2
d    3
dtype: int64

### For Dataframes

In [92]:
# 4x4 grid of 0..15 used below to demonstrate dropping rows and columns.
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'],
)

In [93]:
df

Unnamed: 0,one,two,three,four
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [94]:
df.drop('a', axis='index')

Unnamed: 0,one,two,three,four
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [95]:
df.drop(['a','b'], axis='index')

Unnamed: 0,one,two,three,four
c,8,9,10,11
d,12,13,14,15


Row Delete

In [96]:
df.drop(['a','b'])

Unnamed: 0,one,two,three,four
c,8,9,10,11
d,12,13,14,15


Column Delete

In [97]:
df.drop('two', axis=1)

Unnamed: 0,one,three,four
a,0,2,3
b,4,6,7
c,8,10,11
d,12,14,15


In [98]:
df.drop('one', axis='columns')

Unnamed: 0,two,three,four
a,1,2,3
b,5,6,7
c,9,10,11
d,13,14,15


To change the data permanently (modify the object in place), pass inplace=True:

In [99]:
df.drop('one',axis='columns', inplace=True)

In [100]:
df

Unnamed: 0,two,three,four
a,1,2,3
b,5,6,7
c,9,10,11
d,13,14,15


# Indexing, Selection, and Filtering

For Series

In [101]:
obj = pd.Series(np.arange(6), index=['a','b','c','d','e','f'])

In [102]:
obj

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64

In [103]:
obj[2:4]

c    2
d    3
dtype: int64

In [104]:
obj['a']

0

In [105]:
# Explicit position-based lookup. A bare obj[5] on this label-indexed Series
# relies on the deprecated integer-as-position fallback of Series.__getitem__.
obj.iloc[5]

5

In [106]:
obj[obj>2]

d    3
e    4
f    5
dtype: int64

For Dataframe

In [107]:
# 4x4 grid of 0..15 with lowercase row labels and uppercase column labels.
dframe = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=list('abcd'),
    columns=list('ABCD'),
)

In [108]:
dframe

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [109]:
# 'a' is a row label, not a column, so [] lookup raises KeyError. Catch it and
# show the message so the notebook still runs from top to bottom.
try:
    dframe['a']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'a'

In [110]:
dframe['A']

a     0
b     4
c     8
d    12
Name: A, dtype: int64

In [111]:
dframe[['A','B']]

Unnamed: 0,A,B
a,0,1
b,4,5
c,8,9
d,12,13


In [112]:
dframe[:2]

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7


In [113]:
dframe[:1]

Unnamed: 0,A,B,C,D
a,0,1,2,3


In [114]:
dframe[dframe['C']>3]

Unnamed: 0,A,B,C,D
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [115]:
dframe<6

Unnamed: 0,A,B,C,D
a,True,True,True,True
b,True,True,False,False
c,False,False,False,False
d,False,False,False,False


### Selection with loc and iloc

For DataFrame label-indexing on the rows, I introduce the special indexing operators
loc and iloc. They enable you to select a subset of the rows and columns from a
DataFrame with NumPy-like notation using either axis labels (loc) or integers
(iloc).

- loc: selects by axis labels (row/column names)
- iloc: selects by integer position

In [116]:
dframe

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [117]:
dframe.loc['a']

A    0
B    1
C    2
D    3
Name: a, dtype: int64

In [118]:
dframe.loc['a',['A','B']]

A    0
B    1
Name: a, dtype: int64

In [119]:
dframe.iloc[1]

A    4
B    5
C    6
D    7
Name: b, dtype: int64

In [120]:
dframe.iloc[0,[0,1]]

A    0
B    1
Name: a, dtype: int64

In [121]:
dframe.iloc[[0,1],[1,2]]

Unnamed: 0,B,C
a,1,2
b,5,6


### Arithmetic and Data Alignment
An important pandas feature for some applications is the behavior of arithmetic
between objects with different indexes. When you are adding together objects, if any
index pairs are not the same, the respective index in the result will be the union of the
index pairs. For users with database experience, this is similar to an automatic outer
join on the index labels.

In [122]:
# Five values labelled 'a'..'e'; left operand of the alignment example.
a = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))

In [123]:
a

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [124]:
# Has no 'b' label, so adding it to a Series that does yields NaN at 'b'.
b = pd.Series(data=[6, 7, 8, 9], index=list('acde'))

In [125]:
b

a    6
c    7
d    8
e    9
dtype: int64

In [126]:
a + b

a     7.0
b     NaN
c    10.0
d    12.0
e    14.0
dtype: float64

In [127]:
# Dataframe


In [128]:
# 3x3 frame with columns a, b, c and rows G, A, N.
c = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('GAN'),
    columns=['a', 'b', 'c'],
)

In [129]:
c

Unnamed: 0,a,b,c
G,0,1,2
A,3,4,5
N,6,7,8


In [130]:
# 4x3 frame with columns a, c, d and rows G, A, N, S; only partly overlaps
# the labels of the 3x3 frame it is added to.
d = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=list('GANS'),
    columns=['a', 'c', 'd'],
)

In [131]:
d

Unnamed: 0,a,c,d
G,0,1,2
A,3,4,5
N,6,7,8
S,9,10,11


In [132]:
c + d

Unnamed: 0,a,b,c,d
A,6.0,,9.0,
G,0.0,,3.0,
N,12.0,,15.0,
S,,,,


### Arithmetic methods with fill values

In [133]:
e = c+d

In [134]:
e

Unnamed: 0,a,b,c,d
A,6.0,,9.0,
G,0.0,,3.0,
N,12.0,,15.0,
S,,,,


In [135]:
e = c.add(d, fill_value=0)

In [136]:
e

Unnamed: 0,a,b,c,d
A,6.0,4.0,9.0,5.0
G,0.0,1.0,3.0,2.0
N,12.0,7.0,15.0,8.0
S,9.0,,10.0,11.0


In [137]:
e

Unnamed: 0,a,b,c,d
A,6.0,4.0,9.0,5.0
G,0.0,1.0,3.0,2.0
N,12.0,7.0,15.0,8.0
S,9.0,,10.0,11.0


In [138]:
e.fillna(0)

Unnamed: 0,a,b,c,d
A,6.0,4.0,9.0,5.0
G,0.0,1.0,3.0,2.0
N,12.0,7.0,15.0,8.0
S,9.0,0.0,10.0,11.0


### Operations between DataFrame and Series

In [139]:
# 3x3 grid of 0..8 with lowercase column labels and uppercase row labels.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['A', 'B', 'C'],
    columns=['a', 'b', 'c'],
)

In [140]:
frame

Unnamed: 0,a,b,c
A,0,1,2
B,3,4,5
C,6,7,8


In [141]:
series = frame.iloc[1]

In [142]:
series

a    3
b    4
c    5
Name: B, dtype: int64

In [143]:
frame - series

Unnamed: 0,a,b,c
A,-3,-3,-3
B,0,0,0
C,3,3,3


In [144]:
series2 = pd.Series(range(3), index=list('abd'))

In [145]:
series2

a    0
b    1
d    2
dtype: int64

In [146]:
frame + series2

Unnamed: 0,a,b,c,d
A,0.0,2.0,,
B,3.0,5.0,,
C,6.0,8.0,,


### Function Application and Mapping


In [149]:
# 4x3 frame of standard-normal draws.
# NOTE(review): no random seed is set in the visible cells, so these values
# presumably differ on every run — confirm, and seed if reproducibility matters.
newFrame  = pd.DataFrame(np.random.randn(4,3), columns=list('bde'), index=list('ABCD'))

In [150]:
newFrame

Unnamed: 0,b,d,e
A,-0.428864,-0.084105,0.64271
B,-0.743276,3.041786,-0.009638
C,-0.716388,-0.374737,0.700888
D,-0.673786,-2.694687,1.239315


In [151]:
np.abs(newFrame)

Unnamed: 0,b,d,e
A,0.428864,0.084105,0.64271
B,0.743276,3.041786,0.009638
C,0.716388,0.374737,0.700888
D,0.673786,2.694687,1.239315


In [152]:
# Spread of x: its largest value minus its smallest value.
f = lambda x: x.max()-x.min()

In [153]:
f

<function __main__.<lambda>(x)>

In [154]:
newFrame.apply(f)

b    0.314412
d    5.736473
e    1.248953
dtype: float64

In [155]:
newFrame.apply(f, axis='columns')

A    1.071574
B    3.785063
C    1.417276
D    3.934001
dtype: float64

## Sorting and Ranking


In [156]:
sortObj = pd.Series(np.random.randn(5), index=list('bcdae'))

In [157]:
sortObj

b    0.171322
c    0.885570
d   -2.914554
a    0.371252
e    2.011437
dtype: float64

In [158]:
sortObj.sort_index()

a    0.371252
b    0.171322
c    0.885570
d   -2.914554
e    2.011437
dtype: float64

In [159]:
newFrame = pd.DataFrame(np.random.randn(4,3), columns=list('cab'), index=list('CDAB'))

In [160]:
newFrame

Unnamed: 0,c,a,b
C,0.149893,0.515863,0.294269
D,-0.008962,0.956216,-0.295146
A,2.020299,1.06444,0.659794
B,1.152522,0.078574,-1.131657


In [161]:
newFrame.sort_index(axis='columns')

Unnamed: 0,a,b,c
C,0.515863,0.294269,0.149893
D,0.956216,-0.295146,-0.008962
A,1.06444,0.659794,2.020299
B,0.078574,-1.131657,1.152522


In [162]:
newFrame.sort_index(axis='index')

Unnamed: 0,c,a,b
A,2.020299,1.06444,0.659794
B,1.152522,0.078574,-1.131657
C,0.149893,0.515863,0.294269
D,-0.008962,0.956216,-0.295146


In [166]:
newFrame.sort_values(by=['c'])

Unnamed: 0,c,a,b
D,-0.008962,0.956216,-0.295146
C,0.149893,0.515863,0.294269
B,1.152522,0.078574,-1.131657
A,2.020299,1.06444,0.659794


In [170]:
newFrame.sort_values(by=['c','b'])

Unnamed: 0,c,a,b
D,-0.008962,0.956216,-0.295146
C,0.149893,0.515863,0.294269
B,1.152522,0.078574,-1.131657
A,2.020299,1.06444,0.659794


In [167]:
newSeries = pd.Series(np.random.randn(6))

In [168]:
newSeries

0    1.435138
1    0.937906
2    0.132099
3    1.520222
4    0.383394
5   -0.433538
dtype: float64

In [169]:
newSeries.sort_values()

5   -0.433538
2    0.132099
4    0.383394
1    0.937906
0    1.435138
3    1.520222
dtype: float64

## Axis Indexes with Duplicate Labels


In [171]:
checkDup = pd.Series(np.random.randn(5), index=list('aabbc'))

In [172]:
checkDup

a   -0.071662
a   -1.075658
b    0.951090
b   -0.990055
c    0.031537
dtype: float64

In [173]:
checkDup.isnull

<bound method Series.isnull of a   -0.071662
a   -1.075658
b    0.951090
b   -0.990055
c    0.031537
dtype: float64>

In [175]:
# Tests whether the Series *values* are unique, not the index labels.
checkDup.is_unique

True

In [176]:
checkDup.index.is_unique

False

In [177]:
checkDupDf = pd.DataFrame(np.random.randn(5,4), columns=list('aabb'), index=list('aabcc'))

In [178]:
checkDupDf

Unnamed: 0,a,a.1,b,b.1
a,2.759838,0.176637,-0.449244,0.362727
a,-0.522527,0.471869,0.245046,1.93326
b,0.38932,0.040837,-0.242067,-0.485126
c,-0.738616,-0.504429,0.294565,-0.693783
c,0.25534,0.523975,-0.676725,0.241563


In [179]:
checkDupDf.loc['a']

Unnamed: 0,a,a.1,b,b.1
a,2.759838,0.176637,-0.449244,0.362727
a,-0.522527,0.471869,0.245046,1.93326


In [180]:
checkDupDf.loc[['a','b']]

Unnamed: 0,a,a.1,b,b.1
a,2.759838,0.176637,-0.449244,0.362727
a,-0.522527,0.471869,0.245046,1.93326
b,0.38932,0.040837,-0.242067,-0.485126


In [181]:
checkDupDf.loc[['a','c'],['a']]

Unnamed: 0,a,a.1
a,2.759838,0.176637
a,-0.522527,0.471869
c,-0.738616,-0.504429
c,0.25534,0.523975
