# Getting Started with pandas

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
# .array exposes the values held by the Series as a pandas array object.
obj.array

<PandasArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [4]:
# With no index supplied, the Series carries a RangeIndex starting at 0.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Attach explicit string labels, then walk the (label, value) pairs in
# index order.
obj2 = pd.Series([4, 7, -5, 3], index=['c', 'd', 'a', 'b'])
for label, value in obj2.items():
    print(label, value)

c 4
d 7
a -5
b 3


In [6]:
# A single label returns the scalar stored under it.
obj2['c']

4

In [7]:
# A list of labels returns a sub-Series, in the order the labels were requested.
obj2[['d','a']]

d    7
a   -5
dtype: int64

In [8]:
# A boolean mask keeps only the entries for which the condition holds.
obj2[obj2>=4]

c    4
d    7
dtype: int64

In [9]:
# Arithmetic with a scalar applies element-wise and leaves the labels untouched.
obj2 * 2

c     8
d    14
a   -10
b     6
dtype: int64

In [10]:
# NumPy ufuncs operate element-wise on a Series and preserve its index labels.
# (numpy is imported in the notebook's first cell.)
np.exp(obj2)

c      54.598150
d    1096.633158
a       0.006738
b      20.085537
dtype: float64

In [11]:
# The result of a NumPy ufunc on a Series is itself a Series.
kk = np.exp(obj2)
print(type(kk))

<class 'pandas.core.series.Series'>


In [12]:
# `in` on a Series tests membership among the index labels ('d' is a label here).
'd' in obj2

True

In [13]:
# A dict maps naturally onto a Series: keys become the index, values the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [14]:
obj3.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object')

In [15]:
# Passing an explicit index selects and orders the dict entries by label:
# 'California' is not a key of sdata, so its value is NaN, and 'Utah' is
# left out because it is not listed in `states`.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [16]:
# isna() flags the missing entries (only 'California' here).
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [17]:
# notna() is the element-wise opposite of isna().
obj4.notna()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [18]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [19]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [20]:
# Addition aligns the operands on their index labels. A label that is missing
# from one side ('Utah') or holds NaN on one side ('California') yields NaN.
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [21]:
# Both the Series itself and its index can carry a name, shown in the repr.
# NOTE(review): the labels look swapped/mismatched — the values are population
# counts and the index holds states, so something like name='population' and
# index.name='state' was presumably intended. Confirm before changing, since
# the recorded output depends on these strings.
obj4.name = 'City'
obj4.index.name = 'Population'

In [22]:
obj4

Population
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: City, dtype: float64

In [23]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [24]:
# Assigning to .index relabels the existing Series; one label per element.
obj.index = ['Bob','Steve','Jeff','Ryan']

In [25]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame

In [26]:
import pandas as pd

In [27]:
# Column-oriented input: each dict key names a column and maps to an
# equal-length list of values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [28]:
# head(2) returns only the first two rows.
frame.head(2)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7


In [29]:
# `columns` fixes the column order of the result: year, state, pop.
pd.DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [30]:
# 'debt' is not a key of `data`, so that column is created holding only
# missing values; the rows get explicit string labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [31]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [32]:
frame2.dtypes

year       int64
state     object
pop      float64
debt      object
dtype: object

In [33]:
frame2['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64

In [34]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [35]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [36]:
# Attribute-style access returns the same column as frame2['state'].
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [37]:
frame2['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64

In [38]:
# Why is this not the 'pop' column? `pop` is also the name of a DataFrame
# method, so attribute access yields the bound method rather than the column.
# Use frame2['pop'] to get the column.
frame2.pop

<bound method DataFrame.pop of        year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN>

frame2

In [39]:
# print() emits the plain-text repr; the bare last expression is rendered by
# the notebook as a table. Both show the same data.
print(frame2)
frame2

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [40]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [41]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [42]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [43]:
# A Series can be given its name directly at construction time.
kk = pd.Series([1, 2, 3, 4], name='kk1')
kk

0    1
1    2
2    3
3    4
Name: kk1, dtype: int64

In [44]:
# Assigning a scalar to a column fills every row with that value;
# .loc['three'] then pulls out a single row by its label.
frame2['debt'] = 19.69
frame2.loc['three']

year      2002
state     Ohio
pop        3.6
debt     19.69
Name: three, dtype: object

In [45]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,19.69
two,2001,Ohio,1.7,19.69
three,2002,Ohio,3.6,19.69
four,2001,Nevada,2.4,19.69
five,2002,Nevada,2.9,19.69
six,2003,Nevada,3.2,19.69


In [46]:
# Replace the 'debt' column with one value per row. Bracket assignment is used
# instead of attribute assignment (`frame2.debt = ...`): it is the form that
# works for every column name, including new columns and names that clash
# with DataFrame methods such as 'pop'.
frame2['debt'] = np.arange(6.)
frame2


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [47]:
# Assigning a Series to a column aligns it on the row labels: rows without a
# matching label in `val` ('one', 'three', 'six') end up with missing values.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [48]:
# Assigning to a column name that does not exist yet creates a new boolean
# column: True where the state is 'Ohio'.
# NOTE(review): 'easter' is presumably a typo for 'eastern'; the same name is
# used by the `del` statement in a later cell, so rename both together.
frame2['easter'] = frame2.state == 'Ohio'

In [49]:
frame2

Unnamed: 0,year,state,pop,debt,easter
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [50]:
# `del` removes the column from the DataFrame.
del frame2['easter']

In [51]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [53]:
# Nested dict: outer keys are states, inner keys are years. Nevada has no
# entry for 2000.
pop = {
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
    'Nevada': {2001: 2.4, 2002: 2.9},
}
pop

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}

In [54]:
# From a nested dict, the outer keys become the columns and the inner keys
# become the row index.
frame3 = pd.DataFrame(pop)
frame3.index

Int64Index([2000, 2001, 2002], dtype='int64')

In [55]:
# Transposing swaps rows and columns, so the year 2002 can be selected like a
# column; the result holds one value per state.
frame3.T[2002]

Ohio      3.6
Nevada    2.9
Name: 2002, dtype: float64

In [56]:
pop

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}

In [57]:
# Same nested dict, this time with the row index stated explicitly.
frame4 = pd.DataFrame(pop, index=[2000, 2001, 2002])

In [58]:
frame4.dtypes

Ohio      float64
Nevada    float64
dtype: object

In [59]:
frame4.index

Int64Index([2000, 2001, 2002], dtype='int64')

In [60]:
display(frame3)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [61]:
 frame3.index

Int64Index([2000, 2001, 2002], dtype='int64')

In [62]:
# Naming the row index; the name then shows up in the Index repr.
frame3.index.name = 'year'
frame3.index

Int64Index([2000, 2001, 2002], dtype='int64', name='year')

In [63]:
frame3

Unnamed: 0_level_0,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [64]:
# The column axis is an Index as well, so it can be named the same way.
frame3.columns.name = "state"
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [65]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [66]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [67]:
# With columns of different types (numbers and strings), the resulting array
# has dtype=object.
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

## Index Objects

In [68]:
# Keep a handle on the Series' Index object so it can be inspected directly.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [69]:
# Index objects are immutable, so item assignment raises an error:
# index[2] = "d"

In [70]:
# An Index can be built on its own from array-like data.
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [71]:
# Reuse the pre-built Index object as the index of a new Series.
obj2 = pd.Series([2.4,5.6,1.5], index=labels)
obj2

0    2.4
1    5.6
2    1.5
dtype: float64

In [72]:
# The Series holds the very same Index object that was passed in, not a copy.
obj2.index is labels

True

In [73]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [74]:
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [75]:
'Ohio' in frame3.columns

True

In [76]:
2002 in frame3.index

True

In [77]:
2003 in frame3.index

False

In [78]:
# The index labels are integers, so the string '2002' is not found.
'2002' in frame3.index

False

In [79]:
# `+` binds tighter than `in`, so this tests whether 2002 is a label.
2000+2 in frame3.index

True

In [80]:
# Unlike a Python set, an Index may hold duplicate labels.
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## Essential Functionality

In [81]:
# Float-valued Series whose labels are deliberately out of alphabetical order.
obj = pd.Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [84]:
# reindex() returns a new Series arranged by the given labels; 'e' has no
# value in `obj`, so it comes back as NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [85]:
# String values on a gapped integer index (0, 2, 4); the gaps are filled in
# the next cell.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)

0      blue
2    purple
4    yellow
dtype: object


In [86]:
# method='ffill' fills each new label with the last preceding value.
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [87]:
# 3x3 grid holding 0..8, with labelled rows and state-named columns.
grid = np.arange(9).reshape(3, 3)
frame = pd.DataFrame(
    grid,
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

In [88]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [89]:
# Reindexing the columns: 'Utah' does not exist in `frame`, so it appears with
# missing values, and 'Ohio' is dropped because it is not requested.
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [90]:
# .loc selects by label on both axes at once: rows first, then columns.
frame.loc[['a', 'c', 'd'], ['Texas', 'California']]

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8
