# DataFrame

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [7]:
# head() previews a large DataFrame by returning only its first rows
# (five is the default, spelled out here for clarity).
frame.head(5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [8]:
# `columns` controls the order in which the dict's keys appear as columns.
pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop'],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [9]:
# 'debt' is not a key of `data`, so that column is created and filled with NaN;
# `index` supplies the row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
# Plain-text dump of the frame followed by its column Index.
print(frame2, frame2.columns, sep='\n')

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [10]:
# A single column is returned as a Series. Dict-style lookup is used here;
# attribute access (frame2.state) gives the same result for this column.
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [11]:
# Select the row labelled 'three' (all columns) with .loc; the row comes back as a Series.
frame2.loc['three', :]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [13]:
# Assigning a scalar to a column sets that value on every row.
# 'dept' is not an existing column, so this adds a new column next to 'debt'.
# NOTE(review): 'dept' may have been intended as 'debt' -- confirm before changing,
# since the cells below display both columns.
frame2['dept'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,16.5
two,2001,Ohio,1.7,,16.5
three,2002,Ohio,3.6,,16.5
four,2001,Nevada,2.4,,16.5
five,2002,Nevada,2.9,,16.5
six,2003,Nevada,3.2,,16.5


In [15]:
# Overwrite 'dept' with an array holding one value per row (0 .. n_rows - 1).
frame2['dept'] = np.arange(len(frame2))
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,0
two,2001,Ohio,1.7,,1
three,2002,Ohio,3.6,,2
four,2001,Nevada,2.4,,3
five,2002,Nevada,2.9,,4
six,2003,Nevada,3.2,,5


In [16]:
# Assigning a Series aligns it on the row labels: rows whose label is
# missing from `val` ('one', 'three', 'six') end up as NaN in 'debt'.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,0
two,2001,Ohio,1.7,-1.2,1
three,2002,Ohio,3.6,,2
four,2001,Nevada,2.4,-1.5,3
five,2002,Nevada,2.9,-1.7,4
six,2003,Nevada,3.2,,5


In [18]:
# Add a boolean column via assignment, show the frame, then remove the
# column again with `del` and confirm it is gone from the column Index.
is_ohio = frame2['state'] == 'Ohio'
frame2['eastern'] = is_ohio
print(frame2)
del frame2['eastern']
frame2.columns

       year   state  pop  debt  dept  eastern
one    2000    Ohio  1.5   NaN     0     True
two    2001    Ohio  1.7  -1.2     1     True
three  2002    Ohio  3.6   NaN     2     True
four   2001  Nevada  2.4  -1.5     3    False
five   2002  Nevada  2.9  -1.7     4    False
six    2003  Nevada  3.2   NaN     5    False


Index(['year', 'state', 'pop', 'debt', 'dept'], dtype='object')

In [19]:
# Nested dict of dicts: outer keys become the columns, inner keys become
# the row index; combinations absent from the inner dicts are filled with NaN.
pop = {
    'nevada': {2001: 2.4, 2002: 5.2},
    'ohio': {2000: 2.3, 2001: 8.3, 2003: 3.5},
}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,nevada,ohio
2001,2.4,8.3
2002,5.2,
2000,,2.3
2003,,3.5


In [21]:
# .T swaps rows and columns.
print(frame3.T)
# With an explicit index, only the listed years are kept as rows.
pd.DataFrame(
    pop,
    index=[2001, 2002, 2003],
)

        2001  2002  2000  2003
nevada   2.4   5.2   NaN   NaN
ohio     8.3   NaN   2.3   3.5


Unnamed: 0,nevada,ohio
2001,2.4,8.3
2002,5.2,
2003,,3.5


In [23]:
# A dict of Series also works as input. The slices are positional:
# every 'ohio' row except the last, and the first two 'nevada' rows.
pdata = {
    'Ohio': frame3['ohio'].iloc[:-1],
    'Nevada': frame3['nevada'].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,2.3,
2001,8.3,2.4
2002,,5.2


In [24]:
# Name both axes; the names are shown when the frame is displayed.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,8.3
2002,5.2,
2000,,2.3
2003,,3.5


In [25]:
# .values exposes the frame's data as a two-dimensional ndarray.
frame3.values

array([[2.4, 8.3],
       [5.2, nan],
       [nan, 2.3],
       [nan, 3.5]])

In [26]:
# frame2 mixes numeric and string columns, so the resulting array has dtype=object.
frame2.values

array([[2000, 'Ohio', 1.5, nan, 0],
       [2001, 'Ohio', 1.7, -1.2, 1],
       [2002, 'Ohio', 3.6, nan, 2],
       [2001, 'Nevada', 2.4, -1.5, 3],
       [2002, 'Nevada', 2.9, -1.7, 4],
       [2003, 'Nevada', 3.2, nan, 5]], dtype=object)

# Index Objects

In [27]:
# The labels of a Series are stored in an Index object, reachable via .index.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [28]:
# Slicing an Index yields another Index holding the selected labels.
index[1:]

Index(['b', 'c'], dtype='object')

In [31]:
#index[1] = 'd'
#raises a TypeError because Index objects are immutable

In [34]:
# An Index can be built directly from an array of integer labels.
labels = pd.Index(
    np.arange(3)
)
labels

Int64Index([0, 1, 2], dtype='int64')

In [35]:
# Reuse the existing Index object as the labels of a new Series.
obj2 = pd.Series(
    data=[1.5, -2.5, 0],
    index=labels,
)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [37]:
# Identity check: does the Series hold the very same Index object that was passed in?
print(labels is obj2.index)
frame3

True


state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,8.3
2002,5.2,
2000,,2.3
2003,,3.5


In [38]:
# An Index supports `in` membership tests. Labels are matched exactly,
# so 'Ohio' is not found among the lowercase column labels.
print(
    frame3.columns,
    'Ohio' in frame3.columns,
    2003 in frame3.index,
    sep='\n',
)

Index(['nevada', 'ohio'], dtype='object', name='state')
False
True


In [40]:
# Unlike a Python set, an Index may contain duplicate labels.
dup_labels = pd.Index(['foo'] * 2 + ['bar'] * 2)
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')