## Python Dictionary for storing mixed-type data

In [2]:
# Column-oriented sample data: four equal-length lists, one per value type
# (str, int, float, bool), keyed by the name each list will be looked up by.
data_dict = {
    'letters': ['A', 'B', 'c', 'D', 'eee'],
    'hundreds': [100, 200, 300, 400, 500],
    'tens': [10.0, 20.0, 30.0, 40.0, 50.0],
    'boolean': [True, False, True, True, False],
}

In [3]:
# Look up one of the lists in the dictionary by its key.
data_dict['hundreds']

[100, 200, 300, 400, 500]

In [4]:
# The built-in sum() adds up the elements of the plain Python list.
sum(data_dict['hundreds'])

1500

In [5]:
# With plain lists, `+` concatenates: the result is a single 10-element list,
# not five element-wise sums.
data_dict['hundreds'] + data_dict['tens']

[100, 200, 300, 400, 500, 10.0, 20.0, 30.0, 40.0, 50.0]

## DataFrame is convenient storage for tabular data

In [1]:
import pandas as pd

In [6]:
# Build a DataFrame from the dictionary: each key becomes a column name and
# each list supplies that column's values.
df = pd.DataFrame(data_dict)
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,letters,hundreds,tens,boolean
0,A,100,10.0,True
1,B,200,20.0,False
2,c,300,30.0,True
3,D,400,40.0,True
4,eee,500,50.0,False


In [7]:
# Each column carries its own dtype, so mixed-type data is stored column by column.
df.dtypes

letters      object
hundreds      int64
tens        float64
boolean        bool
dtype: object

In [10]:
# Row labels. No index was supplied when df was built, so the rows are labelled 0..4.
df.index

RangeIndex(start=0, stop=5, step=1)

In [11]:
# Column labels, which match the keys of data_dict.
df.columns

Index(['letters', 'hundreds', 'tens', 'boolean'], dtype='object')

## Accessing (selecting/indexing) a column with `df[]` notation

In [8]:
# A single column label inside [] selects that column.
df['hundreds']

0    100
1    200
2    300
3    400
4    500
Name: hundreds, dtype: int64

### Each column is a Series

In [9]:
# Inspect the type of a single selected column with the built-in type(),
# the idiomatic spelling of reading the __class__ attribute directly.
type(df['hundreds'])

pandas.core.series.Series

In [14]:
# A list of labels inside [] selects several columns, returned in the order listed.
df[['tens','hundreds']]

Unnamed: 0,tens,hundreds
0,10.0,100
1,20.0,200
2,30.0,300
3,40.0,400
4,50.0,500


## Math is easier with a DataFrame

In [12]:
# The column's own .sum() method gives the same total as the built-in sum() on the list.
df['hundreds'].sum()

1500

In [13]:
# `+` on two Series adds element by element (unlike `+` on Python lists, which
# concatenates); the int column combined with the float column gives float64.
df['hundreds'] + df['tens']

0    110.0
1    220.0
2    330.0
3    440.0
4    550.0
dtype: float64

In [16]:
# Sums every column at once: the strings in 'letters' are concatenated and
# each True in 'boolean' counts as 1.
df.sum()

letters     ABcDeee
hundreds       1500
tens            150
boolean           3
dtype: object

## Boolean series as a selector

In [17]:
# Comparing a Series with a scalar yields a boolean Series with the same index.
df['tens'] < 35

0     True
1     True
2     True
3    False
4    False
Name: tens, dtype: bool

In [18]:
# A boolean Series inside [] keeps only the rows where it is True.
df[df['tens'] < 35]

Unnamed: 0,letters,hundreds,tens,boolean
0,A,100,10.0,True
1,B,200,20.0,False
2,c,300,30.0,True


In [19]:
# The 'boolean' column is itself a boolean Series, so it can act as the row selector directly.
df[df['boolean']]

Unnamed: 0,letters,hundreds,tens,boolean
0,A,100,10.0,True
2,c,300,30.0,True
3,D,400,40.0,True


## Series are automatically aligned

In [20]:
# Index labels run from 4 down to 0, so the values are stored in the reverse
# of label order.
series_spelled = pd.Series(
    data=['Five', 'Four', 'Three', 'Two', 'One'],
    index=[4, 3, 2, 1, 0],
)
series_spelled

4     Five
3     Four
2    Three
1      Two
0      One
dtype: object

In [21]:
# Assignment aligns on index labels, not on position: row 0 receives 'One'
# even though 'One' is the last element of series_spelled.
df['spelled_out'] = series_spelled
df

Unnamed: 0,letters,hundreds,tens,boolean,spelled_out
0,A,100,10.0,True,One
1,B,200,20.0,False,Two
2,c,300,30.0,True,Three
3,D,400,40.0,True,Four
4,eee,500,50.0,False,Five


## Index doesn't have to be integers

In [22]:
# Use the 'spelled_out' column as the row labels; the result is stored as df2.
df2 = df.set_index('spelled_out')
df2

Unnamed: 0_level_0,letters,hundreds,tens,boolean
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
One,A,100,10.0,True
Two,B,200,20.0,False
Three,c,300,30.0,True
Four,D,400,40.0,True
Five,eee,500,50.0,False


In [23]:
# Column selection works as before; the resulting Series is labelled by spelled_out.
df2['letters']

spelled_out
One        A
Two        B
Three      c
Four       D
Five     eee
Name: letters, dtype: object

## `.loc[]` label-based two-axis indexing/selecting

In [24]:
# .loc[row_label, column_label] with two single labels returns one value.
df2.loc['One','letters']

'A'

In [25]:
# `:` selects every row; a single column label gives that column as a Series.
df2.loc[:,'hundreds']

spelled_out
One      100
Two      200
Three    300
Four     400
Five     500
Name: hundreds, dtype: int64

In [26]:
# A list of column labels gives a table with those columns in the order listed.
df2.loc[:,['tens','letters']]

Unnamed: 0_level_0,tens,letters
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1
One,10.0,A
Two,20.0,B
Three,30.0,c
Four,40.0,D
Five,50.0,eee


In [27]:
# A single row label returns that row as a Series indexed by the column names.
df2.loc['Three',:]

letters        c
hundreds     300
tens          30
boolean     True
Name: Three, dtype: object

In [28]:
# A list of row labels returns those rows as a table.
df2.loc[['Three','Five'],:]

Unnamed: 0_level_0,letters,hundreds,tens,boolean
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Three,c,300,30.0,True
Five,eee,500,50.0,False


In [29]:
# A boolean Series also works as the row selector in .loc[].
df2.loc[df2['tens']<35,:]

Unnamed: 0_level_0,letters,hundreds,tens,boolean
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
One,A,100,10.0,True
Two,B,200,20.0,False
Three,c,300,30.0,True


## SettingWithCopyWarning

In [37]:
# Select two columns of df2 by label list and keep the result under a new name.
df_nums = df2[['hundreds','tens']]
df_nums

Unnamed: 0_level_0,hundreds,tens
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1
One,100,10.0
Two,200,20.0
Three,300,30.0
Four,400,40.0
Five,500,50.0


In [41]:
# Deliberately triggers SettingWithCopyWarning: df_nums was obtained by indexing
# df2, and a new column is now being assigned to it.
df_nums['sums'] = df_nums['hundreds'] + df_nums['tens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [43]:
# Taking an explicit .copy() of the selection before adding the column
# performs the same assignment without the warning.
df_nums = df2[['hundreds','tens']].copy()
df_nums['sums'] = df_nums['hundreds'] + df_nums['tens']
df_nums

Unnamed: 0_level_0,hundreds,tens,sums
spelled_out,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
One,100,10.0,110.0
Two,200,20.0,220.0
Three,300,30.0,330.0
Four,400,40.0,440.0
Five,500,50.0,550.0
