## Introduction to Pandas 2

In [1]:
import numpy as np
import pandas as pd

### Series as dict

In [5]:
# A Series behaves like an ordered dict: the index labels act as keys.
ser_as_dict = pd.Series({'a': 'value_1', 'b': 'value_2', 'c': 'value_3'})
# Dictionary-style lookup by label.
ser_as_dict['b']

'value_2'

In [6]:
# Membership is tested against the index labels (the "keys"), not the values.
'a' in ser_as_dict.keys()

True

In [8]:
# Assigning to a label that is not yet in the index appends a new entry,
# just like adding a key to a dict.
ser_as_dict['d'] = "value_4"
ser_as_dict

a    value_1
b    value_2
c    value_3
d    value_4
dtype: object

### Series as One-dimensional Arrays

In [10]:
# Slice by explicit index labels; with labels, BOTH endpoints are included.
ser_as_dict.loc['a':'c']

a    value_1
b    value_2
c    value_3
dtype: object

In [22]:
# Slice by implicit integer position (0-based, end exclusive): the first four entries.
ser_as_dict[:4]

a    value_1
b    value_2
c    value_3
d    value_4
dtype: object

In [24]:
# Count the entries equal to 'value_3'. The comparison yields a boolean Series;
# summing it with the vectorized Series.sum() counts the True values without
# iterating element by element in Python. int() keeps the displayed result a
# plain Python integer.
int((ser_as_dict == 'value_3').sum())

1

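Remember: with plain `[]`, pandas uses the explicit index (the labels) when indexing a single item, but the implicit positional index when slicing with integers. To avoid this ambiguity, use `.loc` to always refer to the explicit, hand-input labels and `.iloc` to always refer to the implicit, default (0-based) positions.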

### Data Selection in Data Frames

In [25]:
# Area of each state, keyed by state name.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})

# Population of each state, keyed by the same state names.
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Combine both Series into one frame; rows are aligned on the shared state index.
data = pd.DataFrame({'area': area, 'pop': pop})

data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [26]:
# Dictionary-style column access: returns the 'area' column as a Series.
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [27]:
# Use bracket access here: attribute access (data.pop) would return the
# DataFrame.pop method rather than this column.
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [33]:
# Derive population density (people per unit of area) element-wise and store it
# as a new column on the frame.
data['density'] = data['pop'].div(data['area'])
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


### Selecting a single observation

In [45]:
# Select one row by its explicit label. The row mixes integer and float columns,
# so the resulting Series is upcast to float64.
data.loc['California', :]

area       4.239670e+05
pop        3.833252e+07
density    9.041393e+01
Name: California, dtype: float64

In [50]:
# Select the same row by its implicit integer position (0 = first row).
data.iloc[0, :]

area       4.239670e+05
pop        3.833252e+07
density    9.041393e+01
Name: California, dtype: float64

In [51]:
# View the frame as a plain NumPy array and take its first row. to_numpy() is the
# accessor the pandas documentation recommends over the older .values attribute;
# the resulting array is the same.
data.to_numpy()[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

### Finding states with a density over 100

In [63]:
# Boolean mask on the density column; .loc keeps only the rows where it is True.
# Bracket access is used for the column so it can never be confused with a
# DataFrame attribute or method.
data.loc[data['density'] > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


### Operations between dataframes and series

In [65]:
# Seeded legacy generator so the numbers shown below are reproducible.
rng = np.random.RandomState(42)
# 3 x 4 frame of random integers drawn from [0, 50) -- the upper bound is exclusive.
df_a = pd.DataFrame(rng.randint(0, 50, size=(3, 4)))
df_a

Unnamed: 0,0,1,2,3
0,38,28,14,42
1,7,20,38,18
2,22,10,10,23


#### Row wise operation

In [66]:
# Subtract the first row from every row: the row's labels are matched against
# the column labels, so each column has its own first-row value removed.
df_a.sub(df_a.iloc[0], axis='columns')

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,-31,-8,24,-24
2,-16,-18,-4,-19


#### Column wise operation

In [67]:
# Subtract column 0 from every column: axis='index' matches the Series against
# the row labels instead of the column labels.
df_a.sub(df_a[0], axis='index')

Unnamed: 0,0,1,2,3
0,0,-10,-24,4
1,0,13,31,11
2,0,-12,-12,1


#### Pitfall: subtracting a column along the default axis

In [68]:
# Same as `df_a - df_a[0]`: by default the Series' index (0, 1, 2) is aligned with
# the frame's COLUMN labels (0, 1, 2, 3). Column 3 has no matching label, so it
# becomes NaN and the whole result is upcast to float.
df_a.sub(df_a[0], axis='columns')

Unnamed: 0,0,1,2,3
0,0.0,21.0,-8.0,
1,-31.0,13.0,16.0,
2,-16.0,3.0,-12.0,
