# Pandas: Series
Series is for 1D data. For 2D, it is DataFrame.


In [1]:
import numpy as np
import pandas as pd

# pandas series object
# gives a column variable
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [2]:
# access the values in the data object, use .values
data.values

array([0.25, 0.5 , 0.75, 1.  ])

## Indices for pandas series

In [3]:
# data index
data.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
data[1] # gives the 2nd

np.float64(0.5)

In [5]:
data[1:3] # gives the 2nd to 3rd

1    0.50
2    0.75
dtype: float64

In [6]:
# with Series, can use 'explicitly' defined index names
data2 = pd.Series([0.25, 0.5, 0.75, 1.0],
                  index = ['a', 'b', 'c', 'd'])
data2

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [7]:
# access items
data2['b']

np.float64(0.5)

In [8]:
# possible even to use non-sequential indices
# but this is less intuitive so probably not going to be useful
data3 = pd.Series([0.25, 0.5, 0.75, 1.0],
                  index=[2,4,5,1])
data3[2]

np.float64(0.25)

### Series as dictionary

Dictionary is a structure that maps **keys** to a set of **values**.

In [9]:
my_dict = {'a': 100,
           'b': 200,
           'c': 300}
my_dict = pd.Series(my_dict)
my_dict

a    100
b    200
c    300
dtype: int64

In [10]:
my_dict['c']

np.int64(300)

# Pandas DataFrame

DataFrame is a generalized Numpy array, basically a sequence of aligned Series - sharing the same indices.

In [13]:
# define population
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)

# define area
area_dict = {'California': 423967,
             'Texas': 695662,
             'New York': 141297,
             'Florida': 170312,
             'Illinois': 149995}
area = pd.Series(area_dict)

# now combine these two
states = pd.DataFrame({
    'population': population,
    'area': area
})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [14]:
# check index (row names)
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [15]:
# column names
states.columns

Index(['population', 'area'], dtype='object')

In [16]:
states['area'] # select one column

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [20]:
# create DataFrame from dictionary
# if some values are missing, it'll create a union of all keys
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [21]:
# create DF from a two-dimensional array
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.548139,0.370785
b,0.776101,0.087069
c,0.319419,0.366685


# Pandas indexing and selection

## Selection in pandas series



In [22]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                  index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [23]:
data['b']

np.float64(0.5)

In [24]:
# treat it as a dictionary to examine keys/indices and values
'a' in data

True

In [25]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [26]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [27]:
# add new key-value pair
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [28]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [29]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [30]:
# masking (filter with boolean)
data[(data>0.3)&(data <0.8)]

b    0.50
c    0.75
dtype: float64

In [31]:
# fancy indexing (basically list within a list)
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

# indexers: loc, iloc

* loc attribute is for explicit indexing (user defined)
* iloc is for implicit indexing (python style, starts from 0)


In [33]:
data = pd.Series(['a', 'b', 'c'], index = [1,3,5])
# explicit index
print(data[1])
# implicit index when slicing, using the python style index
print(data[1:3])

a
3    b
5    c
dtype: object


In [35]:
# loc: forces the explict index - user defined
print(data.loc[1])
print(data.loc[1:3])

a
1    a
3    b
dtype: object


In [36]:
# iloc: implicit python style index
print(data.iloc[1]) # 2nd
print(data.iloc[1:3]) # 2,3rd

b
3    b
5    c
dtype: object


# data selection in DF


# Index alignment

In [38]:
# series
# create two series
area = pd.Series({'Alaska': 1723337, # unavailable population
                  'Texas': 695662,
                  'California': 423967},
                 name='area')
population = pd.Series({'California': 38332521,
                        'Texas': 26448193,
                        'New York': 19651127}, # unavailable area
                       name='population')
# divide to compute the population density, those with mismatches NaN
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [49]:
# in dataframe
rng = np.random.RandomState(42)
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [41]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,7,4,6
1,9,2,6
2,7,4,3


In [42]:
# this will match the rownames and colnames
A+B

Unnamed: 0,A,B,C
0,10.0,26.0,
1,16.0,19.0,
2,,,


In [47]:
# can fill in the missing values with pre-defined fills
fill = A.stack().mean() # this makes A a vector
fill

np.float64(12.25)

In [48]:
# it is A that is missing in the corresponding entries
A.add(B, fill_value=0)
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


Unnamed: 0,A,B,C
0,10.0,26.0,6.0
1,16.0,19.0,6.0
2,4.0,7.0,3.0
