In [68]:
import pandas as pd
import numpy as np

### Pandas Series Object

In [69]:
data = pd.Series([0.25, 0.5, 0.75, 1])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

* As we see in the preceding output, the Series wraps both a sequence of values and a sequence of indices, which we can access with the values and index attributes.


In [70]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [71]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [72]:
data[1]

0.5

In [73]:
data[1:3]

1    0.50
2    0.75
dtype: float64

* associated index can be accessed with RangeIndex type association
* essentially the same as indexing an array should the type be a RangeIndex
* Primary difference from Numpy is the presence of the index

In [74]:
# Index is flexible
data_1 = pd.Series([0.25,0.5,0.75,1], index=['a', 'b', 'c', 'd'])
data_1

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [75]:
data_1[:2]

a    0.25
b    0.50
dtype: float64

In [76]:
data_1['a':'c'] # note here that the closing index is included unlike a usual slice

a    0.25
b    0.50
c    0.75
dtype: float64

### Series as specialized dictionary

In [77]:
population_dict = {'California':38332521, 'Texas':26448193, 'New York':19651127, 
                   'Florida':19552860, 'Illinois':12882135}

(population_dict, type(population_dict))

({'California': 38332521,
  'Texas': 26448193,
  'New York': 19651127,
  'Florida': 19552860,
  'Illinois': 12882135},
 dict)

In [78]:
pop_series = pd.Series(population_dict)
print(type(pop_series))
pop_series

<class 'pandas.core.series.Series'>


California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [79]:
# dictionary type access
pop_series['Florida']

19552860

In [80]:
# Series supports array-style operations like slicing (unlike dict)
pop_series['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

### Series Objects

In [81]:
pd.Series([2,4,5])

0    2
1    4
2    5
dtype: int64

In [82]:
pd.Series(5, index=[100,200,300]) #scalar

100    5
200    5
300    5
dtype: int64

In [83]:
pd.Series({2:'a', 3:'z', 9:'l'})

2    a
3    z
9    l
dtype: object

In [84]:
pd.Series({2:'a', 3:'z', 9:'l'},index=[3,9]) #only explicity keys

3    z
9    l
dtype: object

### DataFrame Object

In [85]:
# Use Existing Series Objects
area_dict = {'California':423967, 'Texas':695662, 'New York':141297, 
                   'Florida':170312, 'Illinois':149995}
states = pd.DataFrame({'population': pop_series, 'area':area_dict})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [86]:
display(states.index, states.columns)

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

Index(['population', 'area'], dtype='object')

In [87]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [88]:
type(states['area'])

pandas.core.series.Series

### Construct DataFrame Objects

In [89]:
# single-column DataFrame can be constructed from a single Series
pd.DataFrame(pop_series, columns=['population']) 

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [90]:
# List of Dicts
data = [{'a':i, 'b': 2 * i} for i in range(3)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]


Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [91]:
# Fill in with NAN if empty
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [92]:
# Dictionary of Series Objects
states = pd.DataFrame({'population': pop_series, 'area':area_dict})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [93]:
# 2D Numpy Array
pd.DataFrame(np.random.rand(3,2), index=['a', 'b', 'c'], columns=['foo', 'bar'])

Unnamed: 0,foo,bar
a,0.890603,0.278919
b,0.676287,0.076154
c,0.553184,0.93444


In [94]:
# Structured Numpy Array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [95]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### Index Reminders

* Conveniently can be thought of as either as an immutable array or as an ordered set (or multi-set)
* Index objects not mutable through a normal list/array type assignment
    * ind[1] = 0 would return a like TypeError : Index does not support mutable operations
* This immutability makes it safer to share indices between multiple DataFrames and arrays, without the potential for side effects from inadvertent index modification.


#### Index as ordered set 
* Pandas objects are designed to facilitate operations such as joins across datasets, which depend on many aspects of set arithmetic. 
* The Index object follows many of the conventions used by Python’s built-in set data structure, so that unions, intersections, differences, and other combinations can be computed in a familiar way:



In [96]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [97]:
# Intersection
indA & indB

  indA & indB


Int64Index([3, 5, 7], dtype='int64')

In [98]:
indA | indB #union

  indA | indB #union


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [99]:
indA ^ indB #symmetric differences

  indA ^ indB #symmetric differences


Int64Index([1, 2, 9, 11], dtype='int64')

In [100]:
(indA.intersection(indB), indA.union(indB), indA.difference(indB), indA.symmetric_difference(indB))

(Int64Index([3, 5, 7], dtype='int64'),
 Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64'),
 Int64Index([1, 9], dtype='int64'),
 Int64Index([1, 2, 9, 11], dtype='int64'))

* use Index Methods to avoid type deprecated Warnings
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.html

### Data Indexing and Selection

#### Data Selection in Series

In [101]:
### Data Selection in Series
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [102]:
data['b']

0.5

In [103]:
'a' in data

True

In [104]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [105]:
list(data.items()) # need to unzip with a list

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [106]:
# Extend a Series by assigning to a new key
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [107]:
# slice by explicit index
data['a':'d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [108]:
# slice by implicit integer index
data[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [109]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

* Notice that when you are slicing with an explicit index (i.e., data['a':'c']), the final index is included in the slice, while when you’re slicing with an implicit index (i.e., data[0:2]), the final index is excluded from the slice.



### Indexers: loc, iloc, & ix

In [111]:
# First, the loc attribute allows indexing and slicing that always references the explicit index:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [112]:
data[1]

'a'

In [113]:
# implicit when slicing
data[1:3]

3    b
5    c
dtype: object

#### Loc attribute
* attribute allows indexing and slicing that always references the explicit index
    * Think actual index value (not implicit list type index value)

In [114]:
data.index

Int64Index([1, 3, 5], dtype='int64')

In [115]:
data.loc[1]

'a'

In [116]:
data.loc[1:3] # loc is inclusive here

1    a
3    b
dtype: object

#### Iloc attribute
* Allows indexing and slicing that always references the implicit Python-style index


In [117]:
data

1    a
3    b
5    c
dtype: object

In [118]:
data.iloc[0:2] # similar to a list the last index value not returned in an iloc slice

1    a
3    b
dtype: object

In [119]:
data.iloc[:1]

1    a
dtype: object

In [120]:
data.iloc[0]

'a'

#### ix - hybrid
* for Series objects is equivalent to standard []- based indexing

### Guiding Principle
* Explicit is better than implicit 

### Data Selection in DataFrame

In [121]:
data_pop = pd.DataFrame({'population': pop_series, 'area':area_dict})
data_pop

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [125]:
# Pull individual series
(data_pop['area'],type(data_pop['area']))

(California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64,
 pandas.core.series.Series)

In [126]:
# Dot index available generally to
data_pop.area, type(data_pop.area)

(California    423967
 Texas         695662
 New York      141297
 Florida       170312
 Illinois      149995
 Name: area, dtype: int64,
 pandas.core.series.Series)

#### Quick Reminder
* If column names are not strings or column names conflict with methods of the DataFrame, this attribute-style dot access is not possible
    * For example, the DataFrame has a pop() method, so data.pop will point to this rather than the "pop" column:
* In particular, you should avoid the temptation to try column assignment via attribute (i.e., use data['pop'] = z rather than data.pop = z).


In [128]:
# Add Column
data_pop['density'] = data_pop['population'] / data_pop['area']
data_pop

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [130]:
# Can also view the DataFrame as an enhanced two-dimensional array
(data_pop.values, data_pop.values.shape) # 5 rows by 3 columns

(array([[3.83325210e+07, 4.23967000e+05, 9.04139261e+01],
        [2.64481930e+07, 6.95662000e+05, 3.80187404e+01],
        [1.96511270e+07, 1.41297000e+05, 1.39076746e+02],
        [1.95528600e+07, 1.70312000e+05, 1.14806121e+02],
        [1.28821350e+07, 1.49995000e+05, 8.58837628e+01]]),
 (5, 3))

In [133]:
# Transpose the DF to swap rows and columns
data_pop_t = data_pop.T
display(data_pop_t, data_pop_t.index, data_pop_t.columns)

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


Index(['population', 'area', 'density'], dtype='object')

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [134]:
# Implicity iloc usage can index the underlying array as if it is a simple NumPy array
data_pop.iloc[:2, :2] # first two rows and columns

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662


In [136]:
# Explicit for same declaration
data_pop.loc['California':'Texas', 'population':'area']

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662


#### ix indexer allows a hybrid of both approaches
* ... however, it's now deprecated!

In [137]:
data_pop.ix[:2, 'population':'area']

AttributeError: 'DataFrame' object has no attribute 'ix'

In [139]:
# Combine masking and fancy indexing with .loc (conditional return)
data_pop.loc[data_pop['area'] > 300000, ['population', 'density']] # condition, then columns

Unnamed: 0,population,density
California,38332521,90.413926
Texas,26448193,38.01874


### Additional Indexing Conventions

In [142]:
data_pop # quick look again at our simple dataframe

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [140]:
# While Indexing refers to columns, slicing refers to rows
data_pop['Florida':'Illinois']

Unnamed: 0,population,area,density
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [141]:
# Refer to rows by number rather than index
data_pop[1:3]

Unnamed: 0,population,area,density
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
