# Introduction to Pandas

Tutorial from http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/

pandas introduces two new data structures to Python, Series and DataFrame, both of which are built on top of NumPy.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the fully qualified option name. The short pattern 'max_columns' is
# matched as a regex and, in newer pandas releases, also matches
# 'styler.render.max_columns', which makes set_option raise OptionError.
pd.set_option('display.max_columns', 50)
%matplotlib inline

## Series

In [3]:
# A Series can hold values of mixed types. With no index supplied, pandas
# labels the entries with default integer positions starting at 0.
mixed_values = [109, 'Redkar', 3.142, -1789710578, 'Happy Coding!']
s = pd.Series(data=mixed_values)
s

0              109
1           Redkar
2            3.142
3      -1789710578
4    Happy Coding!
dtype: object

### Specify an index to use when creating the Series

In [5]:
# Explicit labels are passed through `index`; they pair with the values
# position by position.
labels = ['A', 'Z', 'C', 'Y', 'E']
s = pd.Series(data=[109, 'Redkar', 3.142, -1789710578, 'Happy Coding!'], index=labels)
s

A              109
Z           Redkar
C            3.142
Y      -1789710578
E    Happy Coding!
dtype: object

### Create a Series from a dictionary

In [6]:
# The dictionary keys become the index labels. The None given for Boston is
# stored as NaN (the displayed dtype is float64).
d = {
    'Chicago': 1000,
    'New York': 1300,
    'Portland': 900,
    'San Francisco': 1100,
    'Austin': 450,
    'Boston': None,
}
cities = pd.Series(data=d)
cities

Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
Austin            450.0
Boston              NaN
dtype: float64

In [7]:
# select a single value by its index label
cities['Chicago']

1000.0

In [11]:
# pass a list of labels to select several entries at once; the result is
# itself a Series, in the order the labels were requested
cities[['Chicago','Portland','Boston']]

Chicago     1000.0
Portland     900.0
Boston         NaN
dtype: float64

### Boolean indexing

In [12]:
# keep only the entries whose value is below 1000; the NaN entry (Boston)
# compares as False and is therefore left out
cities[cities < 1000]

Portland    900.0
Austin      450.0
dtype: float64

In [13]:
# Name the boolean mask so it can be inspected on its own and then reused
# as a filter on the Series.
less_than_1000 = cities < 1000
cities_below_1000 = cities[less_than_1000]

print(less_than_1000)
print('\n')  # blank lines separating the two printouts
print(cities_below_1000)

Chicago          False
New York         False
Portland          True
San Francisco    False
Austin            True
Boston           False
dtype: bool


Portland    900.0
Austin      450.0
dtype: float64


### Change the values in a Series

In [14]:
# Assigning through an index label overwrites that single entry in place.
old_value = cities['Chicago']
print('Old value:', old_value)

cities['Chicago'] = 1400
print('New value:', cities['Chicago'])

Old value: 1000.0
New value: 1400.0


In [16]:
# Assigning through a boolean mask overwrites every matching entry at once.
is_below_1000 = cities < 1000
print(cities[is_below_1000])
print('\n')
cities[is_below_1000] = 750

# The comparison is evaluated again so the printout reflects the new values.
print(cities[cities < 1000])

Portland    900.0
Austin      450.0
dtype: float64


Portland    750.0
Austin      750.0
dtype: float64


### Check if item exists

In [17]:
# `in` checks whether a label is present in the Series index.
for city in ('Seattle', 'San Francisco'):
    print(city in cities)

False
True


### Mathematical operations can be done using scalars and functions.

In [18]:
# divide every city value by the scalar 3; the operation is applied
# element-wise and the missing value (Boston) stays NaN
cities / 3

Chicago          466.666667
New York         433.333333
Portland         250.000000
San Francisco    366.666667
Austin           250.000000
Boston                  NaN
dtype: float64

In [19]:
# square every city value with a NumPy function; the result is still a
# Series carrying the same index labels
np.square(cities)

Chicago          1960000.0
New York         1690000.0
Portland          562500.0
San Francisco    1210000.0
Austin            562500.0
Boston                 NaN
dtype: float64

### Add two Series together

In [20]:
# Addition aligns the two Series on their index labels. Labels that appear in
# only one operand (Austin, Chicago and Portland) come back as NULL/NaN.
first_selection = cities[['Chicago', 'New York', 'Portland']]
second_selection = cities[['Austin', 'New York']]

print(first_selection)
print('\n')
print(second_selection)
print('\n')
print(first_selection + second_selection)

Chicago     1400.0
New York    1300.0
Portland     750.0
dtype: float64


Austin       750.0
New York    1300.0
dtype: float64


Austin         NaN
Chicago        NaN
New York    2600.0
Portland       NaN
dtype: float64


### Null checking

In [21]:
# returns a boolean Series that is True wherever the value is not NULL/NaN
# (only Boston, which has no value, is False)
cities.notnull()

Chicago           True
New York          True
Portland          True
San Francisco     True
Austin            True
Boston           False
dtype: bool