## Imports

Customarily you will import the libraries like this:

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Structures

We'll primarily use two data structures:
 
- Series
- DataFrame

In [None]:
# A Series is a one-dimensional labelled array; np.nan marks a missing value.
s0 = pd.Series(data=[1, 2, 3, 5, 8, np.nan])
s0

In [None]:
# A DataFrame built from a mapping of column name -> values.
# The scalar 'foo' is repeated for every row.
d0 = pd.DataFrame(dict(
    A=[1, 2, 3],
    B='foo',
    C=[9, 12, 15],
))
d0

In [None]:
# Pandas works great with date indices!
# Six consecutive month-end dates, the first being 2015-01-31.  An explicit
# MonthEnd offset is used instead of the 'M' alias: newer pandas releases
# deprecate 'M' in favour of 'ME', which older releases do not accept.
dates = pd.date_range('20150101', periods=6, freq=pd.offsets.MonthEnd())
# 6 rows x 3 columns of standard-normal samples, indexed by those dates
d1 = pd.DataFrame(np.random.randn(6, 3), index=dates, columns=['A', 'B', 'C'])
d1

## DataFrame Operations

In [None]:
# The row labels: here, the dates produced by pd.date_range
d1.index

In [None]:
# The column labels ('A', 'B', 'C')
d1.columns

In [None]:
# The underlying data as a NumPy array, without the row or column labels
d1.values

In [None]:
# The first 2 rows
d1.head(2)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
d1.describe()

In [None]:
# Transpose: rows become columns and columns become rows
d1.T

In [None]:
# Sort the rows by the values in column 'A' (ascending).
# DataFrame.sort() was deprecated and later removed from pandas;
# sort_values() is its replacement.
d1.sort_values(by='A')

In [None]:
# sort the columns by label (axis=1), in descending order: C, B, A
d1.sort_index(axis=1, ascending=False)

## Selecting Data

In [None]:
# Slices: an integer slice selects rows by position; here, the first two
d1[0:2]

In [None]:
# Selecting a column by its label; the result is a Series
d1['A']

In [None]:
# Selecting multiple columns: .loc[rows, columns], where ':' keeps every row
d1.loc[:, ['A', 'C']]

In [None]:
# Selecting a row by its index label (2015-01-31, the first date in the index)
d1.loc['20150131']

In [None]:
# Selecting a single value: row label first, then column label
d1.loc['20150131', 'A']

## Boolean Indexing

In [None]:
# creating a boolean mask: a Series that is True where column 'A' is positive
d1['A'] > 0

In [None]:
# Indexing with the mask keeps only the rows where it is True
d1[d1['A'] > 0]

In [None]:
# Multiple conditions are combined with & (and) or | (or).
# Don't forget the parens: & binds more tightly than >, so each comparison
# must be wrapped in its own parentheses.
d1[(d1['A'] > 0) & (d1['B'] > 0)]

In [None]:
# isin: first build a 5x4 frame of random integers drawn from {0, 1, 2},
# with rows labelled V..Z and columns labelled A..D
d2 = pd.DataFrame(
    data=np.random.randint(0, 3, size=(5, 4)),
    index=list('VWXYZ'),
    columns=list('ABCD'),
)
d2

In [None]:
# Keep the rows whose 'A' value is one of the listed values (0 or 1)
d2[d2['A'].isin([0, 1])]

In [None]:
# isnull: give d2 some missing (NaN) and infinite values to filter on
d2.loc['V', 'E'] = np.nan  # assigning to an unknown label creates column 'E'
d2.loc['X', 'E'] = 3
# Create 'F' as floats: np.inf is a float, and writing it into an integer
# column relies on an implicit dtype change that newer pandas deprecates.
d2['F'] = 10.0
d2.loc['Z', 'F'] = np.inf
d2

In [None]:
# Rows where column 'E' is missing (NaN)
d2[pd.isnull(d2['E'])]

In [None]:
# Rows where column 'E' is NOT missing (~ inverts the boolean mask)
d2[~pd.isnull(d2['E'])]

## Functions

In [None]:
# A 5x4 frame of random integers drawn from {0, 1, 2}, with rows labelled
# V..Z and columns labelled A..D
d3 = pd.DataFrame(
    data=np.random.randint(0, 3, size=(5, 4)),
    index=list('VWXYZ'),
    columns=list('ABCD'),
)
d3

In [None]:
# Sum of each column
d3.sum()

In [None]:
# Mean of each column
d3.mean()

In [None]:
d3.apply(np.std)  # apply a function column-wise (the default, axis=0)

In [None]:
d3.apply(np.sum, axis=1)  # apply a function row-wise (axis=1)

In [None]:
# The distinct values that occur in column 'A'
d3['A'].unique()

In [None]:
# Each column is passed to the function in turn; this adds 1000 to every value
d3.apply(lambda x: x + 1000)

In [None]:
# With axis=1 the function receives one row at a time; this adds the row's
# 'A' and 'B' values together
d3.apply(lambda row: row['A'] + row['B'], axis=1)

## Data Wrangling

In [None]:
# Rename columns by assigning a new list of labels to .columns
# (one label per existing column).
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(d3.columns)  # labels before the rename

d3.columns = list('HIJK')
print(d3.columns)  # labels after the rename

In [None]:
# d3 after the rename: its columns are now H, I, J, K
d3

### Missing Data

Most pandas operations skip missing data (NaN) by default, but you can also drop it or fill it in if that fits your use case.

In [None]:
# d2 as modified earlier: column 'E' holds NaN values and column 'F' an inf
d2

In [None]:
# Drop every row that contains at least one missing value.
# This returns a new frame; d2 itself is left unchanged.
d2.dropna()

In [None]:
# Replace every missing value with 0 (returns a new frame)
d2.fillna(value=0)

In [None]:
# remove undesirable values: replace both NaN and inf with 0
d2.replace([np.nan, np.inf], 0)

In [None]:
# join 2 datasets
d2.join(d3)  # joins on the index by default (a left join)

In [None]:
# When both frames share column names, a suffix is needed to tell them apart;
# here the right-hand frame's columns get '_copy' appended
d2.join(d2, rsuffix='_copy')

## Visualize

In [None]:
# Histogram of 100 random integers in 0..9, using 10 equal-width bins over (0, 10)
plt.hist(np.random.randint(0, 10, (100, 1)), range=(0, 10), bins=10)

In [None]:
# The same kind of data with only 5 bins, so each bin spans two integers
plt.hist(np.random.randint(0, 10, (100, 1)), range=(0, 10), bins=5)

In [None]:
# plt.scatter(x, y): here x is 0..99 and y is 100 standard-normal samples
plt.scatter(np.arange(100), np.random.randn(100))