Data
====

Resources
----

- [Pandas documentation](http://pandas.pydata.org/pandas-docs/stable/)
- [QGrid](https://github.com/quantopian/qgrid)

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.style.use('ggplot')

Working with Series
----

In [None]:
x = Series(range(5,10))

In [None]:
x

### We can treat Series objects much like numpy vectors

In [None]:
x.sum(), x.mean(), x.std()

In [None]:
x**2

In [None]:
x[x >= 8]

### Series can also contain more information than numpy vectors

#### You can always use standard positional indexing

In [None]:
x[1:4]

#### Series index

But you can also assign labeled indexes.

In [None]:
x.index = list('abcde')
x

#### Note that with labels, the end index is included

In [None]:
x['a':'c']

#### Even when you have a labeled index, positional slicing still works

In [None]:
x[1:4]

In [None]:
x.a, x.c, x.e

#### Working with missing data

Missing data is indicated with NaN (not a number).

In [None]:
y = Series([10, np.nan, np.nan, 13, 14])
y

#### Concatenating two series

In [None]:
z = pd.concat([x, y])
z

#### Reset index to default

In [None]:
z = z.reset_index(drop=True)
z

#### `pandas` aggregate functions ignore missing data

In [None]:
z.sum(), z.mean(), z.std()

#### Selecting missing values

In [None]:
z[z.isnull()]

#### Selecting non-missing values

In [None]:
z[z.notnull()]

#### Replacement of missing values

In [None]:
z.fillna(0)

In [None]:
# Forward fill: carry the last valid observation forward into each gap.
# `fillna(method=...)` is deprecated in recent pandas; `ffill()` is the
# supported spelling and returns the same result.
z.ffill()

In [None]:
# Backward fill: pull the next valid observation back into each gap.
# `fillna(method=...)` is deprecated in recent pandas; `bfill()` is the
# supported spelling and returns the same result.
z.bfill()

In [None]:
z.fillna(z.mean())

#### Working with dates / times

We will see more date/time handling in the DataFrame section.

In [None]:
z.index = pd.date_range('01-Jan-2016', periods=len(z))

In [None]:
z

#### Intelligent aggregation over datetime ranges

In [None]:
z.resample('W').sum()

#### Formatting datetime objects (see http://strftime.org)

In [None]:
z.index.strftime('%b %d, %Y')

DataFrame
----

A `DataFrame` is a labeled two-dimensional table, similar to a data frame in R.

### Titanic data

In [None]:
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv'
titanic = pd.read_csv(url)

In [None]:
titanic.shape

In [None]:
titanic.columns

In [None]:
# For display purposes, we will drop some columns
titanic = titanic[['survived', 'sex', 'age', 'fare',
                   'embarked', 'class', 'who', 'deck', 'embark_town',]]

In [None]:
titanic.dtypes

### Summarizing a data frame

In [None]:
# Show the first record as a Series (one entry per column).
# The `.ix` accessor was removed from pandas; `iloc` selects by position.
titanic.iloc[0]

In [None]:
titanic.describe()

In [None]:
titanic.head()

In [None]:
titanic.tail()

In [None]:
titanic.columns

In [None]:
titanic.index

### Indexing

In [None]:
titanic[['sex', 'age', 'class']].head()

In [None]:
titanic[10:15]

#### Using the `ix` helper for indexing (removed in pandas 1.0 — use `loc` / `iloc` instead)

In [None]:
# Label-based selection: rows labelled 10 through 15 and columns 'age' through
# 'fare'. With `loc` both slice ends are included, as they were with the
# removed `.ix` accessor.
titanic.loc[10:15, 'age':'fare']

In [None]:
# Position-based selection of the 2nd, 4th and 6th columns.
# `iloc` is purely positional and end-exclusive, so 10:16 keeps the same six
# rows (10-15) that the removed, label-inclusive `.ix[10:15]` returned.
titanic.iloc[10:16, [1, 3, 5]]

In [None]:
titanic[titanic.age < 2]

#### Sorting and ordering data

In [None]:
titanic.sort_index().head()

In [None]:
titanic.sort_values('age', ascending=True).head()

In [None]:
titanic.sort_values(['survived', 'age'], ascending=[True, True]).head()

#### Grouping data

In [None]:
sex_class = titanic.groupby(['sex', 'class'])

In [None]:
sex_class.count()

#### Why Kate Winslet survived and Leonardo DiCaprio didn't

In [None]:
# Average every numeric column within each (sex, class) group.
# `numeric_only=True` is required on recent pandas, which raises on string
# columns (embarked, who, deck, ...) instead of silently dropping them.
df = sex_class.mean(numeric_only=True)
# For the 0/1 `survived` column the group mean is the survival rate.
df['survived']

#### Of the females who were in first class, count the number from each embarking town

In [None]:
sex_class.get_group(('female', 'First')).groupby('embark_town').count()

#### Cross-tabulation

In [None]:
pd.crosstab(titanic.survived, titanic['class'])

#### We can also get multiple summaries at the same time

In [None]:
def my_func(x):
    """Return the largest value in ``x`` (a thin wrapper around ``np.max``)."""
    largest = np.max(x)
    return largest

In [None]:
# Map each column to the summary (or list of summaries) to compute for it.
# String names select pandas' own group-wise implementations; passing the
# builtin `sum` instead triggers a FutureWarning on recent pandas.
# A list produces one output column per function.
mapped_funcs = {
    'embarked': 'count',
    'age': ['mean', 'median', my_func],
    'survived': 'sum',
}
sex_class.get_group(('female', 'First')).groupby('embark_town').agg(mapped_funcs)

In [None]:
titanic.columns

#### Visualizing tables

See more examples in the Graphics notebook.

In [None]:
import seaborn as sns
# NOTE(review): no context name is passed here, so `font_scale` may have no
# effect — confirm whether e.g. set_context('notebook', font_scale=...) was meant.
sns.set_context(font_scale=4)
# Age distribution by sex, coloured by survival, one panel per passenger class.
# `factorplot` was renamed to `catplot` and later removed from seaborn.
sns.catplot(x='sex', y='age', hue='survived', col='class', kind='box', data=titanic)
# Trailing statement so the cell does not display the plot call's return value.
pass

### Making plots with `pandas`

In [None]:
from pandas_datareader import data as web
import datetime

In [None]:
# Fetch AAPL data for 1 Jan 2015 through 31 Dec 2015 via pandas-datareader.
# NOTE(review): the 'google' data source appears to have been dropped from
# pandas-datareader — confirm, and switch to a supported source if this fails.
apple = web.DataReader('AAPL', 'google', 
                        start = datetime.datetime(2015, 1, 1),
                        end = datetime.datetime(2015, 12, 31))

In [None]:
apple.head()

In [None]:
apple.plot.line(y='Close', marker='o', markersize=3, linewidth=0.5)
pass

In [None]:
# Zoom in on large drop in August
aug = apple['2015-08-01':'2015-08-30']
aug.plot.line(y=['High', 'Low', 'Open', 'Close'], marker='o', markersize=10, linewidth=1)
pass

Data conversions
----

One of the nicest features of `pandas` is the ease of converting tabular data across different storage formats. We will illustrate by converting the `titanic` dataframe into multiple formats.

In [None]:
titanic.head(2)

#### CSV

In [None]:
titanic.to_csv('titanic.csv', index=False)

In [None]:
t1 = pd.read_csv('titanic.csv')
t1.head(2)

#### Excel

In [None]:
!pip install openpyxl
# Write without the row index, matching the CSV export; otherwise reading the
# workbook back yields an extra 'Unnamed: 0' column.
t1.to_excel('titanic.xlsx', index=False)

In [None]:
t2 = pd.read_excel('titanic.xlsx')
t2.head(2)

#### Relational Database

In [None]:
import sqlite3

con = sqlite3.connect('titanic.db')
t2.to_sql('titanic', con, index=False, if_exists='replace')

In [None]:
t3 = pd.read_sql('select * from titanic', con)
t3.head(2)

#### JSON

In [None]:
t3.to_json('titanic.json')

In [None]:
t4 = pd.read_json('titanic.json')
t4.head(2)

In [None]:
t4 = t4[t3.columns]
t4.head(2)