# Data

## Libraries for Data Processing

In [None]:
%matplotlib inline

String processing and utility libraries

In [None]:
import os
import glob
import string
import re
import imageio

Libraries for functional programming

In [None]:
import operator as op
import itertools as it
from functools import reduce, partial
import toolz as tz
import toolz.curried as c

Libraries for numerical programming

In [None]:
import numpy as np
import pandas as pd
from scipy import (stats, sparse, linalg, 
                   spatial, integrate, optimize, io)

Libraries for plotting and visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## Working with unstructured text data

### Categorical and one-hot encoding

See examples from S05_Text notebook.

### Term frequency - inverse document frequency (tf-idf)

> In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf-idf value increases proportionally to the number of times a word appears in the document, but is often offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general. Nowadays, tf-idf is one of the most popular term-weighting schemes; 83% of text-based recommender systems in the domain of digital libraries use tf-idf.

Source: [Wikipedia](https://en.wikipedia.org/wiki/Tf–idf)

#### Documents 

In [None]:
# Collect every .txt file under data/Gutenberg and derive a short document
# name from each file name (directory and extension stripped).
paths = glob.glob(os.path.join('data', 'Gutenberg', '*.txt') )
names = [os.path.splitext(os.path.split(path)[-1])[0] for path in paths]
names

In [None]:
# Number of documents in the corpus; used by the idf computation.
N = len(names)

#### Simple processing to find words in each document

Standard Python idiom.

In [None]:
# One token list per document: lower-case the text, delete punctuation,
# then split on whitespace.
strip_punct = str.maketrans('', '', string.punctuation)
doc_terms = []
for path in paths:
    with open(path, encoding='latin-1') as fh:
        raw = fh.read()
    doc_terms.append(raw.lower().translate(strip_punct).split())

Using a functional idiom

In [None]:
def _read_text(path, encoding='latin-1'):
    """Return the full contents of the file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the text has been read, instead of being left open by a bare `open`.
    """
    with open(path, encoding=encoding) as f:
        return f.read()


# Build the punctuation-deleting table once rather than once per document.
_strip_punctuation = str.maketrans('', '', string.punctuation)

# Same steps as the loop version, expressed as a pipeline of curried maps:
# read -> lower-case -> drop punctuation -> split into words.
doc_terms = tz.pipe(
    paths,
    c.map(_read_text),
    c.map(str.lower),
    c.map(lambda x: x.translate(_strip_punctuation)),
    c.map(str.split),
    list
)

#### Term frequency

This is just a word count for each document.

In [None]:
# Map each document name to a {term: count} dictionary for that document.
tf = dict(zip(names, map(tz.frequencies, doc_terms)))

#### Document frequency

This is how many documents a term appears in.

In [None]:
# Document frequency: every document contributes each of its distinct terms
# once, so a term's count is the number of documents that contain it.
df = tz.frequencies(tz.concat(d.keys() for d in tf.values()))

#### Inverse document frequency 

This weights each term by the inverse of how frequently it appears across the different documents, and so reduces the importance of common words like "the".

In [None]:
# Smoothed idf: 1 is added to both the number of documents and the document
# count. A term that occurs in all N documents gets log(1) = 0.
idf = {term: np.log((1 + N)/(1 + count)) for term, count in df.items()}

#### Term frequency - inverse document frequency

This is just the product of tf and idf, and a measure of the importance of each term in a document.

In [None]:
# Vocabulary: every distinct term across all documents (duplicates removed).
terms = list(tz.unique(tz.concat(doc_terms)))

In [None]:
# For every document, weight each vocabulary term's count (0 when the term
# is absent from that document) by the term's idf.
tf_idf = {
    name: {term: counts.get(term, 0) * idf[term] for term in terms}
    for name, counts in tf.items()
}

#### Convert to a structured data type

In [None]:
# Dict of dicts -> DataFrame: one column per document, one row per term.
# Note that this rebinds tf_idf from the nested dict to the frame.
tf_idf = pd.DataFrame(tf_idf)

In [None]:
# Peek at the top-left 5x5 corner of the term-by-document table.
tf_idf.iloc[:5, :5]

#### Distinctive words in each document

These are words that appear frequently in that document but not in others.

In [None]:
# For each document (column), take the five terms with the highest tf-idf.
top_words = {}
for doc, scores in tf_idf.items():
    ranked = scores.sort_values(ascending=False)
    top_words[doc] = ranked.index[:5]

# Transpose so that each document becomes a row.
pd.DataFrame(top_words).T

#### Why is "macb" the top hit for Macbeth? 

In [None]:
# Raw count of the token 'macb' in the Macbeth document.
tf['shakespeare-macbeth']['macb']

In [None]:
# Build the path with the same 'Gutenberg' directory name used for the glob
# at the top of this section; the lower-case spelling fails on
# case-sensitive filesystems. The `with` block closes the file after reading.
macbeth_path = os.path.join('data', 'Gutenberg', 'shakespeare-macbeth.txt')
with open(macbeth_path, encoding='latin-1') as f:
    text = f.read()

# First ten line tails starting at "macb" followed by any character other
# than "e" (case-insensitive).
re.findall('macb[^e].*\n', text, re.IGNORECASE)[:10]

### Image data

In [None]:
# glob returns matches in arbitrary, platform-dependent order, so sort before
# slicing to make "the first three images" the same on every run.
# Note: this rebinds `paths`, which previously held the text-file paths.
paths = sorted(glob.glob(os.path.join('data', 'POKEMON', '*.png')))[:3]

# Read each image and stack the results into a single array.
imgs = np.array([imageio.imread(path) for path in paths])

In [None]:
# Dimensions of the stacked image array.
# NOTE(review): a regular N-d shape presumably requires all images to have the same size — confirm.
imgs.shape

In [None]:
# One row of three panels; show each image without axis ticks.
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for panel, picture in zip(axes, imgs):
    panel.imshow(picture)
    panel.set(xticks=[], yticks=[])

## Using `pandas` for data munging

In [None]:
# Raw CSV of the "tips" example dataset in the seaborn-data GitHub repository.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"

In [None]:
# Download and parse the CSV into a DataFrame (needs network access).
tips = pd.read_csv(url)

### Inspecting a data frame

In [None]:
# (number of rows, number of columns)
tips.shape

In [None]:
# Data type of each column.
tips.dtypes

In [None]:
# Column labels.
tips.columns

In [None]:
# Row labels.
tips.index

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
tips.describe()

In [None]:
# First three rows.
tips.head(3)

In [None]:
# Last three rows.
tips.tail(3)

In [None]:
# Three randomly chosen rows; no random_state is given, so the rows differ between runs.
tips.sample(3)

### Series

A column is a pandas Series object. It behaves like an indexed vector.

In [None]:
# Selecting a single column by name with [] returns a Series.
tips['sex'].head()

An alternative way to get a Series is attribute access, which works when the column name is a valid Python identifier (i.e. it contains no spaces or punctuation).

In [None]:
# Attribute access gives the same Series as tips['sex'].
tips.sex.head()

### Series types

#### String operations

In [None]:
# Vectorised string methods via .str: lower-case, then take the first character.
tips.sex.str.lower().str[0].head()

#### Categorical data types

In [None]:
# Distinct values in the day column.
tips.day.unique()

In [None]:
# Convert the day column to the categorical dtype.
tips['day'] = tips.day.astype('category')

In [None]:
# The Series footer now lists the categories.
tips.day.head(3)

In [None]:
# Put the categories in weekday order and mark them as ordered.
# reorder_categories returns a new Series, so the result is assigned back to
# the column; the old `inplace=True` argument has been removed from pandas.
tips['day'] = tips.day.cat.reorder_categories(['Thur', 'Fri', 'Sat', 'Sun'], ordered=True)

In [None]:
# Display the column again to inspect the category order.
tips.day.head(3)

#### Datetime operations

In [None]:
# NOTE(review): importing here instead of in the import cells at the top of
# the notebook hides this dependency from the reader.
import pandas_datareader as pdr

# NOTE(review): data_source is assigned but not used by the call below,
# which goes through the Morningstar helper rather than 'google'.
data_source = 'google'
start_date = '2010-01-01'
end_date = '2016-12-31'

# Fetch MSFT data for the date range above (needs network access).
# NOTE(review): the Morningstar reader appears to have been dropped from later
# pandas-datareader releases — confirm against the installed version.
data = pdr.get_data_morningstar('MSFT', start_date, end_date)

In [None]:
# First three rows of the downloaded data.
data.head(3)

In [None]:
# Values of the second level of the row MultiIndex.
# Presumably the dates, with the ticker symbol on level 0 (see data.loc['MSFT']).
dates = data.index.get_level_values(1)

In [None]:
# First three entries.
dates[:3]

In [None]:
# First six (year, month, day) triples built from the datetime components;
# islice stops consuming the zip after six items.
list(it.islice(zip(dates.year, 
                    dates.month,
                    dates.day), 6))

In [None]:
# Select the rows under the 'MSFT' key of the outer index level.
msft = data.loc['MSFT']

In [None]:
# Line plot of the High column against the index.
msft.plot(y='High')
# A trailing statement (rather than an expression) keeps the cell from echoing the plot object's repr.
pass

In [None]:
# Rows for December 2016 (label slice on the date index), columns Low and High.
# The columns are given as a list: in .loc a tuple is the form used for
# MultiIndex keys, so a list is the unambiguous way to pick several columns.
december = msft.loc['2016-12-01':'2016-12-31', ['Low', 'High']]
december.plot(linestyle='dashed', marker='+')
# Trailing statement keeps the cell from echoing the plot object's repr.
pass

### Indexing

In [None]:
# Plain [] with a slice selects rows by position: rows 0 and 1.
tips[0:2]

In [None]:
# .loc slices by label and includes the end point: labels 0, 1 and 2.
tips.loc[0:2]

In [None]:
# .iloc slices by position and excludes the end point: rows 0 and 1.
tips.iloc[0:2]

In [None]:
# Rows at positions 0-2 and the columns at positions 2, 3 and 4.
tips.iloc[0:3, [2,3,4]]

In [None]:
# Rows labelled 0-2, columns picked by name.
tips.loc[0:2, ['tip', 'sex', 'size']]

In [None]:
# Label slices work on columns too: every column from 'tip' through 'day', inclusive.
tips.loc[0:2, 'tip':'day']

#### Boolean indexing

In [None]:
# Keep only the rows where the boolean mask is True.
tips[tips.sex == 'Male'].head(3)

In [None]:
# Combine masks element-wise with &; each comparison needs its own
# parentheses because & binds more tightly than == and !=.
tips[(tips.sex == 'Male') & (tips.time != 'Dinner')].head(3)

### Special selection

In [None]:
# Small frame with missing values and a duplicated row (rows 0 and 4 are equal).
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# document-frequency counts; re-running the idf cell after this one will fail.
df = pd.DataFrame({
    'a': [1,2,None,4,1], 
    'b': [3,3,None,None,3],
    'c': [1,2,None,4,1]})

In [None]:
# Display the frame; the None entries show up as NaN.
df

In [None]:
# Drop every row that has at least one missing value.
df.dropna()

In [None]:
# Drop only the rows in which every value is missing.
df.dropna(how='all')

In [None]:
# Remove repeated rows, keeping the first occurrence.
df.drop_duplicates()

In [None]:
# Remove repeated rows, keeping the last occurrence.
df.drop_duplicates(keep='last')

### Selecting by label

In [None]:
# Keep only the columns whose names start with 's'.
tips.filter(regex='^s.*').head(3)

### Sorting

In [None]:
# Sort by size ascending, breaking ties by tip descending.
tips.sort_values(['size', 'tip'], ascending=[True, False]).head(3)

Note that ordered categorical values are sorted appropriately.

In [None]:
# Sort by the day column.
tips.sort_values(['day']).head(3)

### Rearrange columns

In [None]:
# Reorder the columns alphabetically by name (rebinds tips).
tips = tips[tips.columns.sort_values()]
tips.head(3)

In [None]:
# split() turns the whitespace-separated string into a list of column names;
# filter(items) keeps those columns in the listed order.
tips = tips.filter('total_bill	tip	sex	smoker	day	time	size'.split())
tips.head(3)

### Transforms

In [None]:
# Start with 'Weekday' for every row, then overwrite the Saturday/Sunday rows.
# Note: this adds a column to tips in place.
tips['day_type'] = 'Weekday'
tips.loc[tips.day.isin(['Sat', 'Sun']), 'day_type'] = 'Weekend'
tips.head(3)

In [None]:
# New column: bill plus tip (added to tips in place).
tips['cost'] = tips['total_bill'] + tips['tip']
tips.head(3)

In [None]:
# assign returns a new frame with the extra column; tips itself is unchanged.
tips.assign(log1p_cost = np.log1p(tips.cost)).head(3)

In [None]:
# Recode values column by column using {column: {old: new}}.
# The inner keys must match the data exactly: the previous spelling 'Femals'
# matched nothing, so 'Female' was silently left unrecoded.
tips.replace({
    'sex': dict(Female='F', Male='M'),
    'day': dict(Thur=4, Fri=5, Sat=6, Sun=7)}).head(3)

In [None]:
# Rename a column (axis=1 targets the column labels); returns a new frame.
tips.rename({'sex': 'gender'}, axis=1).head(3)

#### Transforming missing values

In [None]:
# The small frame with missing values, shown again for reference.
df

In [None]:
# Replace every missing value with 0.
df.fillna(0)

In [None]:
# Replace missing values with the mean of their own column.
df.fillna(df.mean())

In [None]:
# Forward fill: carry the last valid value in each column down into the gaps.
# ffill() is the dedicated method; fillna(method='ffill') is deprecated.
df.ffill()

In [None]:
# Backward fill: pull the next valid value in each column up into the gaps.
# bfill() is the dedicated method; fillna(method='bfill') is deprecated.
df.bfill()

### Summaries

In [None]:
# Column means. tips also holds string/categorical columns, for which a mean
# is undefined; numeric_only=True restricts the reduction to numeric columns
# instead of raising a TypeError.
tips.mean(numeric_only=True)

In [None]:
# Column standard deviations, restricted to the numeric columns
# (the string/categorical columns would otherwise raise a TypeError).
tips.std(numeric_only=True)

In [None]:
# Number of non-missing values in each column.
tips.count()

### Grouping

In [None]:
# Mean of every numeric column for each (sex, day, time) combination.
# numeric_only=True skips the remaining string columns, which cannot be averaged.
tips.groupby(['sex', 'day', 'time']).mean(numeric_only=True)

In [None]:
# Several aggregates at once, per sex. Only the numeric columns are selected:
# 'mean' and 'std' are undefined for string/categorical columns and would
# raise a TypeError.
numeric_cols = tips.select_dtypes(include='number').columns.tolist()
tips.groupby(['sex'])[numeric_cols].agg(['mean', 'std', 'min', 'max'])

### Working with hierarchical indexes

In [None]:
# Grouped means indexed by a three-level (sex, day, time) MultiIndex.
# numeric_only=True skips the string columns, which cannot be averaged.
# Note: this rebinds `df` to the grouped result.
df = tips.groupby(['sex', 'day', 'time']).mean(numeric_only=True)
df.head()

In [None]:
# The row index is a MultiIndex with levels sex, day and time.
df.index

In [None]:
# The columns that were aggregated.
df.columns

In [None]:
# Select by the outermost level. ('Female') is just the string 'Female':
# parentheses without a comma do not create a tuple.
df.loc[('Female')]

In [None]:
# A tuple addresses successive index levels: sex, then day.
df.loc[('Female', 'Sat')]

In [None]:
# slice(None) selects every value of the middle level (day).
df.loc[('Female', slice(None), 'Dinner')]

### Stacking and unstacking

In [None]:
# The ungrouped data, for comparison.
tips.head(3)

In [None]:
# unstack() moves the innermost index level (time) into the columns;
# ['tip'] then selects the tip block.
df.unstack()['tip']

In [None]:
# level=0 moves the outermost level (sex) into the columns instead.
df.unstack(level=0)['tip']

In [None]:
# Move levels 1 and 2 (day and time) into the columns.
df.unstack(level=[1,2])['tip']

In [None]:
# stack(level=0) moves the outer column level (day) back into the row index.
df.unstack(level=[1,2])['tip'].stack(level=0)

#### Swapping levels

In [None]:
# Exchange index levels 1 and 2 (day and time).
df.swaplevel(1,2)

### Resetting index

If you'd rather not deal with hierarchical indexes, use `reset_index`.

In [None]:
# Turn the index levels back into ordinary columns.
df.reset_index()

### Reshaping

In [None]:
# Flat copy of the grouped means, with sex, day and time as ordinary columns.
df1 = df.reset_index()

In [None]:
# First three rows of the flattened frame.
df1.head(3)

#### Transpose

In [None]:
# Transpose: rows become columns and columns become rows.
df1.T

#### Melt (gather)

In [None]:
# Wide to long: the id_vars stay as columns and every remaining column is
# folded into (variable, value) pairs.
pd.melt(df1, id_vars=['sex', 'day', 'time', 'size']).head(10)

### Pivot table

A pivot table is like a `groupby` operation applied to both the index (rows) and the columns.

In [None]:
# Mean total_bill and tip for each (sex, day) row, with one column per time.
df1.pivot_table(
    values=['total_bill', 'tip'],
    index=['sex', 'day'],
    columns='time',
    aggfunc='mean',
)

### Joining data frames

In [None]:
# Rows 0-4.
tips1 = tips[0:5]
tips1

In [None]:
# Rows 3-7; rows 3 and 4 are also in tips1.
tips2 = tips[3:8]
tips2

#### Simple concatenation

In [None]:
# Stack the frames vertically; the shared rows 3 and 4 appear twice and the
# original index labels are kept.
pd.concat([tips1, tips2])

### Joining columns

By default, `pd.merge` joins on all columns the two data frames have in common and performs an inner join. It is very flexible - see `help(pd.merge)`.

In [None]:
# Default merge: inner join on all shared columns, so only rows present in both frames remain.
pd.merge(tips1, tips2)

In [None]:
# Left join: keep every row of tips1.
pd.merge(tips1, tips2, how='left')

In [None]:
# Right join: keep every row of tips2.
pd.merge(tips1, tips2, how='right')

In [None]:
# Outer join: keep the rows of both frames, with matching rows listed once.
pd.merge(tips1, tips2, how='outer')

## Visualizing data

In [None]:
# Reminder of the columns available for plotting.
tips.head()

In [None]:
sns.set_context('notebook', font_scale=1.3)
# Swarm plot of tip by sex, faceted by time (columns) and smoker (rows).
# catplot is the current name of the figure-level categorical plotter; the
# older factorplot name is no longer available in recent seaborn releases.
g = sns.catplot(x='sex', y='tip',
                col='time', row='smoker',
                data=tips, kind='swarm')
pass

In [None]:
sns.set_context('notebook', font_scale=1.3)
# Violin plot of tip by day, coloured by sex and faceted by time (columns) and
# smoker (rows). catplot is the current name of the figure-level categorical
# plotter; the older factorplot name is no longer available in recent seaborn
# releases.
g = sns.catplot(x='day', y='tip', hue='sex',
                col='time', row='smoker',
                data=tips, kind='violin',
                margin_titles=True)
pass

In [None]:
# Scatter plot with a fitted regression line per sex, faceted by time
# (columns) and smoker (rows).
sns.lmplot(x='total_bill', y='tip', hue='sex', 
           col='time', row='smoker', 
           data=tips, palette='dark')
# Trailing statement keeps the cell from echoing the grid object's repr.
pass

### Pipelines

As nearly all `pandas` functions and methods consume and return a `DataFrame`, they can be chained together to construct a pipeline.

In [None]:
# Starting point for the pipeline.
tips.head()

In [None]:
# Show floats with two decimal places. The fully qualified option name is
# required: the bare key 'precision' is not accepted by current pandas.
pd.set_option('display.precision', 2)

In [None]:
# Method-chaining pipeline: each step takes a DataFrame and returns a new one.
(
    tips
    # Drop rows with any missing value.
    .dropna()
    # Keep columns whose name contains 't' or 's'. The character class is
    # [ts]; the earlier [t|s] also matched a literal '|', and the surrounding
    # '.*' was redundant because filter() searches anywhere in the name.
    .filter(regex=r"[ts]")
    # Keep the rows for female customers.
    .loc[lambda frame: frame.sex == 'Female']
    # Average bill and tip for each (smoker, time) group.
    .groupby(['smoker', 'time'])
    .agg({'total_bill': 'mean',
          'tip': 'mean'})
    # Tip as a percentage of the bill, computed from the group means.
    .assign(percent=lambda frame: 100 * frame.tip / frame.total_bill)
)