In [None]:
import os

import pandas as pd
from pandas import Series, DataFrame

import numpy as np
import seaborn as sns

## Revision - Working with Strings

In [None]:
# NOTE(review): `s` was never defined in this notebook, so every cell in this
# section failed on a fresh kernel. The value below is chosen to be consistent
# with the later cells (s[15:17] is 'go', s starts with 'ready') -- confirm
# against the original material.
s = 'ready, steady, go'

# Trimming whitespace: split on commas, then strip both ends of each piece.
# Also see rstrip, lstrip for one-sided trimming.
[x.strip() for x in s.split(',')]

In [None]:
# Split on commas, trim every piece, then glue the pieces back together
# with a single space between them
' '.join(piece.strip() for piece in s.split(','))

In [None]:
# A string is already an iterable of characters, so it can be joined directly
'_#_'.join('abcde')

In [None]:
# Concatenating strings: the separator string's join() glues the pieces together.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
pieces = list('abcde')
for sep in ('::', '--', ' '):
    print(sep.join(pieces))

In [None]:
# Does a substring belong to a string? The `in` operator answers with a bool.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
print('steady' in s)
print('set' in s)

In [None]:
# Locate a substring: index() returns the offset of the first match
# and raises ValueError when the substring is absent
s.index('go')

In [None]:
# Display the full string for reference
s

In [None]:
# Two-character slice (end index is exclusive); presumably 15 is the offset
# reported by s.index('go') -- verify against the value of `s`
s[15:17]

In [None]:
# Sample sentence used by the index/find/count/endswith examples that follow
sentence = 'the sun rises in the east'

In [None]:
# index() and find() report the same offset when the substring is present;
# they differ only in how they signal a missing substring
sentence.index('east') == sentence.find('east')

In [None]:
# index() raises ValueError when the substring is absent. The error is the point
# of this cell, so catch and display it instead of letting it halt "Run All".
try:
    print(sentence.index('west'))
except ValueError as err:
    print('ValueError: {}'.format(err))

In [None]:
# find() signals "not found" with -1 rather than raising an exception.
# print() is called as a function so the cell runs on Python 3 as well as Python 2.
print(sentence.find('west'))

In [None]:
# Everything from offset 21 to the end of the string, i.e. the word 'east'
sentence[21:]

In [None]:
# Offset of the first occurrence of 'ris' (the start of 'rises')
sentence.find('ris')

In [None]:
# Number of times the letter 't' occurs in the sentence
sentence.count('t')

In [None]:
# Locate a substring: find() returns the offset of the first match, or -1 if there is none
s.find(',')

In [None]:
# Count non-overlapping occurrences of a substring
s.count(',')

In [None]:
# True when the string finishes with the given suffix
sentence.endswith('east')

In [None]:
s2 = 'the quick brown fox jumps over the lazy dog'

# Only the last expression of a cell is displayed automatically, so the find()
# result is printed explicitly; otherwise it would be silently discarded.
print(s2.find('fox'))

# print() is called as a function so the cell runs on Python 3 as well as Python 2.
print('lazy' in s2)
print(s2.endswith('dog'))

In [None]:
# True when the string begins with the given prefix
s.startswith('ready')
# similarly .endswith() tests the end of the string

<big>

These string functions become very important in conjunction with the `map()` method when we're trying to clean text data.


-----------------------------------------------------------------------------------------------------------------------------

<br><br>

<center>

# Plotting and Visualization

</center>
   
<br>

---



## 1. matplotlib basics


http://matplotlib.org/

* Run **`import matplotlib.pyplot as plt`**
* Create a figure object using **`plt.figure`**
* Add subplots to it using **`add_subplot`**
    * This creates **AxesSubplot** objects on which you can place plots
* Use a plotting command like **`plt.plot`** and matplotlib will place your plot on this canvas


### 1.1 Figure, Subplots, AxesSubplot objects and your plot

#### Create a 2x2 figure and add three plots to it


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Create an empty figure; figsize is (width, height)
fig = plt.figure(figsize=(6, 4))

In [None]:
# Run plt.figure? to check out figure options like size, dpi, color

# Lay out a 2 x 2 grid on the figure and claim its first three slots.
# add_subplot(rows, cols, k) returns the AxesSubplot sitting in slot k,
# so each name below is the handle used to draw on that panel.
axsp1, axsp2, axsp3 = [fig.add_subplot(2, 2, slot) for slot in (1, 2, 3)]

In [None]:
# Panel 1 -- time series: running total of 40 standard-normal draws, red dashed line
random_walk = np.random.randn(40).cumsum()
axsp1.plot(random_walk, 'r--')

# Panel 2 -- histogram: 400 standard-normal draws in 10 translucent blue bins
normal_sample = np.random.randn(400)
axsp2.hist(normal_sample, bins=10, color='b', alpha=0.3)

# Panel 3 -- scatterplot: a straight line (slope 4) with normal noise added
x_vals = np.arange(30)
axsp3.scatter(x_vals, 4 * x_vals + 6 * np.random.randn(30))
# Note: if you change an AxesSubplot object, re-run the cells above to rebuild the figure

# Show the finished figure
fig

------------------------------------------------------------------------------------------------------------------------

### 1.2 Shorthand to achieve the same effect

* Create a grid figure using **`plt.subplots`**
    * Syntax: `fig, axes = plt.subplots(rows, cols, figsize = (width, height), sharex=False, sharey=False)`
    
* It returns an array of **AxesSubplot** objects
* Reference them using basic indexing (Saves typing!)

`plt.subplots` has some interesting options such as `sharex/sharey` which are useful when comparing data on the same scale

Run `plt.subplots?` for more.

In [None]:
# 3 x 3 grid of panels; `axes` is a 2-D array indexed as axes[row, col]
fig, axes = plt.subplots(3, 3, figsize=(8, 4))

# Centre panel: cumulative sum of 50 standard-normal draws
# ('g-_' = green, solid line, horizontal-dash markers)
axes[1, 1].plot(np.random.randn(50).cumsum(), 'g-_')

# Bottom-centre panel: log10 curve. Start at 1 rather than 0, because log10(0)
# is -inf and numpy emits a divide-by-zero RuntimeWarning for it.
x_pos = np.arange(1, 31)
axes[2, 1].scatter(x_pos, np.log10(x_pos));

> NOTE:
`subplots_adjust` is a Figure method (also available as `plt.subplots_adjust`) that can be used to adjust figure parameters like the spacing between subplots

In [None]:
# 2 x 2 grid whose panels share both the x and the y axis
fig1, axes1 = plt.subplots(2, 2, figsize=(12, 4), sharex=True, sharey=True)

# axes1.flat visits the panels row by row; each gets its own fresh sample
for panel in axes1.flat:
    panel.hist(np.random.randn(500), bins=15, alpha=0.4, color='c')

# Set the horizontal / vertical gaps between panels
plt.subplots_adjust(wspace=0.2, hspace=0.2)
# Try commenting out the plt.subplots_adjust line and re-running. See what happens

------------------------------------------------------------------------------------------------------------------------

### 1.3 Plot Formatting

#### a. Color, Linestyle and Markers

The `plot` function takes `x, y` and optionally an abbreviation to specify `marker, color, and style`

Example: Abbreviations work as `color-marker-style`, so `'g--'` means color = 'green' and linestyle = '--'

In [None]:
# Format string 'b*-': blue, star markers, solid line.
# The trailing ';' suppresses the text repr of the returned objects.
plt.plot(np.sin(np.arange(50)), 'b*-');

#### b. Ticks, Labels, Legends

In [None]:
# One figure holding a single set of axes, created in one call
f, ax1 = plt.subplots(figsize=(8, 5))

# Sine wave scaled by 6 and shifted up by 4: green, star markers, solid line
ax1.plot(4 + 6 * np.sin(np.arange(50)), 'g*-');

In [None]:
# Set the tick positions, chart title and both axis labels in a single call;
# each keyword maps onto the matching ax1.set_<name>() method
ax1.set(
    xticks=[5, 15, 25, 35, 45],
    title='This is a Sine Curve',
    xlabel='X',
    ylabel='4 + 6 * sin(X)',
)

# Re-display the updated figure
f

In [None]:
# Add more plots to the same axes.
# The log curve starts at x = 1: log(0) is -inf and numpy emits a
# divide-by-zero RuntimeWarning for it. Passing x explicitly keeps the
# curve at the correct horizontal positions.
x_pos = np.arange(1, 50)
ax1.plot(x_pos, np.log(x_pos), 'r', label='log(x)')
ax1.plot(np.sqrt(np.arange(50)), 'b*--', label='sqrt(x)')

# Add a legend; loc='best' lets matplotlib choose where to place it
ax1.legend(loc='best')

# Re-display the updated figure
f

### 1.4 Saving plots to file

**Syntax**: `plt.savefig('file-path.extension', dpi=<dots-per-inch>)`

------------------------------------------------------------------------------------------------------------------------

## 2. Plotting in `pandas`

* There are high level plotting methods that take advantage of the fact that data are organized in DataFrames (have index, colnames)
* Both `Series` and `DataFrame` objects have a `.plot()` method for making different plot types
* Other parameters that can be passed to `.plot()` are:
    * `xticks, xlim, yticks, ylim`
    * `label`
    * `style` (as an abbreviation, e.g. `'g--'`) and `alpha`
    * `grid=True`
    * `rot` (rotate tick labels by an angle, 0-360)
    * `use_index` (use index for tick labels)
    * `subplots=False`

### 2.1 One variable (plotting a Series)

In [None]:
# Random walk: cumulative sum of 100 standard-normal draws, named at construction
s = Series(np.random.randn(100).cumsum(), name='random_time_series')

# With no `kind` given, .plot() draws a line chart
s.plot();

<big> 

Two ways of specifying the kind of plot to make

- `X.plot(kind=<plottype>)`
- `X.plot.<plottype>`

Where X is a Series or a DataFrame

---

In [None]:
# Same series, now with a legend entry, a title, fixed axis limits and a green line
s.plot(legend=True, 
       title='My First Pandas Plot',
       xlim=(0, 100), 
       ylim=(-20, 20), 
       style='g');


In [None]:
# Draw the base series and keep the returned axes, so the second series
# is explicitly placed on the same chart
ax = s.plot(
    legend=True,
    title='My First Pandas Plot',
    xlim=(0, 100),
    ylim=(-20, 20),
    style='g',
)

# A scaled copy of the series; its name becomes its legend label
s2 = (s * 1.3).rename('derived_series')
s2.plot(ax=ax, legend=True)

In [None]:
# Chart with options: no grid, no legend, explicit title, axis limits and
# tick positions, a semi-transparent red dashed line, and a 7 x 3 figure
s.plot(grid=False, 
       legend=False,
       label='timeseries',
       title='Random Normal Numbers - Cumulative Series',
       xlim=(0, 100), 
       ylim=(-20, 20),
       xticks=np.arange(0, 100, 10), 
       yticks=np.arange(-10, 10, 2),
       style='r--', 
       alpha=0.5,
       figsize=(7, 3)
      );

# Write the current figure to a PNG file at 200 dpi
plt.savefig('time_series_001.png', dpi=200)

In [None]:
# One variable as a histogram: 10,000 standard-normal draws in 50 bins,
# using the .plot.hist accessor spelling
normal_draws = Series(np.random.randn(10000))
normal_draws.plot.hist(bins=50, color='r', alpha=0.2, title='A histogram');

In [None]:
from scipy.stats import norm

# Two normal samples of equal size with different centres (loc) and spreads (scale)
N_DRAWS = 10000
s2 = norm.rvs(loc=4, scale=2.5, size=N_DRAWS)
s3 = norm.rvs(loc=-2, scale=0.5, size=N_DRAWS)

In [None]:
# Overlay the two samples as histograms, using both spellings of the call
# (.plot.hist and .plot(kind='hist')), then save the figure to disk
Series(s2).plot.hist(bins=50, color='g', alpha=0.8)
Series(s3).plot(kind='hist', bins=50, color='b', alpha=0.2)
plt.savefig('twoHistograms.png')

In [None]:
# 25 random integers from 0-9, tallied per value and ordered by the value itself
digit_counts = Series(np.random.randint(0, 10, 25)).value_counts().sort_index()

# One bar per distinct value
digit_counts.plot.bar()

## Plotting with Titanic

In [None]:
# Load the Titanic data; the path is relative to the notebook's working directory
df = pd.read_csv('./data/titanic.csv')

In [None]:
# Horizontal bar chart of the number of rows per distinct 'Embarked' value
df['Embarked'].value_counts().plot.barh(figsize=(3, 3));

In [None]:
# Histogram of the 'Age' column using 20 bins
df['Age'].plot.hist(bins=20, figsize=(3, 3))

In [None]:
# Histogram of the 'Fare' column using 20 bins
df['Fare'].plot.hist(figsize=(3, 3), bins=20);

------------------------------------------------------------------------------------------------------------------------

### 2.2 Multiple Variables (plotting a DataFrame)

We can choose between plotting
* All Variables on one plot
* Each variable on a separate plot

In addition to the parameters above, `DataFrame.plot` also takes
* `subplots=False` (default is to plot all on the same figure)
* `sharex=False, sharey=False`
* `figsize`
* `title, legend`
* `sort_columns`

### a. Variables on the same plot

In [None]:
# Row labels Day_0 ... Day_4999 and one column per ticker symbol
day_labels = ['Day_{}'.format(d) for d in range(5000)]
tickers = ['APL', 'FBK', 'GOOG', 'MCRS', 'TWTR']

# Standard-normal daily moves, accumulated into a random walk per column
# and rounded to 3 decimals
daily_moves = DataFrame(np.random.randn(5000, 5), index=day_labels, columns=tickers)
df = daily_moves.cumsum().round(3)

# Peek at the first four rows
df[:4]

In [None]:
# Default plot: every column drawn as a line on the same chart
df.plot(figsize=(10, 4));

------------------------------------------------------------------------------------------------------------------------

### b. Each variable on its own plot

In [None]:
# subplots=True gives each column its own panel
df.plot(figsize=(5, 10), subplots=True);

In [None]:
# One panel per column again, this time with a shared y-axis (sharey=True)
# so the panels can be compared on the same scale
df.plot(figsize=(5, 10), subplots=True, sharey=True);

### c. Barplots

This is as simple as passing `kind='bar'` or `kind='barh'` (for horizontal bars) to the `.plot()` method

#### One Variable (simple barplot)

In [None]:
# Three stacked panels: vertical bars, horizontal bars, and a plain line
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 14))
top_panel, middle_panel, bottom_panel = axes

# Ten uniform random values labelled 'a' through 'j'
s = Series(np.random.rand(10), index=list('abcdefghij'))

# The same data drawn three ways, each directed to its own panel via `ax`
s.plot(ax=top_panel, kind='bar', color='k', alpha=0.6)
s.plot(ax=middle_panel, kind='barh', color='k')
s.plot(ax=bottom_panel, color='g')

In [None]:
# 5 x 5 frame of uniform random values: rows A-E, columns P-T
df = DataFrame(
    np.random.rand(5, 5),
    index=list('ABCDE'),
    columns=list('PQRST'),
)

# Display the frame
df

In [None]:
# Stacked bar chart: one bar per row, with the column values stacked within it;
# then write the current figure to a JPEG file
df.plot(kind='bar', stacked=True, figsize=(10, 8))
plt.savefig('stackedBarcharts.jpeg')

> Note: Functions `value_counts()` and `pd.crosstab()` prove helpful to prepare data for stacked bar charts

------------------------------------------------------------------------------------------------------------------------

### d. Histograms & Density Plots

* _Histograms_: Pass `kind='hist'` to the `.plot()` method or use the `.hist()` method
* _Density Plots_: Use `kind='kde'`


### Using the `.hist()` method

In [None]:
# Histogram of 1,000 standard-normal draws via the dedicated .hist() method
Series(np.random.randn(1000)).hist(bins=20, alpha=0.4)

### Using the `.plot()` method

In [None]:
# The same kind of histogram requested through .plot(kind='hist')
pd.Series(np.random.randn(1000)).plot(kind='hist', bins=20, color='orange')

### KDE

In [None]:
# Kernel density estimate of 10,000 standard-normal draws, drawn in blue
s = Series(np.random.randn(10000))
s.plot(kind='kde', color='b') 

In [None]:
# A bimodal distribution: pool two normal samples with different centres
s1 = np.random.normal(0, 1, 2000)
s2 = np.random.normal(9, 2, 2000)

v = pd.Series(np.concatenate([s1, s2]))

# density=True puts the histogram on the same probability-density scale as
# the KDE curve. With raw counts on the y-axis the KDE line would be squashed
# flat against the x-axis and the overlay would show nothing useful.
v.hist(bins=100, alpha=0.4, color='k', density=True)
v.plot(kind='kde', style='k--');

------------------------------------------------------------------------------------------------------------------------

### e. Scatter Plots

- `.plot(kind='scatter')`
- `.scatter()`

In [None]:
# Shared base sequence 0..49 from which all three columns are derived
base = np.arange(50)

df = pd.DataFrame({
    'A': base,                            # the sequence itself
    'B': base + np.random.randn(50),      # the sequence plus standard-normal noise
    'C': np.sqrt(base) + np.sin(base),    # square-root trend with a sine wiggle
})

# Show the first ten rows
df.head(10)

In [None]:
# Two variable Scatterplot using matplotlib directly:
# column 'B' on the x-axis, column 'C' on the y-axis
plt.scatter(df['B'], df['C'])
plt.title('Scatterplot of X and Y')

In [None]:
# The same scatterplot through DataFrame.plot, naming the x and y columns
df.plot(kind='scatter', x='B', y='C', title = 'Scatterplot')

In [None]:
# Accessor spelling of the same scatterplot, with red points
df.plot.scatter(x='B', y='C', title = 'Scatterplot', color='r')