# CS 5010

## Learning Python (Python version: 3)


# Plotting and Visualization

#### From Python for Data Analysis, Ch. 9 

In [None]:
import numpy as np
import pandas as pd
# Remember pandas' current row-display limit before overriding it
# (the saved value is not referenced again in the visible cells).
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Cap pandas' display at 20 rows.
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random data below is repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
import matplotlib
# Default size for every figure created from here on.
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 digits of precision and no scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Select matplotlib's interactive 'notebook' backend.
# NOTE(review): later cells modify figures created in earlier cells, which
# presumably relies on this interactive backend - confirm before changing it.
%matplotlib notebook

## A Brief matplotlib API Primer

In [None]:
import matplotlib.pyplot as plt

Example of a simple data plot

In [None]:
import numpy as np

# Sample data: the integers 0 through 9.
data = np.arange(10)
# Only the last expression of a cell is echoed automatically, so a bare `data`
# here would be silently discarded. `display` is the built-in provided by the
# IPython/Jupyter kernel.
display(data)
# Plot the values against their positions 0..9.
plt.plot(data)

### Figures and Subplots

In [None]:
# Start an empty figure and claim the first three slots of its 2 x 2 grid.
fig = plt.figure()
ax1, ax2, ax3 = (fig.add_subplot(2, 2, slot) for slot in (1, 2, 3))

In [None]:
# Cumulative sum of 50 standard-normal draws, drawn as a black dashed line.
plt.plot(np.cumsum(np.random.randn(50)), 'k--')

In [None]:
# Putting it all together:
fig = plt.figure()
ax1, ax2, ax3 = (fig.add_subplot(2, 2, slot) for slot in (1, 2, 3))

# Slot 1: translucent black histogram of 100 standard-normal draws.
ax1.hist(np.random.randn(100), color='k', alpha=0.3, bins=20)

# Slot 2: 0..29 on x against the same values plus scaled normal noise on y.
ax2.scatter(np.arange(30), 3 * np.random.randn(30) + np.arange(30))

# Slot 3: green dashed line.
# (A solid green line would be: ax3.plot(np.arange(10), color='g'))
ax3.plot(np.arange(10), 'g--')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# A grid of subplots with 2 rows and 3 columns; echo the returned axes array.
fig, axes = plt.subplots(nrows=2, ncols=3)
axes

#### Adjusting the spacing around subplots

subplots_adjust(left=None, bottom=None, right=None, top=None,
                wspace=None, hspace=None)

In [None]:
# Four histograms on a 2 x 2 grid whose panels share both the x and y axes.
fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
for panel in axes.flat:  # row-major order: [0, 0], [0, 1], [1, 0], [1, 1]
    panel.hist(np.random.randn(500), bins=50, color='k', alpha=0.5)
# Remove the horizontal and vertical padding between the panels.
fig.subplots_adjust(wspace=0, hspace=0)

### Colors, Markers, and Line Styles

In [None]:
from numpy.random import randn

# A format string packs color, marker and line style into one argument:
#   ax.plot(x, y, 'g--')
# is the short form of
#   ax.plot(x, y, linestyle='--', color='g')
plt.figure()
plt.plot(np.cumsum(randn(30)), 'ko--')

In [None]:
# Another way of plotting, with color, line style and marker spelled out
# explicitly as keyword arguments
plt.plot(np.cumsum(randn(30)), marker='o', linestyle='dashed', color='k')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
data = np.cumsum(np.random.randn(30))

# The same series drawn twice: as a dashed line and as a 'steps-post' line.
plt.plot(data, 'k--', label='Default')
plt.plot(data, 'k-', label='steps-post', drawstyle='steps-post')
plt.legend(loc='best')

### Ticks, Labels, and Legends

#### Setting the title, axis labels, ticks, and ticklabels

In [None]:
# A 1000-step random walk on a figure with a single subplot.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.cumsum(np.random.randn(1000)))

In [None]:
# Place x ticks every 250 units and replace the numbers with rotated text labels.
ticks = ax.set_xticks(list(range(0, 1001, 250)))
labels = ax.set_xticklabels(
    ['one', 'two', 'three', 'four', 'five'],
    fontsize='small', rotation=30,
)

In [None]:
# Title the subplot and name its x axis; `ax` comes from an earlier cell.
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')


In [None]:
# Putting it all together:
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))

# Ticks every 250 units, shown as small rotated text labels.
ticks = ax.set_xticks(list(range(0, 1001, 250)))
labels = ax.set_xticklabels(
    ['one', 'two', 'three', 'four', 'five'],
    fontsize='small', rotation=30,
)

ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')

# Equivalent shortcut for the two calls above:
#   props = {'title': 'My first matplotlib plot', 'xlabel': 'Stages'}
#   ax.set(**props)

#### Adding legends

In [None]:
from numpy.random import randn

# One figure with a single subplot holding three labelled random walks.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(randn(1000).cumsum(), 'k', label='one')
ax.plot(randn(1000).cumsum(), 'k--', label='two')
ax.plot(randn(1000).cumsum(), 'k.', label='three')

# The explanatory text after the call must be a comment; without the '#'
# the cell fails with a SyntaxError.
ax.legend(loc='best')  # selects the best location for the legend

In [None]:
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Three random walks; the `label` of each one is what the legend shows.
ax.plot(np.cumsum(randn(1000)), 'k', label='one')
ax.plot(np.cumsum(randn(1000)), 'k--', label='two')
ax.plot(np.cumsum(randn(1000)), 'k.', label='three')

# loc='best' selects the best location for the legend
ax.legend(loc='best')

### Annotations and Drawing on a Subplot

ax.text(x, y, 'Hello world!', family='monospace', fontsize=10)


Use the command 'pwd' to display the current working directory. 
You'll use this when reading in a .csv file in the following example.

In [None]:
# IPython "automagic" shorthand for the %pwd magic, as the note above describes:
# displays the current working directory.
pwd

In [None]:
import pandas as pd
from datetime import datetime

fig, ax = plt.subplots()

# E.g. Path: 'C:\\Users\\<username>/pyDatasets/spx.csv'
''' *IMPORTANT: Change this path before running this code 
(and create the directory 'pyDatasets' if you like)!* '''
data = pd.read_csv('~/pyDatasets/spx.csv', parse_dates=True, index_col=0)
# data = pd.read_csv('examples/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

# Draw the 'SPX' column as a solid black line on the axes created above.
spx.plot(style='k-', ax=ax)

# (date, caption) pairs to call out on the chart.
crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy'),
]

for date, label in crisis_data:
    # spx.asof(date) looks up the series value as of that date; the arrow tip
    # sits 75 above it and the caption text 225 above it.
    ax.annotate(
        label,
        xy=(date, 75 + spx.asof(date)),
        xytext=(date, 225 + spx.asof(date)),
        arrowprops={'facecolor': 'black', 'headwidth': 4, 'width': 2,
                    'headlength': 4},
        horizontalalignment='left',
        verticalalignment='top',
    )

# Zoom in on 2007-2010
ax.set_xlim('1/1/2007', '1/1/2011')
ax.set_ylim(600, 1800)

ax.set_title('Important dates in the 2008-2009 financial crisis')

#### Creating and drawing (plotting) shapes

In [None]:
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(1, 1, 1)

# Each shape is a patch object: build it, then attach it to the axes.
rect = plt.Rectangle(xy=(0.2, 0.75), width=0.4, height=0.15,
                     color='k', alpha=0.3)
ax.add_patch(rect)

circ = plt.Circle(xy=(0.7, 0.2), radius=0.15, color='b', alpha=0.3)
ax.add_patch(circ)

# The polygon is given by its three corner points.
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)
ax.add_patch(pgon)

### Saving Plots to File

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample points 0.00, 0.01, ..., 1.99 and a sine wave of period 1 shifted up by one.
t = np.arange(0.0, 2.0, 0.01)
s = np.sin(2 * np.pi * t) + 1
plt.plot(t, s)

# Label the figure and switch the grid on.
plt.title('Voltage vs Time (Example of Saving Plots to a File)')
plt.xlabel('time (s)')
plt.ylabel('voltage (mV)')
plt.grid(True)

# Write the figure to disk first, then display it.
''' *IMPORTANT: Change this path before running this code!* '''
plt.savefig('C:/Users/nb3f/pySavedFiles/test.png')
plt.show()

### matplotlib Configuration

#### Use of 'rc' method to modify configuration

matplotlib comes configured with color schemes and defaults that are geared primarily toward preparing figures for publication.

Most of these default settings can be customized via an extensive set of global parameters governing figure size, subplot spacing, colors, font sizes, grid styles, and so on. 

One way to modify the configuration is to use the 'rc' method. For example, to set the global default figure size to be 10x10, enter the following code:

plt.rc('figure', figsize=(10, 10))

The first argument to rc is the component you wish to customize, such as 'figure', 'axes', 'xtick', 'ytick', 'grid', 'legend', or many others. 

After the first argument, you can pass a sequence of keyword arguments giving the new parameter values. An easy way to write down the options in your program is as a dict (dictionary):

In [None]:
# No need to run this code, just an illustration of changing font options:

# font_options = {'family' : 'monospace',
#                 'weight' : 'bold',
#                 'size'   : 'small'}
# plt.rc('font', **font_options) 

## Plotting with pandas and seaborn

### Line Plots

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# A 10-step random walk indexed by 0, 10, ..., 90, plotted straight from the Series.
s = pd.Series(np.cumsum(np.random.randn(10)), index=np.arange(0, 100, 10))
s.plot()

In [None]:
df = pd.DataFrame(
    np.random.randn(10, 4).cumsum(axis=0),  # running total down each column
    index=np.arange(0, 100, 10),
    columns=list('ABCD'),                   # legend headers
)
df.plot()

In [None]:
# Another similar example

df = pd.DataFrame(
    np.random.randn(10, 6).cumsum(axis=0),  # six columns -> six lines
    index=np.arange(0, 100, 10),
    columns=list('ABCDEF'),                 # legend headers
)
df.plot()

### Bar Plots

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1)
data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))

# Vertical bars on the first axes, horizontal bars on the second.
data.plot(kind='bar', ax=axes[0], color='c', alpha=0.7)
data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)

Another bar plot example

In [None]:
# Example from pythonspot.com

import numpy as np
import matplotlib.pyplot as plt

# data to plot
n_groups = 4
means_frank = (90, 55, 40, 65)
means_guido = (85, 62, 54, 20)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)  # x position of the first bar in each group
bar_width = 0.35
opacity = 0.8

# align='center' is stated explicitly because the tick placement below
# depends on each bar being centred on its x position.
rects1 = ax.bar(index, means_frank, bar_width, align='center',
                alpha=opacity, color='b', label='Frank')

# Shift the second series one bar width to the right so the bars sit side by side.
rects2 = ax.bar(index + bar_width, means_guido, bar_width, align='center',
                alpha=opacity, color='g', label='Guido')

ax.set_xlabel('Person')
ax.set_ylabel('Scores')
ax.set_title('Scores by person')
# The middle of a two-bar group lies half a bar width to the right of the
# first bar's centre. (index + bar_width would label the second bar only.)
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(('A', 'B', 'C', 'D'))
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Re-seed so this example produces the same random frame on every run.
np.random.seed(12348)

# 6 x 4 frame of uniform random values; the column index is named 'Genus'.
df = pd.DataFrame(np.random.rand(6, 4),
                  index=['one', 'two', 'three', 'four', 'five', 'six'],
                  columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
# Only the last expression of a cell is echoed automatically, so a bare `df`
# here would be silently discarded. `display` is the built-in provided by the
# IPython/Jupyter kernel.
display(df)
df.plot.bar()

In [None]:
# DataFrame.plot creates its own figure when no axes are supplied, so a bare
# plt.figure() before it leaves an empty extra figure behind. Create the
# figure once and hand its axes to pandas instead.
stacked_fig = plt.figure()
df.plot.barh(ax=stacked_fig.add_subplot(1, 1, 1), stacked=True, alpha=0.5)

In [None]:
# Close every open matplotlib figure.
plt.close('all')

Next example uses the "tips" data set. Ensure the dataset is saved in a location you are aware of!

In [None]:
#tips = pd.read_csv('<Your Path>/tips.csv')
tips = pd.read_csv('C:/Users/nb3f/pyDatasets/tips.csv') # Example Path
# Cross-tabulate: one row per value of 'day', one column per value of 'size'.
party_counts = pd.crosstab(index=tips['day'], columns=tips['size'])
party_counts

What does `crosstab` do? Look it up and get a high-level understanding of what it is and how it works.

In [None]:
# Not many 1- and 6-person parties
# Keep only the columns labelled 2 through 5 (label slices include both ends).
party_counts = party_counts.loc[:, slice(2, 5)]
party_counts

In [None]:
# Normalize so that each row sums to 1: divide every row by its own total
party_pcts = party_counts.div(party_counts.sum(axis=1), axis='index')
party_pcts

In [None]:
# Bar chart of the normalized table.
party_pcts.plot(kind='bar')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

#### Bar plot example using Seaborn

In [None]:
import seaborn as sns
# tips before the new column is added: show the first 5 rows
tips.head(5)

In [None]:
# adding new column to tips: the tip divided by the bill with the tip taken out
tips['tip_pct'] = tips['tip'].div(tips['total_bill'].sub(tips['tip']))
tips.head() # show the first 5 rows, now including tip_pct

In [None]:
# Data is tips; the bar length comes from the tip_pct column
# One bar per value of 'day'
# orient='h' draws the bars horizontally
sns.barplot(data=tips, x='tip_pct', y='day', orient='h')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# Data is tips; the bar length comes from the tip_pct column
# Organization: nested grouping by two variables (day, then time via `hue`)
# orient='h' draws the bars horizontally
sns.barplot(data=tips, x='tip_pct', y='day', hue='time', orient='h')

In [None]:
# Close every open matplotlib figure.
plt.close('all')

In [None]:
# Switch seaborn's global style to "whitegrid" for the plots that follow.
sns.set(style="whitegrid")  # Add a grid for the next section

### Histograms and Density Plots

#### Histogram

In [None]:
# Open a fresh figure for the histogram drawn in the next cell.
plt.figure()

In [None]:
# Histogram of the tip_pct column, split into 50 bins.
tips['tip_pct'].plot(kind='hist', bins=50)

#### Density Plots

In [None]:
# Open a fresh figure for the density plot drawn in the next cell.
plt.figure()

In [None]:
# Density plot of the tip_pct column.
tips['tip_pct'].plot(kind='density')

In [None]:
# Open a fresh figure for the distribution plot drawn in the next cell.
plt.figure()

In [None]:
# Note: after running this code segment you may get a deprecation warning,
# but it should still work
comp1 = np.random.normal(loc=0, scale=1, size=200)
comp2 = np.random.normal(loc=10, scale=2, size=200)
# join the two distributions end to end into a single Series
values = pd.Series(np.hstack([comp1, comp2]))
sns.distplot(values, color='g', bins=100) # 100 bins, and 'g' = green color

### Scatter or Point Plots

Next example uses the "macrodata.csv" data set. Ensure the dataset is saved in a location you are aware of!

In [None]:
#macro = pd.read_csv('<your path>/macrodata.csv')
macro = pd.read_csv('C:/Users/nb3f/pyDatasets/macrodata.csv') # example
# Keep only the four columns of interest.
data = macro.loc[:, ['cpi', 'm1', 'tbilrate', 'unemp']]
# Row-to-row change in the natural log of each column; rows containing NaN
# (such as the first row, which has no predecessor) are dropped.
trans_data = np.log(data).diff().dropna()
# Show the last five rows.
trans_data.tail(5)

#### Scatter Plot

In [None]:
# Open a fresh figure for the scatter/regression plot drawn in the next cell.
plt.figure()

In [None]:
# Pass the column names by keyword: seaborn 0.12 and later no longer accept
# x and y as positional arguments, while the keyword form works in every release.
sns.regplot(x='m1', y='unemp', data=trans_data)  # m1 vs unemp columns to plot
plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))

#### Point Plot

A great tool for visualizing the data and comparing attributes (columns) against each other.

In [None]:
# Grid of pairwise plots for the columns of trans_data: 'kde' plots on the
# diagonal, and alpha=0.2 passed through to the off-diagonal plots.
sns.pairplot(trans_data, plot_kws={'alpha': 0.2}, diag_kind='kde')

### Facet Grids and Categorical Data

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn,
# so call catplot; the arguments used here are unchanged.
sns.catplot(x='day', y='tip_pct', hue='time', col='smoker',
            kind='bar', data=tips[tips.tip_pct < 1])

# Using tips data set
# Organization:  nested grouping by day and time
# One column of plots per value of 'smoker' (i.e. no vs yes)
# Using only the rows where tip_pct < 1

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn,
# so call catplot; the arguments used here are unchanged.
sns.catplot(x='day', y='tip_pct', row='time',
            col='smoker',
            kind='bar', data=tips[tips.tip_pct < 1])

# Similar to above, only now one panel per combination of time (rows) and
# smoker (columns): dinner and no, dinner and yes, lunch and no, lunch and yes

#### Box Plot

Using tip_pct and day

Data: where tip_pct < 0.5

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn,
# so call catplot; the arguments used here are unchanged.
# Box plot of tip_pct for each day, using only the rows where tip_pct < 0.5.
sns.catplot(x='tip_pct', y='day', kind='box',
            data=tips[tips.tip_pct < 0.5])