# Welcome to the Dark Art of Coding:
## Introduction to Python
graphing with matplotlib

<img src='../images/dark_art_logo.600px.png' width='300' style="float:right">

# Prerequisites:

* pandas
* matplotlib
* numpy

IF you don't already have these installed in your virtual environment, **activate your virtualenv** and **conda install** them:

```bash
conda install pandas numpy matplotlib
```

In [None]:
# generate a horizontal bar chart
# (based on an example from the matplotlib website...)
# http://matplotlib.org/1.2.1/examples/pylab_examples/barh_demo.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# we'll use pandas to read in some data, but we'll only keep one of the columns

df = pd.read_csv('../universal_datasets/log_file.csv', names=['name',
                                                  'email',
                                                  'fmip',
                                                  'toip',
                                                  'date',
                                                  'lat',
                                                  'long',
                                                  'payload'],
                                            usecols=['name'])

In [None]:
# to generate this graph, we are gonna make up
# a set of random numbers between 
# 42 and 48.5

In [None]:
# NOTE:
#     0) the function len(df) returns the number of rows in df
#     1) np.random.rand yields numbers between [0 and 1].
#

df['fan_rating'] = 42 + 6.5 * np.random.rand(len(df))

In [None]:
df

In [None]:
# as a first step, let's clean up the data by dropping
# duplicate rows using these rules/parameters:
#     0) check whether there is a duplicate in a given column
#     1) keep only the last value of any duplicates
#        (default is to take the first)
#     2) replace the dataframe with our new deduplicated version

In [None]:
df.drop_duplicates(subset='name', inplace=True, keep='last')


In [None]:
df

In [None]:
# let's convert the content of the name column from
# first & last to just first name
# we do this by generating a function to do 
# the conversion on a single value

In [None]:
def fname_only(full_name):
    # change
    fname = full_name.split(' ')[0]
    return fname

In [None]:
# the map() function associated with dataframe columns maps
# the named function against
# every item in every row of the given column
# NOTE:
#     Remember, pandas has several ways to refer to columns:
#     0) when creating a new column, you must use bracket
#        syntax: df['<colname>']
#     1) to refer to an existing column, you are free to use either
#        bracket syntax OR dot notation syntax: df.<colname>
#        I find the dot notation easier to type, so defer
#        to it when possible

In [None]:
df['fname'] = df['name'].map(fname_only)

In [None]:
df

In [None]:
# lastly, let's extract just the values out of
#     the fname column.
# this is a list-like object and will be used 
#     as the names for our horizontal bars later.

In [None]:
people = df['fname'].values

In [None]:
type(people)

In [None]:
people

In [None]:
# now we can put together a sequence that will
# store the y position values. we'll use np.arange to
# create an array of sequential values based on
# the length of the people array
# this will be used as a set of temporary names
# for our horizontal bars...
# ultimately, we will replace it with real names...

# plus, we will save off the fan rating as a series for use
# in the plotting software as the x values

In [None]:
y_pos = np.arange(len(people))

performance = df['fan_rating']

In [None]:
# based on this preliminary work, we can generate a
# simple horizontal bar chart
# we give it y values and x values
# and we tell it to show() the plot

In [None]:
plt.barh(y_pos, performance)
plt.show()

In [None]:
# presume we don't like the alignment of the names
# against the horizontal bars.
# we can align the names to the center of the bars.

In [None]:
plt.barh(y_pos, performance, align='edge')
plt.show()

In [None]:
# what if we don't like the dark color? we can revise the color to have a
# greater degree of transparency by setting the alpha characteristic

In [None]:
plt.barh(y_pos, performance, alpha=0.4)
plt.show()

In [None]:
# what if we don't like the default color? 
# we can revise the color as desired
# matplotlib has a number of default colors:
#     'r' stands for red

In [None]:
plt.barh(y_pos, performance, align='center', alpha=0.4, color='r')
plt.show()

In [None]:
# what if our data comes with some percentage of error?
# this can be represented using an xerr parameter.
# here, we again use random to generate a random 
# set of values that we can
# then use to generate error bars. 
# in real life, you would have some
# margin for error for all the values.

In [None]:
error = 2 * np.random.rand(len(people))

plt.barh(y_pos, performance, xerr=error, 
        align='center', alpha=0.4, color='r')
plt.show()

In [None]:
# this still leaves off info that most of us
# would want, in terms of the
# people names, title, axis labels, etc.

In [None]:
plt.barh(y_pos, performance, xerr=error, 
         align='center', alpha=0.4, color='r')

plt.yticks(y_pos, people)
plt.xlabel('rating')
plt.title("Estimated Justice League's Fan Ratings")


In [None]:
plt.savefig('hbar.png')
plt.show()

In [None]:
# creating a histogram with some additional features
# (based heavily on an example from the matplotlib website...)
# http://matplotlib.org/1.2.1/examples/api/histogram_demo.html

# In addition to the basic histogram, this demo shows a few optional
# features:
#     * setting the number of data bins
#     * the ''normed'' flag, which normalizes bin heights so that the
#       integral of the histogram is 1. the resulting histogram is a
#       probability density
#     * setting the face color of the bars
#     * setting the opacity (alpha value)

# As always, we start by importing the appropriate libraries

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

In [None]:
# we define some of the characteristics of our distribution:
mu = 100       # mean of the distribution
sigma = 15     # standard deviation of the distribution

In [None]:
# np.random.randn creates a random array of values taken from a
# standard normal distribution
# we can transform the distribution by multiplying the values by sigma and
# by adding the mean mu

In [None]:
x = mu + sigma * np.random.randn(10000)
print(x)

In [None]:
# we set a specific number of bins that we will group our values into...

In [None]:
num_bins = 50

In [None]:
# next we create the data for the histogram
# n = values (height) for each bar of the histogram
# bins = the indicators that demarck the bins
# patches = are the rectangle objects that represent each bar

In [None]:
n, bins, patches = plt.hist(x, num_bins, normed=1,
                            facecolor='green', alpha=0.5)
print(n)
print(bins)
# print(patches)
# for item in patches:
#     print(item)

plt.show()

In [None]:
# from here, we can add a 'best fit' line...
# the normal probability density function (pdf) helps to answer the question:
# "how common are samples at a given value?'

In [None]:
n, bins, patches = plt.hist(x, num_bins,
                            normed=1, 
                            facecolor='green',
                            alpha=0.5)

y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r--')

plt.show()

In [None]:
# And let's add some labels...

n, bins, patches = plt.hist(x, num_bins, normed=1,
                            facecolor='blue', alpha=0.7)
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'k--')

plt.xlabel('Smarts')
plt.ylabel('Probability')

plt.show()

In [None]:
# the opening '$\' and the closing '$' tags in the following commands enable you to
# include italics in the title

In [None]:
n, bins, patches = plt.hist(x, num_bins, normed=1,
                            facecolor='red', alpha=0.5)
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'ro')
plt.xlabel('Smarts')
plt.ylabel('Probability')

plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')

plt.show()

In [None]:
n, bins, patches = plt.hist(x, num_bins, normed=1,
                            facecolor='blue', alpha=0.5)
y = mlab.normpdf(bins, mu, sigma)
plt.plot(bins, y, 'r--')
plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')

plt.show()

plt.savefig('hist.png')

In [None]:
# generate a scatter plot
# (based heavily on an example from the matplotlib website...)
# http://matplotlib.org/examples/shapes_and_collections/scatter_demo.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Let's read in a csv via pandas but keep only two of the columns

In [None]:
df = pd.read_csv('../universal_datasets/log_file_1000.csv', names=['name',
                                                   'email',
                                                   'fmip',
                                                   'toip',
                                                   'date',
                                                   'lat',
                                                   'long',
                                                   'payload'],
                                            nrows=35,
                                            usecols=['lat', 'long'])

In [None]:
# next, let's create two arrays from the lat and long columns

In [None]:
df

In [None]:
latitudes = df['lat']
longitudes = df['long']

In [None]:
# here we calculate the length of one of the arrays

count = len(latitudes)

In [None]:
# we'll use numpy to create a random collections of numbers for use in defining colors
# of the circles in our scatterplot

In [None]:
colors = np.random.rand(count)

In [None]:
# this step creates a random set of areas based on a radiuses from 0 to 20
# we can imagine that this is indicative of a the frequency of communications OR
# something similarly awesome and nerdy like that.
# np.pi gives us pi to 15 decimal places
# np.random.rand gives us random numbers from 0 to 1

In [None]:
areas = np.pi * (20 * np.random.rand(count))**2

In [None]:
# generate the scatter plot

In [None]:
plt.scatter(latitudes, longitudes, s=areas, c=colors) #, alpha=0.4)

In [None]:
# we chose to create stand alone variables (latitude, longitude) but these are not
# required... we can just as happily read in directly from df.lat or df.long:
# plt.scatter(df.lat, df.long, s=area, c=colors, alpha=0.4)

In [None]:
plt.savefig('scatter.png')
plt.show()

In [None]:
plt.savefig?