# Learning to visualize data using a sample Iris bioclim dataset
    - First we need to load the Python packages we will use
    - we use the `import` statement to load a package
    - the keyword `as` lets us assign an abbreviation for the package
    

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=FutureWarning)
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline


# Step 1 load the data
    - Load in a dataset and save it as a dataframe named 'df'
    - use the .head() function on df to look at the top rows

In [None]:
# Read the comma-separated occurrence table into a DataFrame named df.
csv_path = "Iris_bioclim_val.csv"
df = pd.read_csv(csv_path, sep=",")
# Leaving this expression last in the cell makes the notebook show the top rows.
df.head()

# Dealing with duplicates
    - Notice that rows 3 and 4 are duplicate occurrences.  We need to remove these.
The resolution of the bioclim layers is ~1 square kilometer, so we don't want multiple points within the same grid square.  Just to be on the conservative side, we will round lat/lon to 3 decimal places and remove any duplicated coordinates.

In [None]:
# Round both coordinate columns to 3 decimal places so that near-identical
# points collapse onto the same (lat, lon) pair.
for coord in ('lon', 'lat'):
    df[coord] = df[coord].round(3)

# Keep only the first row seen for each rounded (lat, lon) pair.
df = df.drop_duplicates(keep='first', subset=['lat', 'lon'])
# Show the top rows again to check the result.
df.head()

# Now that our dataset is deduplicated we can start to explore the dataset
    -Similar to using .head(), we can use the .boxplot() function
        
    -Let's look at the first bioclim variable (column bio_1), and group by species
            

In [None]:
# Draw a boxplot of the bio_1 column with one box per value of the species column.
first_variable = 'bio_1'
df.boxplot(by='species', column=first_variable)

# Using a For loop to automate graphing
    - We have 19 different variables, so it would be tedious to code each plot
    - We can write a loop that will iterate through each column, adding each as a subplot
    - look at the code comments for details 
            -comments are preceded by a '#' and will be ignored by python
            -try changing values, and commenting out lines of code to see what happens

In [None]:
# Draw one boxplot panel per variable column (grouped by species) on a single
# figure and save the whole grid as a PDF.

first_col = 4  # position of the first column to plot
# NOTE(review): the PCA cell later in this notebook treats column 3 as the
# first variable -- confirm that starting at position 4 here is intended.
bio_columns = df.columns[first_col:]  # names of every column from that position to the end

n_grid_cols = 2  # panels per row
# Derive the number of rows from the number of panels instead of hard-coding 9:
# a fixed 9x2 grid has only 18 slots, so add_subplot raises ValueError as soon
# as a 19th column is plotted.  (Ceiling division without needing an import.)
n_grid_rows = (len(bio_columns) + n_grid_cols - 1) // n_grid_cols

fig1 = plt.figure(figsize=(6, 19))  # We create an empty figure and set its dimensions

# enumerate(..., start=1) gives the 1-based subplot position alongside each column name
for count, bio in enumerate(bio_columns, start=1):
    ax = fig1.add_subplot(n_grid_rows, n_grid_cols, count)  # place this panel in the grid
    df.boxplot(column=bio, by='species', ax=ax)  # plot the subplot
    ax.set_xticklabels([])  # cleaning up extra axis labels - it gets messy labeling all subplots
    ax.set_xlabel('')
fig1.subplots_adjust(hspace=.4)  # adjust the vertical spacing between panels
# Save through the figure object itself so the right figure is written even if
# another figure has become the current one.  Species labels can be added later.
fig1.savefig('irisplot.pdf')

# PCA methods  -- explanations in progress

In [None]:
# Split the table into the data matrix X and the species labels y.
# DataFrame.ix was removed in pandas 1.0 and now raises AttributeError.
# The column labels here are strings, so .ix resolved these integers by
# position; .iloc selects the same columns explicitly by position.
X = df.iloc[:, 3:22].values  # columns at positions 3..21 (end of slice is exclusive)
y = df.iloc[:, 2].values     # column at position 2, compared against species names below


In [None]:
# Fit a two-component PCA on X, then project X onto those components.
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

plt.figure()
lw = 2  # line width passed to every scatter call below

# Each species label is paired with the colour used for its points.
species_colours = [
    ('Iris hexagona SS', 'blue'),
    ('Iris kimballiae', 'red'),
    ('Iris rivularis', 'green'),
    ('Iris savannarum', 'purple'),
]
for species_name, colour in species_colours:
    is_species = (y == species_name)  # boolean mask selecting this species' rows
    plt.scatter(X_r[is_species, 0], X_r[is_species, 1],
                c=colour, alpha=.8, lw=lw, label=species_name)

# The axis labels are the same for every species, so set them once.
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')

In [None]:
# Report the fraction of the total variance captured by each fitted component.
variance_ratios = str(pca.explained_variance_ratio_)
print('explained variance ratio (first two components): %s' % variance_ratios)
