## Exploratory Analysis

To begin this exploratory analysis, first import libraries and define functions for plotting the data using `matplotlib`. Depending on the data, not all plots will be made. (Hey, I'm just a simple kerneling bot, not a Kaggle Competitions Grandmaster!)

# 1. Import libraries

In [3]:
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from scipy import misc

# 2. Load images

## 2.1. Load images into 2 lists: left_iris_l and right_iris_l

In [16]:
# Collect every image under <cwd>/data/MMU-Iris-Database/ into two lists,
# one per eye. The eye is identified by the name of the folder that directly
# contains the image file ('left' or 'right').
path = os.getcwd()
left_iris_l = []
right_iris_l = []

# OS/housekeeping files that live next to the images and are not images.
skipped_names = {'.DS_Store', 'ReadMe.txt', 'Thumbs.db'}
# Maps the containing folder name to the list that receives the image.
images_by_side = {'left': left_iris_l, 'right': right_iris_l}

for dirname, _, filenames in os.walk(os.path.join(path, 'data', 'MMU-Iris-Database')):
    # basename() works with whatever separator the OS uses; splitting on "/"
    # silently misclassifies every file on Windows, where os.walk yields "\\".
    side = os.path.basename(dirname)
    for filename in filenames:
        if filename in skipped_names:
            continue
        dir_file_s = os.path.join(dirname, filename)
        target = images_by_side.get(side)
        if target is None:
            print(f'Neither left or right iris in {dir_file_s}')
        else:
            target.append(plt.imread(dir_file_s))

## 2.2. Check the size of the 2 lists and each file

In [17]:
# Report how many images were collected for each eye, one count per line
# (left first, then right).
print(len(left_iris_l), len(right_iris_l), sep='\n')

225
225


In [25]:
# Inspect the first left-eye entry: its Python type, its shape, and then
# the raw contents of the array itself.
first_left = left_iris_l[0]
for summary in (type(first_left), first_left.shape):
    print(summary, '\n')
print(first_left)

<class 'numpy.ndarray'> 

(240, 320, 3) 

[[[200 204 200]
  [184 188 184]
  [184 188 184]
  ...
  [ 64  68  64]
  [ 72  76  72]
  [ 80  76  72]]

 [[192 200 192]
  [184 188 184]
  [192 184 184]
  ...
  [ 80  84  80]
  [ 72  76  72]
  [ 80  80  80]]

 [[192 196 192]
  [184 188 184]
  [176 180 176]
  ...
  [ 40  52  40]
  [ 64  60  64]
  [ 80  76  72]]

 ...

 [[240 240 232]
  [184 192 192]
  [184 188 184]
  ...
  [128 124 120]
  [120 128 128]
  [128 124 120]]

 [[248 248 248]
  [248 248 240]
  [232 240 240]
  ...
  [152 156 152]
  [160 160 160]
  [160 156 152]]

 [[  0   4   0]
  [  8   8   8]
  [  0   4   0]
  ...
  [  0   8   0]
  [  8   4   8]
  [  8   8   8]]]


# 3. Modeling

In [4]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw the correlation matrix of the usable numeric columns of ``df``.

    Columns that are non-numeric, contain any NaN, or hold a single distinct
    value are dropped before correlations are computed. If fewer than two
    columns remain, a message is printed and nothing is plotted.

    Args:
        df: pandas DataFrame to analyse. If it carries a ``dataframeName``
            attribute, that value is used in the plot title.
        graphWidth: Width and height of the (square) figure, in inches.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # ``dataframeName`` is not a standard DataFrame attribute; fall back to a
    # generic label instead of raising AttributeError when it was never set.
    filename = getattr(df, 'dataframeName', 'dataframe')
    # Correlation is only defined for numeric data; recent pandas raises on
    # non-numeric columns instead of silently skipping them.
    df = df.select_dtypes(include=[np.number])
    # ``axis`` must be passed by keyword: pandas 2.0 made it keyword-only.
    df = df.dropna(axis='columns')  # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above. A hard-coded ``fignum=1`` targets a
    # different figure whenever other figures are already open.
    corrMat = plt.matshow(corr, fignum=fig.number)
    tick_positions = range(len(corr.columns))
    plt.xticks(tick_positions, corr.columns, rotation=90)
    plt.yticks(tick_positions, corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [5]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter-plot matrix with KDE diagonals for numeric columns.

    Non-numeric columns, columns containing any NaN, and columns with a
    single distinct value are dropped. At most the first 10 remaining columns
    are plotted. Each upper-triangle panel is annotated with the correlation
    coefficient of its column pair.

    Args:
        df: pandas DataFrame to analyse.
        plotSize: Width and height of the (square) figure, in inches.
        textSize: Font size of the correlation annotations.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove columns that would lead to df being singular.
    # ``axis`` must be passed by keyword: pandas 2.0 made it keyword-only.
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    if df.shape[1] == 0:
        # Nothing left to plot; bail out instead of handing an empty frame
        # to scatter_matrix.
        print('No scatter plots shown: no numeric, non-NaN, non-constant columns remain')
        return
    # Cap the column count to keep the kernel density plots tractable.
    columnNames = list(df)[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly; ``plt.np`` is an undocumented pyplot attribute.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
