In [1]:

import pandas as pd

# Download the Iris measurements. The raw file has no header row, so the
# column names are attached after loading.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    sep=',',
    header=None,
)
df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']

# Discard rows in which every field is missing (e.g. an empty line at file end).
df = df.dropna(how="all")

df.tail()

URLError: <urlopen error [Errno 11002] getaddrinfo failed>

In [None]:
# Separate the table into the measurements X (first four columns) and the
# class labels y (fifth column), both as NumPy arrays.
X, y = df.iloc[:, :4].values, df.iloc[:, 4].values

In [None]:
import plotly.plotly as py

# Overlaid per-class histograms of the four measurements, one x-axis per feature.
data = []

# Only the traces of the last feature (index 3) contribute legend entries,
# so every class is listed once.
legend = {feature: feature == 3 for feature in range(4)}

colors = {
    'Iris-setosa': '#0D76BF',
    'Iris-versicolor': '#00cc96',
    'Iris-virginica': '#EF553B',
}

for col in range(4):
    # Feature `col` is drawn against x-axis number col+1 ("x1" .. "x4").
    axis_ref = 'x' + str(col + 1)
    for key in colors:
        trace = {
            'type': 'histogram',
            'x': list(X[y == key, col]),
            'opacity': 0.75,
            'xaxis': axis_ref,
            'marker': {'color': colors[key]},
            'name': key,
            'showlegend': legend[col],
        }
        data.append(trace)

# The four x-axes split the figure width into side-by-side panels.
layout = {
    'barmode': 'overlay',
    'xaxis': {'domain': [0, 0.25], 'title': 'sepal length (cm)'},
    'xaxis2': {'domain': [0.3, 0.5], 'title': 'sepal width (cm)'},
    'xaxis3': {'domain': [0.55, 0.75], 'title': 'petal length (cm)'},
    'xaxis4': {'domain': [0.8, 1], 'title': 'petal width (cm)'},
    'yaxis': {'title': 'count'},
    'title': 'Distribution of the different Iris flower features',
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='exploratory-vis-histogram')

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the features: fit the scaler on X, then apply it to the same data.
X_std = StandardScaler().fit(X).transform(X)

In [None]:

import numpy as np
# Sample covariance computed by hand: centre the columns, then divide the
# scatter matrix by (number of samples - 1).
mean_vec = X_std.mean(axis=0)
centered = X_std - mean_vec
cov_mat = centered.T.dot(centered) / (len(X_std) - 1)
print('Covariance matrix \n%s' % cov_mat)

In [None]:
# Covariance of the standardized data, treating each column as a variable.
cov_mat = np.cov(X_std, rowvar=False)

# Eigen-decomposition of the covariance matrix; eig_vecs[:, i] is used
# together with eig_vals[i] further down.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

In [None]:
## Pair every eigenvalue magnitude with its eigenvector (column i of eig_vecs)
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

## Sort the pairs in decreasing order of eigenvalue.
## The key restricts the comparison to the eigenvalue: sorting the raw tuples
## falls back to comparing the eigenvector arrays whenever two eigenvalues
## are equal, which raises a ValueError.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

for p in eig_pairs:
    print(p[0])

In [None]:
# Percentage of the total variance carried by each eigenvalue, largest first,
# together with the running (cumulative) total.
tot = sum(eig_vals)
var_exp = [(ev / tot) * 100 for ev in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Tick labels "PC 1" .. "PC 4", used by both traces.
pc_labels = ['PC %s' % n for n in range(1, 5)]

# Bars: individual share per component.
trace1 = {
    'type': 'bar',
    'x': pc_labels,
    'y': var_exp,
    'name': 'Individual',
}

# Line: cumulative share.
trace2 = {
    'type': 'scatter',
    'x': list(pc_labels),
    'y': cum_var_exp,
    'name': 'Cumulative',
}

data = [trace1, trace2]

layout = {
    'title': 'Explained variance by different principal components',
    'yaxis': {'title': 'Explained variance in percent'},
    # Free-floating caption positioned in paper coordinates.
    'annotations': [
        {
            'x': 1.16,
            'y': 1.05,
            'xref': 'paper',
            'yref': 'paper',
            'text': 'Explained Variance',
            'showarrow': False,
        },
    ],
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='selecting-principal-components')

In [None]:
# Projection matrix W (4 x 2): the eigenvectors of the two top-ranked pairs,
# placed side by side as columns.
matrix_w = np.hstack([eig_pairs[rank][1].reshape(4, 1) for rank in range(2)])

print('Matrix W:\n', matrix_w)

In [None]:
# Project the standardized samples onto the two selected components.
# Computed in this cell so the plot does not depend on a later cell having
# been executed first.
Y = X_std.dot(matrix_w)

data = []

# Take (class name, colour) pairs straight from the colour map so a class
# can never be matched with another class's colour.
for name, col in colors.items():
    trace = dict(
        type='scatter',
        x=Y[y==name,0],
        y=Y[y==name,1],
        mode='markers',
        name=name,
        marker=dict(
            color=col,
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8)
    )
    data.append(trace)

# Axis titles for a 2D scatter belong to layout.xaxis / layout.yaxis;
# layout.scene only applies to 3D traces and would leave the axes unlabeled.
layout = dict(
    showlegend=True,
    xaxis=dict(title='PC1'),
    yaxis=dict(title='PC2')
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='projection-matrix')

In [None]:
## compute the posterior of each data point 
def expectation(data, gmm):
    """E-step: posterior weight of every mixture component for every sample.

    Parameters
    ----------
    data : numpy.ndarray
        Samples, one per row; passed unchanged to ``multivariate_normal.pdf``.
    gmm : sequence of dict
        One dict per component with the keys "prior", "mean" and "covariance".

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(gmm), data.shape[0])``. Entry ``[k, n]`` is
        ``prior_k * pdf_k(x_n)`` divided by the sum of that product over all
        components.
    """
    weighted = np.zeros((len(gmm), data.shape[0]))
    for k, component in enumerate(gmm):
        weighted[k] = component["prior"] * multivariate_normal.pdf(
            data, component["mean"], component["covariance"])

    # The normaliser is identical for every component, so it is summed once
    # instead of being re-evaluated for each row.
    return np.divide(weighted, weighted.sum(axis=0))

## compute new mean, covariance, and class prior for each class
def maximization(posterior, data, gmm):
    """M-step: re-estimate mean, covariance and prior of every component.

    Parameters
    ----------
    posterior : numpy.ndarray
        Array of shape (number of components, number of samples); row k holds
        the weights of component k for each sample.
    data : numpy.ndarray
        Samples, one per row.
    gmm : list of dict
        One dict per component. The keys "mean", "covariance" and "prior"
        are overwritten in place.

    Returns
    -------
    list of dict
        The same ``gmm`` object that was passed in, with updated parameters.
    """
    data = np.asarray(data, dtype=float)
    posterior = np.asarray(posterior, dtype=float)

    # Effective number of samples attributed to each component.
    N = posterior.sum(axis=1)
    total = N.sum()

    for k, component in enumerate(gmm):
        weights = posterior[k]

        # Posterior-weighted average of the samples.
        mean = weights.dot(data) / N[k]

        # Posterior-weighted scatter around the new mean. Without this
        # update the component shapes would stay at their initial values.
        centered = data - mean
        covariance = (weights[:, np.newaxis] * centered).T.dot(centered) / N[k]

        component["mean"] = mean
        component["covariance"] = covariance
        component["prior"] = N[k] / total

    return gmm

In [None]:
import csv, copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2, multivariate_normal

%matplotlib inline

In [None]:
## plot the gaussian model 
def plotGaussianModel2D(mu, sigma, pltopt = 'k', confidence = 0.9):
    """Draw the confidence ellipse of a 2-D Gaussian on the current axes.

    Parameters
    ----------
    mu : array-like of length 2
        Centre of the Gaussian.
    sigma : numpy.ndarray of shape (2, 2)
        Covariance matrix. If every entry is zero, only the centre is drawn.
    pltopt : str
        Matplotlib format string passed to ``plt.plot``.
    confidence : float
        Probability mass enclosed by the ellipse (default 0.9).
    """
    mu = np.asarray(mu, dtype=float)

    if sigma.any():
        ## setup ellipse parameter
        # The chi-square quantile bounds the *squared* Mahalanobis distance,
        # so the ellipse radius is its square root.
        radius = np.sqrt(chi2.ppf(confidence, 2))
        ## setup eigenvector and eigenvalues
        eigenValue, eigenVector = np.linalg.eig(sigma)

        t = np.linspace(0, 2*np.pi, 100) # draw 100 points
        u = np.array([np.cos(t), np.sin(t)])
        # Stretch the unit circle by the axis lengths, rotate it into the
        # eigenvector basis, then shift it to the centre.
        w = radius * eigenVector.dot(np.diag(np.sqrt(eigenValue)).dot(u))
        z = w.T + mu
        plt.plot(z[:,0], z[:,1], pltopt)
    else:
        # Degenerate model: show the centre as a single marker. Indexing a
        # 1-D mean with [:, 0] would raise an IndexError.
        z = np.reshape(mu, (1, -1))
        plt.plot(z[:,0], z[:,1], pltopt, marker='o')
    
def colorPicker(index):
    """Return a one-letter colour code for `index`, cycling through 'rgbcmyk'."""
    palette = 'rgbcmyk'
    position = index % len(palette)
    return palette[position]

def gmmplot(data, gmm):
    """Scatter the first two columns of `data` and overlay every component of `gmm`.

    Each entry of `gmm` is drawn with plotGaussianModel2D from its 'mean' and
    'covariance', using the colour colorPicker returns for its position.
    """
    # plot data points
    xs, ys = data[:, 0], data[:, 1]
    plt.scatter(xs, ys, s = 4)

    # plot gmm
    for position, component in enumerate(gmm):
        style = colorPicker(position)
        plotGaussianModel2D(component['mean'], component['covariance'], style)

In [None]:
## the gmm parameters: three 2-D means and one 2x2 covariance matrix per mean
mu = np.array(
    [[2, 0],
     [-4, -2.5],
     [0, 2]],
    dtype=float,
)

sigma = np.array(
    [[[0.25, 0], [0, 0.15]],
     [[0.25, 0.1], [0.1, 0.3]],
     [[0.25, 0.1], [0.1, 0.15]]],
    dtype=float,
)

In [None]:
# Three-component mixture with equal priors, one component per (mean, covariance) pair.
gmm = [
    dict(mean=center, covariance=spread, prior=1.0 / 3)
    for center, spread in zip(mu[:3], sigma[:3])
]

In [None]:
# Project the standardized samples with W, then show them together with the
# initial mixture.
Y = np.dot(X_std, matrix_w)

gmmplot(Y, gmm)

In [None]:
## Starting mixtures with k = 1, 2 and 3 components: each uses the first k
## (mean, covariance) pairs and the uniform prior 1/k.
gmm1, gmm2, gmm3 = (
    [{'mean': mu[m], 'covariance': sigma[m], 'prior': 1.0 / k} for m in range(k)]
    for k in (1, 2, 3)
)
gmms = [gmm1, gmm2, gmm3]

In [None]:
## helper function to plot three different gmms, each with 3 plots
def __plot__ (gmms, data):
    """Fit every starting mixture with EM and draw three snapshots of it.

    For each model in `gmms` a new 16x8 figure is opened and a deep copy of
    the model goes through five EM iterations (expectation followed by
    maximization). The copy is plotted and printed before training, after
    the fourth iteration and after the fifth, in consecutive subplots of a
    2x3 grid. The printed plot number keeps counting across models.
    """
    ## running number used in the printed labels
    plot_number = 1

    for start_model in gmms:
        ## one figure per starting model
        plt.figure(figsize=(16, 8))
        ## subplot position inside the current figure
        panel = 1
        ## work on a copy: maximization overwrites the model's entries
        model = copy.deepcopy(start_model)

        ## snapshot before any EM iteration
        plt.subplot(230 + panel)
        gmmplot(data, model)
        print("Plot ", plot_number, ":", model)
        panel += 1
        plot_number += 1

        # perform the EM algorithm on the model
        for iteration in range(5):
            model = maximization(expectation(data, model), data, model)
            if iteration != 3:
                continue
            ## snapshot after the fourth iteration
            plt.subplot(230 + panel)
            gmmplot(data, model)
            print("Plot ", plot_number, ":", model)
            panel += 1
            plot_number += 1

        ## snapshot of the final state
        plt.subplot(230 + panel)
        gmmplot(data, model)
        print("Plot ", plot_number, ":", model)
        panel += 1
        plot_number += 1

In [None]:
# Fit and plot the one-, two- and three-component mixtures on the projected data.
__plot__(gmms=gmms, data=Y)