<img src="https://github.com/djp840/MSDS_458_Public/blob/master/images/NorthwesternHeader.png?raw=1">

## MSDS422 Assignment 05
<div class="alert alert-block alert-success">
    <b>More Technical</b>: Throughout the notebook, these types of boxes provide more technical details and extra references about what you are seeing. They contain helpful tips, but you can safely skip them the first time you run through the code.
</div>

## Import packages 

In [None]:
# Helper libraries
import numpy as np
import pandas as pd
from time import time
from collections import Counter

import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

<div class="alert alert-block alert-info">
    <b>Suppress warning messages</b></div>

In [None]:
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
    return None


# Rebind warnings.warn to the no-op above so that code calling
# `warnings.warn(...)` after this point emits nothing.
warnings.warn = warn

### Mount Google Drive to Colab Environment

In [None]:
# Optional (Google Colab only): uncomment the two lines below to mount Google Drive.
#from google.colab import drive
#drive.mount('/content/gdrive')

### Load Data (Local Directory)

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
train_csv_path = "./data/MSDS422_05_train.csv"
training_df = pd.read_csv(train_csv_path)

In [None]:
# Report (rows, columns) of the loaded training frame.
print(training_df.shape)

### Save the labels to a Pandas Series 

In [None]:
# Split the 'label' column off from the feature columns.
# The guard keeps this cell idempotent: the previous in-place drop raised a
# KeyError when the cell was re-run, because 'label' had already been removed.
if "label" in training_df.columns:
    target = training_df["label"]
    # Rebind to a new frame without the label rather than mutating in place.
    training_df = training_df.drop(columns="label")

In [None]:
# Shape of the label Series split off above.
target.shape

### Standardize Training Data

In [None]:
# Standardize every feature column with StandardScaler's defaults.
X = training_df.values
X_std = StandardScaler().fit_transform(X)

# Calculating eigenvectors and eigenvalues of the covariance matrix
mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
# The covariance matrix is symmetric, so use eigh: it always returns real
# eigenvalues. np.linalg.eig can return complex values (round-off imaginary
# parts), which breaks the sorted() call and the cumulative sum below.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# Create a list of (eigenvalue, eigenvector) tuples; eigenvectors are the
# columns of eig_vecs.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) pairs from high to low eigenvalue
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Calculation of explained variance (in percent) from the eigenvalues
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]  # Individual explained variance
cum_var_exp = np.cumsum(var_exp)  # Cumulative explained variance

### PCA analysis

In [None]:
# Standardize the features, then decompose their covariance matrix.
X = training_df.values
X_std = StandardScaler().fit_transform(X)

mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
# eigh is the solver for symmetric matrices such as a covariance matrix; it
# guarantees real eigenvalues, whereas np.linalg.eig may return complex ones
# that cannot be sorted or plotted.
eigvalues, eigvectors = np.linalg.eigh(cov_mat)

# (|eigenvalue|, eigenvector) tuples; eigenvectors are the columns of eigvectors.
eigpairs = [(np.abs(eigvalues[i]), eigvectors[:, i]) for i in range(len(eigvalues))]

# Largest eigenvalue first.
eigpairs.sort(key=lambda pair: pair[0], reverse=True)

# Individual and cumulative explained variance, in percent.
tot = sum(eigvalues)
var_exp = [(i / tot) * 100 for i in sorted(eigvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

<div class="alert alert-block alert-info">
<b>Plotly Python Open Source Graphing Library</b><br> 
https://plotly.com/python/<br>
Plotly's Python graphing library makes interactive, publication-quality graphs. The site includes examples of how to make line plots, scatter plots, area charts, bar charts, error bars, box plots, histograms, heatmaps, subplots, multiple-axes, polar charts, and bubble charts.</div>

In [None]:
# One x position per eigenvalue. Derived from the data instead of the
# previously hard-coded 784 so the plot stays correct if the number of
# feature columns changes.
component_index = list(range(len(var_exp)))

# Cumulative explained variance (percent), drawn as a smoothed line.
trace1 = go.Scatter(
    x=component_index,
    y=cum_var_exp,
    mode='lines+markers',
    name="'Cumulative Explained Variance'",
    line=dict(
        shape='spline',
        color='goldenrod'
    )
)
# Individual explained variance (percent) per component.
trace2 = go.Scatter(
    x=component_index,
    y=var_exp,
    mode='lines+markers',
    name="'Individual Explained Variance'",
    line=dict(
        shape='linear',
        color='black'
    )
)
fig = tls.make_subplots(insets=[{'cell': (1, 1), 'l': 0.7, 'b': 0.5}],
                        print_grid=True)

# Both traces share the single main subplot cell.
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 1)

fig.layout.title = 'explained Variance plots'
fig.layout.xaxis = dict(range=[0, 800], title='Feature columns')
fig.layout.yaxis = dict(range=[0, 100], title='explained variance')

py.iplot(fig, filename='inset example')

In [None]:
# Fit a 30-component PCA on the standardized data.
# random_state pins the randomized SVD solver (used when svd_solver='auto'
# selects it) so repeated runs give the same components.
pca = PCA(n_components=30, random_state=42)
pca.fit(X_std)

In [None]:
# Project the standardized data onto the components of the fitted PCA.
X_pca=pca.transform(X_std)

In [None]:
# Shape of the PCA-projected data.
X_pca.shape

In [None]:
# Shape of the standardized input, for comparison with the projection above.
X_std.shape

In [None]:
# Principal axes of the fitted PCA (one component per row).
eigenvectors=pca.components_
eigenvectors.shape

### Plotting Eigenvector

In [None]:
# Draw the first x_row * y_col PCA components as 28x28 images on a grid.
x_row = 4
y_col = 7
plt.figure(figsize=(17, 16))

for i in range(x_row * y_col):
    plt.subplot(x_row, y_col, i + 1)
    component_img = eigenvectors[i].reshape(28, 28)
    plt.imshow(component_img, cmap='twilight_shifted')
    title_ = f'Eigenvector{i + 1}'
    plt.title(title_)
    # Hide the axis ticks; only the image matters here.
    plt.xticks(())
    plt.yticks(())
plt.show()

### Plotting MNIST Data

In [None]:
# Show the first 70 training rows as 28x28 images, titled with their labels.
plt.figure(figsize=(12, 13))

for i in range(70):
    plt.subplot(7, 10, i + 1)
    # Positional lookup, matching training_df.iloc[i] below, so image and
    # label stay paired even if the index is not 0..n-1.
    plt.title(target.iloc[i])
    plt.imshow(training_df.iloc[i].values.reshape(28, 28), interpolation="none", cmap='binary')
    plt.xticks([])
    plt.yticks([])
# The original ended with a bare `plt.tight_layout` (no call), which did
# nothing except print the function repr as the cell output; it is removed.
plt.tight_layout()
plt.show()

### Standardising Data and Implementing PCA

<div class="alert alert-block alert-info">
    <b>sklearn.decomposition.PCA</b><br> 
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    </div>

In [None]:
# Standardize the features and reduce them to 140 principal components.
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
# random_state pins the randomized SVD solver (used when svd_solver='auto'
# selects it) so the projection is reproducible across runs.
pca_ = PCA(n_components=140, random_state=42)
X_140d = pca_.fit_transform(X_std_)
# Alias for the label Series, used by the plotting and crosstab cells.
Target = target

In [None]:
# dtype of the label Series.
Target.dtype

In [None]:
# Shape of the data after projection onto the 140-component PCA.
print(X_140d.shape)

In [None]:
# Principal axes of the 140-component PCA (one component per row).
eigenvectors_=pca_.components_
print(eigenvectors_.shape)

### Visualisation of PCA Representations

In [None]:
# Scatter of the first two principal components, coloured by label.
trace = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    # Short fixed trace name. The original used str(Target), i.e. the
    # multi-line repr of the entire label Series, as the trace name.
    name='label',
    mode='markers',
    text=Target,  # hover text: the label of each point
    showlegend=False,
    marker=dict(
        size=8,
        color=Target,
        colorscale='Jet',
        showscale=False,
        line=dict(
            width=2,
            color='rgb(255, 255, 255)'
        ),
        opacity=0.8
    )
)

data = [trace]

layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=dict(
        title='First principal direction',
        ticklen=5,
        zeroline=False
    ),
    yaxis=dict(
        title='Second principal direction',
        ticklen=5
    ),
    showlegend=True
)
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')

### Kmeans Clustering

<div class="alert alert-block alert-info">
    <b>sklearn.cluster.KMeans</b><br> 
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
    </div>

In [None]:
# Cluster the 140-dimensional PCA representation into 10 groups.
# K-means starts from random centroids; fixing random_state makes the cluster
# assignments (and the crosstab / plot colours derived from them) reproducible.
kmeans = KMeans(n_clusters=10, random_state=42)
X_clustered = kmeans.fit_predict(X_140d)

In [None]:
# Scatter of the first two principal components, coloured by K-means cluster.
marker_outline = dict(width=2, color='rgb(255, 255, 255)')
marker_style = dict(
    size=8,
    color=X_clustered,
    colorscale='Portland',
    showscale=False,
    line=marker_outline,
)
tracekmeans = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    mode="markers",
    showlegend=False,
    marker=marker_style,
)

x_axis_style = dict(
    title='first principal direction',
    ticklen=5,
    zeroline=False,
    gridwidth=2,
)
y_axis_style = dict(
    title='second principal component',
    ticklen=5,
    gridwidth=2,
)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    showlegend=True,
)

data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")

In [None]:
# One-column integer frames for the cluster assignments and the labels,
# ready to be cross-tabulated.
x_clusters_df = pd.DataFrame(X_clustered, columns=['Cluster'], dtype=int)
targeted_df = Target.astype(int).to_frame()

### Cross Tabulation

<div class="alert alert-block alert-info">
    <b>pandas.crosstab</b><br> 
   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.crosstab.html
    </div>

In [None]:
# Contingency table: rows are the labels, columns are the K-means cluster ids,
# and each cell counts the samples with that (label, cluster) pair.
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)