# Compare Dimensional Reduction


## Run the following dimensionality reduction algorithms for the given fonts
1. Principal Component Analysis (PCA)
1. IsoMap
1. t-SNE

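## Compare the results of each by calculating a correlation coefficient
1. $\lVert \hat{X} - X \rVert / \lVert X \rVert$
    - NOTE: in the outputs below, every printed score equals `Norm(d) / sqrt(x*y)`, i.e. the difference norm appears to be normalized by $\sqrt{\lVert X \rVert \, \lVert \hat{X} \rVert}$ rather than by $\lVert X \rVert$ alone. TODO: confirm against `SKU.calc_dist_cor_score`.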

## Imports and Globals

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE

import font_utils.load_font as LF
import font_utils.upper_lower_numerals as ULN
import sci_kit_learn_utils.utils as SKU

In [2]:
# Development aid: re-import the local utilities module so that edits made to
# sci_kit_learn_utils/utils.py are picked up without restarting the kernel.
import importlib
importlib.reload(SKU)

<module 'sci_kit_learn_utils.utils' from '/home/digital-tenebrist/ms-data-science/math-637/udel-math-637/utils/sci_kit_learn_utils/utils.py'>

## Read Font
1. Returns a dictionary with an entry for each variant, containing the following fields
    1. df - pandas data frame, trimmed as follows
        1. Retains only the m_label and r0c0,...,r19c19 columns
        1. No italic instances
        1. Only the characters a-zA-Z0-9 are returned
        1. Each character is limited to the same number of instances: the minimum count found across a-zA-Z0-9
    1. min_char_count - number of instances of each character

In [3]:
# Load the trimmed Garamond font, then for every instance index build one
# sample (one row per a-zA-Z0-9 character), embed it in 2-D with PCA, IsoMap
# and t-SNE, and print the distance score of each embedding.
uln = ULN.UpperLowerNumerals.get_ascii_codes()

lf = LF.LoadFont('garamond')
font_dict = lf.get_trimmed_font()
font_df = font_dict['GARAMOND']['df']

# Character labels of the rows in each sample; filled in on the first pass.
LABEL_AR = None

# Display names for instance 0 and instance 1 of each character.
# NOTE(review): assumes the instances are ordered Normal, Bold - confirm
# against the loader.
face_names = ['Normal', 'Bold']

raw_dfs = list()

for i in range(font_dict['GARAMOND']['min_char_count']):
    # The loop bound comes from the data while face_names is hard-coded; use a
    # generic label instead of raising IndexError when the font provides more
    # instances per character than there are names.
    face_name = face_names[i] if i < len(face_names) else f'Face{i}'

    # i-th instance of every character code, in the order given by `uln`.
    t_df = pd.DataFrame(data=[font_df.loc[font_df.m_label == x].iloc[i] for x in uln])

    if i == 0:
        LABEL_AR = [chr(x) for x in t_df.m_label]

    # Keep only the pixel columns and centre each column on its mean.
    t_df = t_df.drop(columns=['m_label'])
    t_df = t_df - t_df.mean(axis=0)

    # Perform PCA and calculate distance score
    pca = PCA(n_components=2)
    pca_y = pca.fit_transform(t_df)
    print(f'{face_name:6s} PCA   Distance Score {SKU.calc_dist_cor_score(t_df, pca_y):0.4f}')

    # Perform IsoMap and calculate distance score
    isomap = Isomap(n_neighbors=8, n_components=2)
    iso_y = isomap.fit_transform(t_df)
    print(f'{face_name:6s} Iso   Distance Score {SKU.calc_dist_cor_score(t_df, iso_y):0.4f}')

    # Perform t-SNE and calculate distance score (seeded for reproducibility)
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    tsne_y = tsne.fit_transform(t_df)
    print(f'{face_name:6s} t-SNE Distance Score {SKU.calc_dist_cor_score(t_df, tsne_y):0.4f}')


Norm(x) 163656.4869108463
Norm(y) 83031.4081857176
Norm(d) 92561.40293252072
sqrt(x*y) 116570.27308424318
Normal PCA   Distance Score 0.7940
Norm(x) 163656.4869108463
Norm(y) 228961.3981097338
Norm(d) 98555.09267113713
sqrt(x*y) 193574.3217806399
Normal Iso   Distance Score 0.5091
Norm(x) 163656.4869108463
Norm(y) 76035.6796875
Norm(d) 96791.6216174235
sqrt(x*y) 111551.4778814456
Normal t-SNE Distance Score 0.8677
Norm(x) 187665.7684874895
Norm(y) 104975.28992548547
Norm(d) 96622.602868881
sqrt(x*y) 140357.64480805185
Bold   PCA   Distance Score 0.6884
Norm(x) 187665.7684874895
Norm(y) 287200.5925119228
Norm(d) 130678.52031437086
sqrt(x*y) 232158.82473817858
Bold   Iso   Distance Score 0.5629
Norm(x) 187665.7684874895
Norm(y) 122053.7578125
Norm(d) 87278.2173622298
sqrt(x*y) 151345.01067649617
Bold   t-SNE Distance Score 0.5767


### IsoMap Wins

IsoMap has the lowest distance score for both the Normal and Bold faces.

1. What are the best parameters for IsoMap?
1. Start with n_neighbors=2 and increase to 10

In [4]:
# Sweep IsoMap's n_neighbors from 2 through 10 for every instance index and
# print the distance score of each 2-D embedding.
for i in range(font_dict['GARAMOND']['min_char_count']):
    # The loop bound comes from the data while face_names is hard-coded; use a
    # generic label instead of raising IndexError when the font provides more
    # instances per character than there are names.
    face_name = face_names[i] if i < len(face_names) else f'Face{i}'

    # i-th instance of every character code, in the order given by `uln`.
    t_df = pd.DataFrame(data=[font_df.loc[font_df.m_label == x].iloc[i] for x in uln])

    if i == 0:
        LABEL_AR = [chr(x) for x in t_df.m_label]

    # Keep only the pixel columns and centre each column on its mean.
    t_df = t_df.drop(columns=['m_label'])
    t_df = t_df - t_df.mean(axis=0)

    for n_n in range(2, 11):
        # Perform IsoMap and calculate distance score
        isomap = Isomap(n_neighbors=n_n, n_components=2)
        iso_y = isomap.fit_transform(t_df)
        score = SKU.calc_dist_cor_score(t_df, iso_y)
        print(f'{face_name:6s} Iso (n={n_n}) Dist Score {score:0.4f}')
        

Norm(x) 163656.4869108463
Norm(y) 806986.3877827412
Norm(d) 668120.4093581004
sqrt(x*y) 363412.3789985659
Normal Iso (n=2) Dist Score 1.8385
Norm(x) 163656.4869108463
Norm(y) 426711.59091639373
Norm(d) 284278.228123004
sqrt(x*y) 264261.4612339741
Normal Iso (n=3) Dist Score 1.0757
Norm(x) 163656.4869108463
Norm(y) 326324.43072293064
Norm(d) 187505.31528645538
sqrt(x*y) 231095.45630603962
Normal Iso (n=4) Dist Score 0.8114
Norm(x) 163656.4869108463
Norm(y) 300367.7587851636
Norm(d) 160926.05893956803
sqrt(x*y) 221714.07755048925
Normal Iso (n=5) Dist Score 0.7258
Norm(x) 163656.4869108463
Norm(y) 274915.82502050523
Norm(d) 138123.41481640626
sqrt(x*y) 212112.60716669538
Normal Iso (n=6) Dist Score 0.6512
Norm(x) 163656.4869108463
Norm(y) 252537.84796097362
Norm(d) 120078.53071852439
sqrt(x*y) 203296.4756441153
Normal Iso (n=7) Dist Score 0.5907
Norm(x) 163656.4869108463
Norm(y) 228961.3981097338
Norm(d) 98555.09267113713
sqrt(x*y) 193574.3217806399
Normal Iso (n=8) Dist Score 0.5091
Nor