# Compare Dimensionality Reduction


## Run the following dimensionality reduction algorithms on the given fonts
1. Principal Component Analysis (PCA)
1. IsoMap
1. t-SNE

## Compare the results of each algorithm by calculating a correlation coefficient
1. $\lVert \hat{X} - X \rVert / \lVert X \rVert$

## Imports and Globals

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE

import font_utils.load_font as LF
import font_utils.upper_lower_numerals as ULN
import sci_kit_learn_utils.utils as SKU

In [10]:
# Reload the local utilities module so that edits made to it on disk are
# picked up in this session without restarting.
import importlib
importlib.reload(SKU)

<module 'sci_kit_learn_utils.utils' from '/home/digital-tenebrist/ms-data-science/math-637/udel-math-637/utils/sci_kit_learn_utils/utils.py'>

## Read Font
1. Returns a dictionary entry for each variant with the following fields
    1. df - pandas data frame, trimmed as follows
        1. Retains only the m_label and r0c0,...,r19c19 columns
        1. No italic instances
        1. Only the characters a-zA-Z0-9 are returned
        1. Only the minimum number of instances of each character, based on the minimum across a-zA-Z0-9
    1. min_char_count - number of instances of each character

In [22]:
# Character codes to compare; each one is matched against the m_label column
# and converted back to a character with chr() below.
uln = ULN.UpperLowerNumerals.get_ascii_codes()

# Load the trimmed Garamond data (see the "Read Font" notes above) and keep a
# handle on the data frame of the 'GARAMOND' variant.
lf = LF.LoadFont('garamond')
font_dict = lf.get_trimmed_font()
font_df = font_dict['GARAMOND']['df']

# Character labels in row order; filled in on the first pass of the loop.
LABEL_AR = None

# Display names for the per-character instances, indexed by instance number.
face_names = ['Normal', 'Bold']

# NOTE(review): never populated in this cell - presumably used by other
# cells in the notebook; confirm before removing.
raw_dfs = list()

for i in range(font_dict['GARAMOND']['min_char_count']):
    # face_names is hard-coded, so fall back to a generic label rather than
    # raising IndexError when the font has more instances than names.
    face_name = face_names[i] if i < len(face_names) else f'Face{i}'

    # One row per character: the i-th instance of every code in uln.
    t_df = pd.DataFrame(data=[font_df.loc[font_df.m_label == x].iloc[i] for x in uln])

    if i == 0:
        LABEL_AR = [chr(x) for x in t_df.m_label]

    # Drop the label column and centre every remaining column on zero.
    t_df = t_df.drop(columns=['m_label'])
    t_df = t_df - t_df.mean(axis=0)

    # Perform PCA and calculate distance score
    pca = PCA(n_components=2)
    pca_y = pca.fit_transform(t_df)
    print(f'{face_name:6s} PCA   Distance Score {SKU.calc_dist_cor_score(t_df, pca_y):0.4f}')

    # Perform IsoMap and calculate distance score
    isomap = Isomap(n_neighbors=8, n_components=2)
    iso_y = isomap.fit_transform(t_df)
    print(f'{face_name:6s} Iso   Distance Score {SKU.calc_dist_cor_score(t_df, iso_y):0.4f}')

    # Perform t-SNE and calculate distance score (random_state fixed so the
    # embedding, and therefore the score, is reproducible between runs)
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    tsne_y = tsne.fit_transform(t_df)
    print(f'{face_name:6s} t-SNE Distance Score {SKU.calc_dist_cor_score(t_df, tsne_y):0.4f}')


Normal PCA   Distance Score 0.5656
Normal Iso   Distance Score 0.6022
Normal t-SNE Distance Score 0.5914
Bold   PCA   Distance Score 0.5149
Bold   Iso   Distance Score 0.6963
Bold   t-SNE Distance Score 0.4651


### IsoMap Wins

1. What are the best parameters for IsoMap?
1. Start with n_neighbors=2 and increase to 10

In [23]:
# Sweep the IsoMap neighbourhood size for every instance of the font and
# print the distance score of each 2-D embedding.
for i in range(font_dict['GARAMOND']['min_char_count']):
    # face_names is a fixed-length list, so fall back to a generic label
    # rather than raising IndexError when the font has more instances.
    face_name = face_names[i] if i < len(face_names) else f'Face{i}'

    # One row per character: the i-th instance of every code in uln.
    t_df = pd.DataFrame(data=[font_df.loc[font_df.m_label == x].iloc[i] for x in uln])

    if i == 0:
        LABEL_AR = [chr(x) for x in t_df.m_label]

    # Drop the label column and centre every remaining column on zero.
    t_df = t_df.drop(columns=['m_label'])
    t_df = t_df - t_df.mean(axis=0)

    # Try n_neighbors = 2..10 (inclusive)
    for n_n in range(2, 11):
        # Perform IsoMap and calculate distance score
        isomap = Isomap(n_neighbors=n_n, n_components=2)
        iso_y = isomap.fit_transform(t_df)
        print(f'{face_name:6s} Iso (n={n_n}) Dist Score {SKU.calc_dist_cor_score(t_df, iso_y):0.4f}')
        

Normal Iso (n=2) Dist Score 4.0825
Normal Iso (n=3) Dist Score 1.7370
Normal Iso (n=4) Dist Score 1.1457
Normal Iso (n=5) Dist Score 0.9833
Normal Iso (n=6) Dist Score 0.8440
Normal Iso (n=7) Dist Score 0.7337
Normal Iso (n=8) Dist Score 0.6022
Normal Iso (n=9) Dist Score 0.5396
Normal Iso (n=10) Dist Score 0.4967
Bold   Iso (n=2) Dist Score 3.1901
Bold   Iso (n=3) Dist Score 1.6300
Bold   Iso (n=4) Dist Score 1.1625
Bold   Iso (n=5) Dist Score 0.9601
Bold   Iso (n=6) Dist Score 0.9079
Bold   Iso (n=7) Dist Score 0.7982
Bold   Iso (n=8) Dist Score 0.6963
Bold   Iso (n=9) Dist Score 0.6131
Bold   Iso (n=10) Dist Score 0.5855
