# Preliminary Analysis

In [None]:
import sys
sys.path.append('C:/Users/chetai/Documents/Projects/moonGen/')

import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from scripts.evaluation.eval_utils import get_difficulty_map, load_pickle

## Load and Parse Data

In [None]:
# Working directory: the input pickle is read from here and the figures
# generated later in the notebook are saved here too.
root_path = 'C:/Users/chetai/Desktop/'

# root_path already ends with a separator, so join() inserts none and the
# resulting path is the plain concatenation of the two parts.
data_path = os.path.join(root_path, 'moonboard_data.pickle')
data = load_pickle(data_path)

num_problems = len(data)
print('Number of problems:', num_problems)

In [None]:
# Convert the raw problem dictionary into a multi-hot matrix plus a label vector
NUM_ROWS = 18
NUM_COLS = 11

# Sorted ids give every problem a stable row position in the matrix
problem_ids = sorted(data)

# One row per problem, one column per (row, col) cell of the flattened grid
multi_hot_corpus = np.zeros((len(problem_ids), NUM_ROWS * NUM_COLS))

# Lookup used to translate a problem's 'grade' entry into its 'v_scale' value
difficulty_map = get_difficulty_map()

problem_labels = []
for i, pid in enumerate(problem_ids):
    problem = data[pid]
    problem_labels.append(difficulty_map[problem['grade']]['v_scale'])

    # Start, intermediate and finishing holds are all marked the same way
    holds_list = problem['start'] + problem['mid'] + problem['end']
    for hold in holds_list:
        # Flatten the hold's two coordinates into a single column index
        j = NUM_COLS * hold[0] + hold[1]
        multi_hot_corpus[i, j] = 1

problem_labels = np.asarray(problem_labels)

print(multi_hot_corpus.shape)
print(problem_labels.shape)

In [None]:
# Show the raw record of one problem (the second id in sorted order) as a
# quick sanity check of the data layout.
example_pid = problem_ids[1]
display(data[example_pid])

## Get Some Statistics

In [None]:
# Frequency table of the difficulty labels: one row per label with its
# absolute count and its share of the corpus (a fraction in [0, 1]).
label_series = pd.Series(problem_labels)

# Name the value_counts() results explicitly and convert with to_frame().
# Building them via pd.DataFrame(series, columns=[...]) only works while the
# Series is unnamed; pandas >= 2.0 names it 'count' / 'proportion', which made
# that constructor look up a non-existent column and return all-NaN values.
counts = label_series.value_counts().rename('Counts').to_frame()
percents = label_series.value_counts(normalize=True).rename('Percentage').to_frame()

# Join on the shared label index (left order = descending count), then expose
# the labels as a regular 'Label' column.
stats = counts.join(percents).rename_axis('Label').reset_index()
stats

## t-SNE / PCA Visualization

In [None]:
# Draw a random subset of at most 3000 problems for the projections below.
# The generator is seeded so that the subset -- and therefore the PCA / t-SNE
# figures, whose models already use a fixed random_state -- is reproducible
# from run to run.
sample_rng = np.random.default_rng(7)
sample_idx = sample_rng.permutation(problem_labels.shape[0])[:3000]

# Keep labels and feature rows aligned by indexing both with the same sample
problem_labels_samp = problem_labels[sample_idx]
mhot_samp = multi_hot_corpus[sample_idx]

In [None]:
# Project the sampled multi-hot vectors onto their first two principal
# components; the fixed random_state keeps the result repeatable.
pca_params = {'n_components': 2, 'random_state': 7}
pca_mod = PCA(**pca_params)

x_pca = pca_mod.fit_transform(mhot_samp)
print(x_pca.shape)

In [None]:
import inspect

# Embed the sampled multi-hot vectors in 2-D with t-SNE.
tsne_params = dict(
    n_components=2,
    perplexity=10,
    learning_rate=100,
    early_exaggeration=12,
    n_iter_without_progress=300,
    init='pca',
    verbose=2,
    random_state=7,
    n_jobs=-1,
)

# The cap on optimisation iterations is called `max_iter` in recent
# scikit-learn releases (renamed from `n_iter` in 1.5, old name later
# removed). Pick whichever keyword the installed version accepts so the cell
# runs on both old and new releases.
tsne_signature = inspect.signature(TSNE).parameters
iter_kwarg = 'max_iter' if 'max_iter' in tsne_signature else 'n_iter'
tsne_params[iter_kwarg] = 2000

tsne_mod = TSNE(**tsne_params)

x_tsne = tsne_mod.fit_transform(mhot_samp)
print(x_tsne.shape)

In [None]:
# Plot PCA visualization: one scatter series per difficulty label
pc1 = x_pca[:, 0]
pc2 = x_pca[:, 1]

plt.figure(figsize=(10, 10))

for label in np.unique(problem_labels_samp):
    # Boolean mask selecting the sampled problems that carry this label
    in_label = problem_labels_samp == label
    plt.scatter(pc1[in_label], pc2[in_label], s=6, alpha=0.8, label=label)

plt.title('MoonBoard Problems PCA Visualization')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')

# The per-series labels are only rendered once a legend is requested; without
# it the colours cannot be mapped back to grades. markerscale enlarges the
# tiny (s=6) markers inside the legend box.
plt.legend(title='V-scale grade', markerscale=2)

# Save before show(): show() may leave an empty current figure behind
plt.savefig(root_path + 'pca_visual.png')
plt.show()
plt.close()

In [None]:
# Plot t-SNE visualization: one scatter series per difficulty label
tcomp1 = x_tsne[:, 0]
tcomp2 = x_tsne[:, 1]

plt.figure(figsize=(10, 10))

for label in np.unique(problem_labels_samp):
    # Boolean mask selecting the sampled problems that carry this label
    in_label = problem_labels_samp == label
    plt.scatter(tcomp1[in_label], tcomp2[in_label], s=6, alpha=0.8, label=label)

plt.title('MoonBoard Problems TSNE Visualization')
plt.xlabel('TSNE Component 1')
plt.ylabel('TSNE Component 2')

# The per-series labels are only rendered once a legend is requested; without
# it the colours cannot be mapped back to grades. markerscale enlarges the
# tiny (s=6) markers inside the legend box.
plt.legend(title='V-scale grade', markerscale=2)

# Save before show(): show() may leave an empty current figure behind
plt.savefig(root_path + 'tsne_visual.png')
plt.show()
plt.close()