## Load libraries

In [1]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
# import pandas as pd
# import pickle

# Library to reduce dimensions
from sklearn.decomposition import PCA
from gensim.models.keyedvectors import KeyedVectors

# Libraries for monitoring operation process
from tqdm import tqdm

## Configure and declare global variables

In [2]:
# Resolve the thesis root folder for the platform the kernel is running on.
os_name = os.name

if os_name == 'nt':
    # Windows platform
    BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
elif os_name == 'posix':
    # Linux platform
    BASE_DIR = "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
else:
    # Fail here with a clear message instead of a NameError on BASE_DIR
    # in a later cell.
    raise RuntimeError(
        f"Unsupported platform {os_name!r}: no BASE_DIR is configured for it"
    )

# Input/output folders share the same layout on every platform, so they are
# derived once from BASE_DIR.
INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

# Random seed handed to every PCA instance for reproducibility.
SEED = 6886
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): no visible cell imports or uses matplotlib — confirm this is still needed.
%matplotlib inline

## Import data

In [3]:
# Read the Gensim keyed vectors (word embeddings) stored in the output folder
kv = KeyedVectors.load(os.path.join(OUTPUT_DIR, 'skill_word_norm_vectors.kv'))

## Extract elements from keyed vectors

In [4]:
# Pull the vocabulary (the list of words) out of the keyed vectors
X_vocab = kv.index_to_key

# Display the vocabulary size together with its first ten words
(len(X_vocab), X_vocab[:10])

(7949,
 ['anual',
  'mp',
  'word',
  'librarys',
  'outpatient',
  'secretariat',
  'structural',
  'double',
  'assignment',
  'channel'])

In [5]:
# Look up the embedding of every vocabulary word and stack them into one array
# (one row per word, in vocabulary order)
X_vectors = np.asarray([kv[word] for word in tqdm(X_vocab)])
X_vectors.shape

100%|██████████████████████████████████████████████████████████████████████████| 7949/7949 [00:00<00:00, 441181.44it/s]


(7949, 300)

## Reduce dimensions

In [6]:
# PCA to get Top Components
vector_size = X_vectors.shape[1]
print(vector_size)
pca_embeddings = {}  # not read by any visible cell; kept so the name stays defined

pca = PCA(n_components=vector_size, random_state=SEED)
# Subtract the per-dimension mean vector (axis=0). Without `axis`, np.mean
# returns one scalar over all elements, which leaves the embeddings
# un-centred for the projection-removal step that follows. PCA centres its
# input internally, so the components are the same either way.
X_vectors = X_vectors - np.mean(X_vectors, axis=0)
X_fit = pca.fit_transform(X_vectors)
# Principal directions, one per row, ordered by explained variance.
U1 = pca.components_

300


In [7]:
# Post-Processing: Removing Projections on Top Components
# For every embedding, subtract its projection onto each of the first 7
# principal directions found by the preceding PCA.
z = []

# Iterate over the array directly so tqdm knows the total and can show real
# progress (wrapping enumerate() hid the length, and the index was unused).
for x in tqdm(X_vectors):
    for u in U1[0:7]:
        # u is a 1-D row of the components matrix, so no transpose is needed.
        x = x - np.dot(u, x) * u
    z.append(x)

z = np.asarray(z)

7949it [00:00, 29659.77it/s]


In [8]:
# PCA Dim Reduction
# Project the post-processed embeddings onto half as many dimensions.
pca = PCA(n_components=int(vector_size / 2), random_state=SEED)
# Keep the centred copy under its own name instead of overwriting X_vectors.
# Clobbering X_vectors made the earlier PCA / projection-removal cells
# operate on already post-processed data when re-run.
z_centered = z - np.mean(z, axis=0)
X_new_final = pca.fit_transform(z_centered)

In [9]:
# Second PCA pass: fit on the reduced embeddings to get the principal
# directions used by the next post-processing step
pca = PCA(random_state=SEED, n_components=int(vector_size / 2))
X_new = pca.fit_transform(X_new_final - X_new_final.mean())
Ufit = pca.components_

In [10]:
# Post-processing, second round: strip the projections onto the top
# components from the reduced embeddings
X_new_final = X_new_final - X_new_final.mean()

final_pca_embeddings = []

for idx in range(len(X_vocab)):
    embedding = X_new_final[idx]

    # Remove, one after another, the projection on each of the first 7 components
    for component in Ufit[0:7]:
        projection = np.dot(component.transpose(), embedding)
        embedding = embedding - projection * component

    final_pca_embeddings.append(embedding)

In [11]:
# Wrap the reduced embeddings in a new keyed-vectors object, keyed by the
# original vocabulary
half_size = int(vector_size / 2)
halfsize_kv = KeyedVectors(vector_size=half_size)
halfsize_kv.add_vectors(X_vocab, final_pca_embeddings)

In [12]:
# Write the half-size Gensim keyed vectors to the output folder
halfsize_kv.save(os.path.join(OUTPUT_DIR, 'skill_halfsize_word_vectors.kv'))

In [13]:
# Compute the per-vector norms (force=True asks for them to be recomputed).
# NOTE(review): fill_norms() presumably only caches the norms and leaves the
# stored vectors unscaled; if unit-length vectors are intended here, gensim's
# unit_normalize_all() looks like the right call — confirm against the
# gensim KeyedVectors documentation.
halfsize_kv.fill_norms(force=True)

# Save the keyed vectors, now carrying their norms, to a separate file
halfsize_kv.save(OUTPUT_DIR + 'skill_halfsize_word_norm_vectors.kv')