# Reducing the dimensionality of pretrained GloVe embeddings with PCA on Google Colab

## Download GloVe files.

In [None]:
from urllib.request import urlretrieve
import os
from zipfile import ZipFile

def download(url, file):
    """Download *url* to the local path *file* unless *file* already exists.

    The data is first written to ``file + ".part"`` and only moved to *file*
    once the transfer has finished.  This way an interrupted download never
    leaves a truncated *file* behind that a later call would mistake for a
    complete one and skip.

    Args:
        url: Address of the resource to fetch.
        file: Destination path on the local file system.

    Raises:
        Whatever ``urlretrieve`` raises when the transfer fails; the partial
        file is removed before the exception propagates.
    """
    if os.path.isfile(file):
        return

    print("Download file... " + file + " ...")
    partial = file + ".part"
    try:
        urlretrieve(url, partial)
        # Atomic rename: *file* appears only when it is complete.
        os.replace(partial, file)
    finally:
        # Still present only if the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)
    print("File downloaded")

# Fetch the GloVe 6B archive into the working directory (skipped by
# download() when the zip file is already present).
glove_url = 'https://nlp.stanford.edu/data/glove.6B.zip'
glove_zip = 'glove.6B.zip'
download(glove_url, glove_zip)
print("All the files are downloaded")

## Uncompress downloaded files.

In [None]:
def uncompress_features_labels(dir, dest='data'):
    """Extract the zip archive at *dir* into the directory *dest*.

    Nothing is extracted when *dest* already exists as a directory; the
    function then only prints ``'Data extracted'``.

    Args:
        dir: Path of the zip archive.  (The name is kept for backward
            compatibility even though it is a file path, not a directory.)
        dest: Directory that receives the archive contents.  Defaults to
            ``'data'``, the location used before this parameter existed.
    """
    if os.path.isdir(dest):
        print('Data extracted')
        return

    with ZipFile(dir) as zipf:
        zipf.extractall(dest)
# Unpack the downloaded GloVe archive (a no-op when 'data' already exists).
uncompress_features_labels(dir='glove.6B.zip')

## Dimension reduction.

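Paper: https://arxiv.org/abs/1708.03629

The cell below removes the projections on the top principal components from the 300-dimensional vectors, reduces them to 150 dimensions with PCA, and then removes the top components once more before writing the result to a text file.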

In [None]:
import numpy as np
import sys
from sklearn.decomposition import PCA
import subprocess

def load_glove(path):
    """Read a GloVe text file into a ``{word: float32 vector}`` dict.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are its coefficients.  Blank lines are skipped.
    If a word occurs more than once, the last occurrence wins.
    """
    vectors = {}
    # The context manager closes the file even if parsing fails midway.
    with open(path, encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


def remove_top_components(vectors, components):
    """Subtract from every row of *vectors* its projection on *components*.

    *vectors* is an ``(n, d)`` array and *components* a ``(k, d)`` array.
    For orthonormal component rows (as in ``PCA.components_``) this matches
    removing the projections one component after the other, but it is done
    with two matrix products instead of a Python loop over every row.
    """
    components = np.asarray(components)
    return vectors - (vectors @ components.T) @ components


def write_embeddings(path, names, vectors):
    """Write one ``word<TAB>v1<TAB>v2<TAB>...<TAB>`` line per embedding.

    Values are formatted with ``%f`` and every line keeps its trailing tab,
    so the output layout is unchanged for existing consumers.  The file is
    closed (and therefore fully flushed) when the function returns.
    """
    with open(path, 'w', encoding="utf-8") as embedding_file:
        for name, vector in zip(names, vectors):
            fields = "".join("%f\t" % t for t in vector)
            embedding_file.write("%s\t%s\n" % (name, fields))


# In a notebook (and when run as a script) __name__ is "__main__", so the
# pipeline below executes as before; the helpers above stay importable
# without triggering it.
if __name__ == "__main__":
    dims = 300          # dimensionality of the input vectors
    red_dims = 150      # dimensionality of the reduced vectors
    n_removed = 7       # number of top principal components projected out
    file_text = "glove.6b.300d"

    print("Loading Word vectors.")
    Glove = load_glove('/content/data/glove.6B.300d.txt')
    print("Done.")

    # Dicts preserve insertion order, so names and rows stay aligned.
    X_train_names = list(Glove)
    X_train = np.asarray(list(Glove.values()))

    # PCA to get Top Components
    pca = PCA(n_components=dims)
    print(X_train.shape)
    print(np.mean(X_train))
    # NOTE(review): np.mean without an axis is the scalar mean over all
    # entries, not a per-dimension mean vector.  This is kept as-is so the
    # results do not change; confirm against the referenced paper whether
    # np.mean(X_train, axis=0) was intended.
    X_train = X_train - np.mean(X_train)
    # Only the components are needed here, so fit() replaces fit_transform().
    pca.fit(X_train)
    U1 = pca.components_

    # Removing Projections on Top Components
    z = remove_top_components(X_train, U1[:n_removed])

    # PCA Dim Reduction
    pca = PCA(n_components=red_dims)
    X_train = z - np.mean(z)
    X_new_final = pca.fit_transform(X_train)

    # PCA to do Post-Processing Again
    pca = PCA(n_components=red_dims)
    X_new = X_new_final - np.mean(X_new_final)
    pca.fit(X_new)
    Ufit = pca.components_

    X_new_final = X_new_final - np.mean(X_new_final)
    final_vectors = remove_top_components(X_new_final, Ufit[:n_removed])
    final_pca_embeddings = dict(zip(X_train_names, final_vectors))

    filename_reduced = "{}_reduced_embeddings_{}.txt".format(file_text, red_dims)
    # The file must be closed before another cell reads or uploads it;
    # write_embeddings() guarantees that.
    write_embeddings(filename_reduced, X_train_names, final_vectors)

    print("The Reduced Embedding is available at {0}".format(filename_reduced))

## Upload to Drive

In [None]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Sign in to Google from Colab, then hand the application-default
# credentials to PyDrive. One sign-in per notebook session is enough.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Upload the reduced-embedding file under the same name it has locally.
reduced_file = 'glove.6b.300d_reduced_embeddings_150.txt'
uploaded = drive.CreateFile({'title': reduced_file})
uploaded.SetContentFile(reduced_file)
uploaded.Upload()
file_id = uploaded.get('id')
print('Uploaded file with ID {}'.format(file_id))