# Download GloVe word vectors

In [None]:
# Download the zipped word vectors file
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
# Extract the text file
!unzip glove.840B.300d.zip
# Remove the original zip file
!rm glove*.zip

# Connect to BigQuery

In [None]:
import numpy as np
from google.cloud import bigquery
import pandas as pd
from tqdm import tqdm

In [None]:
# Colab-specific: run Colab's interactive user-authentication flow.
# Presumably this supplies the credentials picked up by the BigQuery client
# created later in the notebook — verify if running outside Colab.
from google.colab import auth
auth.authenticate_user()

Edit the following cell to enter the project ID of your Google Cloud project:

In [None]:
# Replace the placeholder string below with your own Cloud Platform project ID
project_id = '[your Cloud Platform project ID]'

# BigQuery client bound to that project; later cells reuse it for every API call
client = bigquery.Client(project=project_id)

# Load word vectors into a Pandas dataframe

In [None]:
# First, read through the space-delimited text file and extract the words and vectors.
# We store these as dicts inside a list to then make it easy to convert to a Pandas DataFrame.
glove = []
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding (assumes the GloVe text file is UTF-8 — TODO confirm).
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as f:
  # `total` is the expected number of lines; it only sizes the progress bar.
  for line in tqdm(f, total=2196017):
    # Drop the trailing newline so the last token is a clean number, then split
    # on single ASCII spaces only. A bare split() would also break on any other
    # whitespace character that might occur inside a token.
    split_line = line.rstrip('\n').split(" ")
    # Field 0 is the word; every remaining field is one component of its vector.
    word = split_line[0]
    embedding = np.array(split_line[1:], dtype=np.float32)
    glove.append({'word': word, 'vector': embedding})

100%|██████████| 2196017/2196017 [02:24<00:00, 15180.51it/s]


In [None]:
# Convert the list of {'word': ..., 'vector': ...} dicts into a DataFrame
# with one row per word
glove_dataframe = pd.DataFrame(glove)

In [None]:
# Check that this has worked by looking at a couple of the entries
# (sample() picks rows at random, so the rows shown vary between runs)
glove_dataframe.sample(2)

Unnamed: 0,word,vector
233068,45-year,"[0.10416, -0.27513, 0.44471, 0.40818, 0.25943,..."
2135957,1883-O,"[-0.18289, -0.39028, -0.045854, 0.40068, -0.43..."


# Create BigQuery dataset

In [None]:
dataset_id = 'word_vectors_us'

# Build the Dataset object from its "<project>.<dataset>" path; it is handed
# to the API by the create_dataset() call at the end of this cell.
dataset = bigquery.Dataset('.'.join([project_id, dataset_id]))

# Pin the dataset location to US so that it matches the source data
# in `bigquery-public-data.breathe`
dataset.location = "US"

# API request: create the dataset and rebind `dataset` to the object returned.
dataset = client.create_dataset(dataset, timeout=30)

# Upload vectors to BQ table

In [None]:
# Name of the table that will hold the word vectors:
vectors_table_id = 'glove_vectors'

# Fully-qualified "<project>.<dataset>.<table>" identifier used by the load and query cells
table_id = '.'.join((project_id, dataset_id, vectors_table_id))
print(f'Word vectors to be save to: {table_id}')

The next step is to define the table schema. We could just let the API try to determine the schema automatically, but our schema is a bit unusual so it's best to be explicit.

We want there to be a single string field for the word, and then an ARRAY field to store the vectors. 

In [None]:
# Explicit schema mirroring the dataframe's two columns:
#   word   -> a single STRING value
#   vector -> a REPEATED FLOAT field (an array of floats)
schema = [
    bigquery.SchemaField(
        name="word",
        field_type=bigquery.enums.SqlTypeNames.STRING,
    ),
    bigquery.SchemaField(
        name="vector",
        field_type=bigquery.enums.SqlTypeNames.FLOAT,
        mode="REPEATED",
    ),
]

The next step is to use the API to save the pandas dataframe to a table. When I tried to save the entire 5GB dataframe in one go, Colab crashed with a memory error. The solution I've used is to upload the data in chunks in a loop.

In [None]:
# Load-job settings shared by every chunk upload: apply the explicit schema,
# and append to the target table rather than replacing its contents.
job_config = bigquery.LoadJobConfig(schema=schema)
job_config.write_disposition = "WRITE_APPEND"

In [None]:
# Upload the dataframe to the target table in fixed-size chunks, one load job
# per chunk, so the whole frame never has to be serialised in one go.
# NOTE: job_config uses WRITE_APPEND, so re-running this cell against an
# already-populated table adds the rows a second time.
chunk_size = 250000
n_rows = len(glove_dataframe)
for i in range(0, n_rows, chunk_size):
  # iloc slices are end-exclusive, so the final chunk must stop at n_rows
  # (not n_rows - 1); otherwise the last row is never uploaded.
  end = min(i + chunk_size, n_rows)
  print(f'Uploading from index {i} to {end}')
  job = client.load_table_from_dataframe(
    glove_dataframe.iloc[i:end],
    table_id,
    job_config=job_config,
  )
  # Wait for this chunk's load job to complete before starting the next one.
  job.result()

Uploading from index 250000 to 500000
Uploading from index 500000 to 750000
Uploading from index 750000 to 1000000
Uploading from index 1000000 to 1250000
Uploading from index 1250000 to 1500000
Uploading from index 1500000 to 1750000
Uploading from index 1750000 to 2000000
Uploading from index 2000000 to 2196016


In [None]:
# Count the rows now in the target table to sanity-check the upload
q = client.query(f'SELECT COUNT(*) total_rows FROM {table_id}')
# Wait for the query to finish and fetch its result as a DataFrame
results = q.result().to_dataframe()

In [None]:
# Display the row count returned by the query (column `total_rows`)
results

Unnamed: 0,total_rows
0,2196016
