In [1]:
import requests
import pandas as pd
import io
import gzip
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [2]:

def read_gzipped_tabular_data_from_url(url, **kwargs):
    """
    Reads a gzipped tabular data file directly from a URL into a pandas DataFrame.

    The response body is streamed into pandas instead of being downloaded in
    full first, so a call that limits the rows read (e.g. ``nrows=...``) only
    transfers the part of the archive it actually parses.

    Args:
        url (str): The URL of the gzipped file (e.g., 'http://example.com/data.txt.gz').
        **kwargs: Additional keyword arguments to pass to pandas.read_csv().

    Returns:
        pandas.DataFrame: The DataFrame containing the data, or None if an error occurs
        (the error is printed, not raised).
    """
    try:
        # stream=True defers the body download; the context manager releases the
        # connection even when parsing stops early or fails.
        # timeout=(connect, read) in seconds keeps a stalled server from hanging the cell.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            # Undo any transport-level Content-Encoding (as response.content would),
            # leaving the bytes of the .gz file itself for pandas to decompress.
            response.raw.decode_content = True

            # pandas reads from the file-like raw stream and gunzips it on the fly.
            return pd.read_csv(response.raw, compression='gzip', **kwargs)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from URL: {e}")
        return None
    except Exception as e:
        # Best-effort contract: parsing/decompression problems are reported, not raised.
        print(f"An error occurred while processing the data: {e}")
        return None



In [3]:
# Import fastText vectors (first 200,000 words)

ft_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
ft = read_gzipped_tabular_data_from_url(
    ft_url,
    skiprows=[0],      # skip the first line of the file
    nrows=200_000,     # an integer row count rather than the float 2e5
    quoting=3,         # csv.QUOTE_NONE: quote characters are ordinary token text
    sep=" ",
    header=None,
    dtype={0: str},    # column 0 holds the words; keep numeric-looking tokens as text
    na_filter=False,   # tokens such as "null" or "NA" are words, not missing values
)

# The reader returns None (after printing the reason) when the download or parse fails;
# stop here rather than failing obscurely in a later cell.
if ft is None:
    raise RuntimeError("Could not load the fastText vectors from " + ft_url)


In [4]:
# Split the loaded table: column 0 becomes the word list, every other column the embedding matrix

ft_words = ft[0]
ft_embeddings = ft.drop(columns=[0]).to_numpy()

# Scale each row to unit length, so the dot product of two rows is their cosine similarity

row_magnitudes = np.linalg.norm(ft_embeddings, axis=1, keepdims=True)
ft_embeddings = ft_embeddings / row_magnitudes

# Sanity check: the squared entries of the first row should sum to 1
np.sum(np.square(ft_embeddings[0]))

np.float64(1.0000000000000009)

In [5]:
# Function to get embeddings for a word
def get_embedding(word):
    """Look up the embedding row of ``word``.

    Args:
        word (str): The token to search for in ``ft_words`` (exact match).

    Returns:
        numpy.ndarray: The row of ``ft_embeddings`` for the first match,
        or None (after printing a notice) when the word is not present.
    """
    hits = ft_words[ft_words == word].index
    if len(hits) == 0:
        print("The word '" + word +"' was not found in the unique words list.")
        return None

    # The first matching index selects the corresponding row of the matrix
    return ft_embeddings[hits[0], :]


In [None]:
# Check the embedding for "Paris": displays its row of ft_embeddings,
# or None (with a printed notice) if the word is not in ft_words
get_embedding("Paris")

In [7]:
# Best matches from word
def best_matches(word, n=10):
    """Return the ``n`` vocabulary words most similar to ``word``.

    Similarity is the dot product with the rows of ``ft_embeddings``; the
    notebook normalises those rows to unit length, so it equals the cosine.

    Args:
        word (str): Query word, looked up with ``get_embedding``.
        n (int): Number of matches to return (default 10).

    Returns:
        pandas.DataFrame: Columns "words" and "cosine", best match first,
        indexed by the position of each word in ``ft_words``. The frame is
        empty when ``word`` is not in the vocabulary.
    """
    query = get_embedding(word)
    if query is None:
        # get_embedding has already printed a notice; return an empty result
        # instead of failing with a TypeError in the matrix product below.
        return pd.DataFrame({"words": pd.Series(dtype=object),
                             "cosine": pd.Series(dtype=float)})

    dot_products = ft_embeddings @ query
    # Negate so that argsort's ascending order puts the largest similarity first
    top = np.argsort(-dot_products)[:n]
    # Rows of the matrix are positional, hence .iloc on the word Series
    return pd.DataFrame({"words": ft_words.iloc[top],
                         "cosine": dot_products[top]})


# Best matches from embedding vector
def best_matches_from_embedding(embedding, n=10):
    """Return the ``n`` vocabulary words closest to an arbitrary vector.

    Args:
        embedding (array-like): Query vector with the same dimension as the
            rows of ``ft_embeddings``; it does not need to be normalised.
        n (int): Number of matches to return (default 10).

    Returns:
        pandas.DataFrame: Columns "words" and "cosine", best match first,
        indexed by the position of each word in ``ft_words``.
    """
    embedding = np.asarray(embedding, dtype=float)
    # Divide by the Euclidean norm (not the squared norm) so the query has unit
    # length and the dot products below are true cosine similarities.
    embedding = embedding / np.linalg.norm(embedding)
    dot_products = ft_embeddings @ embedding
    # Negate so that argsort's ascending order puts the largest similarity first
    top = np.argsort(-dot_products)[:n]
    return pd.DataFrame({"words": ft_words.iloc[top],
                         "cosine": dot_products[top]})



# Best matches

In [8]:
# Ten nearest neighbours of "Paris" (n defaults to 10); the query word itself ranks first
best_matches("Paris")

Unnamed: 0,words,cosine
2258,Paris,1.0
1564,France,0.689296
26453,Parisian,0.686385
28700,Marseille,0.673107
10311,Brussels,0.667431
68573,Marseilles,0.660686
32077,Strasbourg,0.657652
29849,Toulouse,0.644216
96682,Aix-en-Provence,0.636887
39210,paris,0.627016


In [None]:
# Twenty nearest neighbours of a proper name
best_matches("Bourdieu", 20)

In [None]:
# Twenty nearest neighbours of a common noun
best_matches("sociology", 20)

In [None]:
# Ten nearest neighbours of a first name
best_matches("Julien")

# Vector semantics

In [None]:
# Analogy by vector arithmetic: words closest to Paris - France + Spain
best_matches_from_embedding(get_embedding("Paris") - get_embedding("France") + get_embedding("Spain"))

In [None]:
# Gender probe: words closest to doctor - man + woman
best_matches_from_embedding(get_embedding("doctor") - get_embedding("man") + get_embedding("woman"))

In [None]:
# Reverse direction of the gender probe: words closest to doctor - woman + man
best_matches_from_embedding(get_embedding("doctor") - get_embedding("woman") + get_embedding("man"))

# PCA representation

In [18]:
# Query set: names of musicians.
# NOTE: a later cell assigns `queries` again, so when the notebook is run top to bottom
# this list is overwritten before it is used.
queries = ["Bach", "Beethoven", "Mozart", "Haendel", "Verdi", "Bizet", "Debussy",
          "Tupac", "Eminem", "Wu-Tang",
          "Armstrong", "Coltrane", "Miles", "Ellington", "Hawkins", "Peterson"]

In [20]:
# Query set: first names. Assigning `queries` here replaces any earlier value.
queries = ["Julien", "Etienne", "Felix",
          "Marie", "Anne", "Camille",
          "Panagiotis", "Nikos",
          "Maria", "Eugenia",
          "Khaled", "Karim",
          "Maryam", "Aisha",
          "Gunther", "Gunnar", "Anders",
          "Greta", "Ursula", "Helga"]

In [None]:
## Get the submatrix of embeddings matching the names
matching_indices_boolean = ft_words.isin(queries)
matching_indices = ft_words[matching_indices_boolean].index.tolist()
submatrix = ft_embeddings[matching_indices, :]
submatrix_labels = ft_words[matching_indices]

# Queries that are not in the loaded vocabulary would otherwise vanish from the plot silently
found_queries = set(submatrix_labels)
missing_queries = [query for query in queries if query not in found_queries]
if missing_queries:
    print("Not found in the vocabulary:", missing_queries)
if len(matching_indices) < 2:
    raise ValueError("At least two query words must be in the vocabulary for a 2-component PCA.")

## Compute the PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(submatrix)

fig, ax = plt.subplots(figsize=(10, 8))

# Invisible scatter (alpha=0): hides the markers while still setting the axis limits
ax.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.0)

# Annotate each point with its corresponding string label
for i, label in enumerate(submatrix_labels):
    ax.annotate(label, (X_pca[i, 0], X_pca[i, 1]), textcoords="offset points", xytext=(0,5), ha='center')

# Label the axes with the share of variance each component explains
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} of variance)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} of variance)")
ax.set_title("PCA projection of the query word embeddings")
plt.show()


# Distance to two poles
(here using the average cosine distance, i.e. the mean of 1 − cosine similarity between the word and each pole word)

In [22]:
def distance_to_pole(word, pole_words):
    """Average cosine distance between ``word`` and a set of pole words.

    Args:
        word (str): Query word, looked up with ``get_embedding``.
        pole_words (list of str): Words defining the pole; those absent from
            ``ft_words`` are ignored.

    Returns:
        float: Mean of ``1 - dot product`` between the word's embedding and
        each pole word's row of ``ft_embeddings``. NaN when the word itself
        or every pole word is missing from the vocabulary.
    """
    embedding = get_embedding(word)
    if embedding is None:
        # get_embedding has already printed a notice; avoid a TypeError in `@`
        return np.nan

    # Positions of the pole words, used as row numbers of the embedding matrix
    pole_rows = np.flatnonzero(ft_words.isin(pole_words).to_numpy())
    if pole_rows.size == 0:
        # No pole word found: the average over an empty set is undefined
        return np.nan

    dot_products = ft_embeddings[pole_rows, :] @ embedding
    return np.average(1 - dot_products)

def pole_distance_difference(word, pole_1, pole_2):
    """Compare how far ``word`` lies from two poles.

    Args:
        word (str): Query word.
        pole_1 (list of str): Words defining the first pole.
        pole_2 (list of str): Words defining the second pole.

    Returns:
        dict: The two ``distance_to_pole`` results and their difference
        (first minus second).
    """
    # Evaluated in order: pole_1 first, then pole_2
    first, second = [distance_to_pole(word, pole) for pole in (pole_1, pole_2)]
    return {
        "distance_pole_1": first,
        "distance_pole_2": second,
        "distance_1 - distance_2": first - second,
    }

In [23]:
# Two reference word lists of equal size (ten words each):
# pole1 holds male-coded terms, pole2 female-coded terms
pole1 = ["man", "father", "brother", "he", "him", "gentleman",
           "boy", "dude", "sir", "male"]
pole2 = ["woman", "lady", "girl", "women", "female", "mother", "sister",
           "she", "her", "gal"]

In [24]:
# A positive "distance_1 - distance_2" means the word is, on average, closer to pole2 than to pole1
word = "doctor"
pole_distance_difference(word, pole1, pole2)

{'distance_pole_1': np.float64(0.7119831575370893),
 'distance_pole_2': np.float64(0.7008471026073315),
 'distance_1 - distance_2': np.float64(0.011136054929757777)}

In [25]:
# Same comparison for another occupation (positive difference: closer to pole2)
word = "nurse"
pole_distance_difference(word, pole1, pole2)

{'distance_pole_1': np.float64(0.7699731397060436),
 'distance_pole_2': np.float64(0.6327828842980713),
 'distance_1 - distance_2': np.float64(0.13719025540797236)}

In [30]:
# Same comparison; a negative difference means the word is, on average, closer to pole1
word = "CEO"
pole_distance_difference(word, pole1, pole2)

{'distance_pole_1': np.float64(0.827689953083771),
 'distance_pole_2': np.float64(0.8739041112992931),
 'distance_1 - distance_2': np.float64(-0.046214158215522105)}