# Embeddings per word

## Setting up variables

In [1]:
# N.B. originally implemented in Google Colab; if running there with Google Drive mounted, uncomment the two lines below and update the file paths used in this notebook accordingly
# from google.colab import drive
# drive.mount('/drive')

In [5]:
# change the variables in this cell according to the model and words you are analysing 
# data_length: not referenced by any cell visible in this notebook (the loops below
# hard-code 1000 sentences and 500 rows) -- NOTE(review): confirm whether it is still needed
data_length = 500
# language: used to build the sentence-dataset filename and the result filenames
language = 'en'
# model_name: passed to from_pretrained for BERT checkpoints and embedded in result filenames
model_name = 'bert-base-uncased' #enter 'camembert' if using 'camembert-base-ccnet'
# model_size: length every tokenised sentence is zero-padded to in bert_text_preparation
model_size = 512
# words: target words; each one needs a file Sentence_Datasets/<word>_<language>_unique.txt
words = ['auteur','author'] 

In [3]:
# dictionary mapping a target word to ONE alternative spelling that should also count as an
# occurrence of that word (diacritic variants, plus hyphenation / spacing variants).
# The lookup is one-directional: get_target_embeddings only checks whether the target word is a
# KEY here, so the spelling used in `words` must be the key, not the value.
alternate_spellings = {'a la mode':'à la mode','académie':'academie','après':'apres','depanneur':'dépanneur','detente':'détente','faux pas':'faux-pas','fete':'fête','fin de siecle':'findesiecle','gite':'gîte','lycee':'lycée','rendezvous':'rendez vous','cafe':'café','cliche':'cliché','elite':'élite','etiquette':'étiquette','facade':'façade','naive':'naïve','end of century':'end-of-century','middle class':'middleclass','working class':'workingclass'}

## Word Embeddings

In [4]:
import pandas as pd
import numpy as np
import torch



In [5]:
# uncomment the following if running in colab
# !pip install transformers
# !pip install sentencepiece

In [10]:
from transformers import BertTokenizer, BertForMaskedLM,CamembertForMaskedLM, CamembertTokenizer


# Pick the tokenizer/model classes and the checkpoint id first, then load both the same way.
# `model_name == "camembert"` is a shorthand for the CamemBERT ccnet checkpoint; any other value
# is treated as a BERT checkpoint name and handed to from_pretrained unchanged.
if model_name == "camembert":
  _checkpoint = "camembert/camembert-base-ccnet"
  _tokenizer_class, _model_class = CamembertTokenizer, CamembertForMaskedLM
else:
  _checkpoint = model_name
  _tokenizer_class, _model_class = BertTokenizer, BertForMaskedLM

tokenizer = _tokenizer_class.from_pretrained(_checkpoint)
# output_hidden_states=True makes the forward pass expose every layer's hidden states,
# which get_bert_embeddings reads through `outputs.hidden_states`
model = _model_class.from_pretrained(_checkpoint, output_hidden_states = True)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Functions

In [12]:
def bert_text_preparation(text, tokenizer, max_length=None):
  """
  Preprocesses one sentence into the padded tensors the model is called with.

  The sentence is wrapped in the tokenizer's own boundary tokens (``cls_token`` /
  ``sep_token``), falling back to ``[CLS]`` / ``[SEP]`` when the tokenizer does not
  define them. Hard-coding the BERT markers breaks tokenizers whose boundary tokens
  are different (e.g. CamemBERT): the markers would be split into ordinary word
  pieces and callers that drop the first and last token would drop the wrong ones.

  Parameters
  ----------
  text : str
      Raw sentence.
  tokenizer : object
      Must provide ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
  max_length : int, optional
      Length the id sequence is right-padded to. When omitted, the notebook-level
      ``model_size`` setting is used.

  Returns
  -------
  tokenized_text : list of str
      Tokens including the two boundary tokens.
  tokens_tensor : torch.Tensor
      Token ids, shape (1, padded length), padded with 0.
  segments_tensor : torch.Tensor
      Same shape; 1 for real tokens and 0 for padding. get_bert_embeddings passes it
      as the model's second positional argument, which for the Hugging Face
      BERT / CamemBERT masked-LM models is the attention mask.

  Notes
  -----
  Sequences longer than ``max_length`` are NOT truncated: they are returned
  unpadded at their full length. The per-word loop relies on the model raising
  RuntimeError for such inputs and skips the sentence.
  """
  if max_length is None:
    max_length = model_size  # notebook-level setting: maximum sequence length for the model

  cls_token = getattr(tokenizer, 'cls_token', None) or "[CLS]"
  sep_token = getattr(tokenizer, 'sep_token', None) or "[SEP]"

  marked_text = cls_token + " " + text + " " + sep_token
  tokenized_text = tokenizer.tokenize(marked_text)
  token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
  segment_ids = [1]*len(token_ids)

  # Pad token IDs and mask; never negative, so over-long input is left as is
  padding_length = max(0, max_length - len(token_ids))

  padded_token_ids = token_ids + [0] * padding_length
  padded_segment_ids = segment_ids + [0] * padding_length

  # convert inputs to tensors with a leading batch dimension of 1
  tokens_tensor = torch.tensor([padded_token_ids])
  segments_tensor = torch.tensor([padded_segment_ids])

  return tokenized_text, tokens_tensor, segments_tensor

In [17]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Run the model once and return its last three hidden states, token axis first.

    `model` is called as ``model(tokens_tensor, segments_tensor)`` with gradient
    tracking switched off, and the result must expose ``hidden_states``.

    Returns a 3-tuple ordered from the third-to-last hidden state to the last one.
    Each element has axes 0 and 1 of the corresponding hidden state swapped, so that
    indexing it with a token position selects that token
    (assumes hidden states are laid out as (batch, tokens, hidden) -- TODO confirm).
    """
    # no gradients are needed for feature extraction
    with torch.no_grad():
        hidden_states = model(tokens_tensor, segments_tensor).hidden_states
        last_three = hidden_states[-3:]

    token_first = []
    for state in last_three:
        # squeeze(dim=1) only drops axis 1 when that axis has size 1; otherwise it
        # leaves the tensor untouched
        state = torch.squeeze(state, dim=1)
        # swap axes 0 and 1 so callers can loop over tokens
        token_first.append(state.permute(1, 0, 2))

    return token_first[0], token_first[1], token_first[2]

In [14]:
def make_dataframe_all(context_tokens,context_embeddings_dict,token_sent_index):
  """
  Process the per-token embeddings into a dataframe with one row per token.

  Parameters
  ----------
  context_tokens : list of str
      Token text, one entry per row.
  context_embeddings_dict : dict
      Must contain the keys 'layer_10', 'layer_11' and 'layer_12', each a list
      (same length as `context_tokens`) of torch tensors or array-likes.
      The dict and its lists are left untouched, so the function can be called
      again on the same input (the previous version overwrote the lists with
      numpy arrays and then failed on `.detach()` when re-run).
  token_sent_index : list of int
      Sentence number of each token.

  Returns
  -------
  pandas.DataFrame
      Columns 'text', 'embeddings_layer_12', 'embeddings_layer_11',
      'embeddings_layer_10', 'sentence_no'; every embedding cell is a flattened
      1-D numpy array.
  """
  def _to_flat_array(element):
    # tensors are detached and moved to host memory first so the conversion also
    # works for tensors that track gradients or live on an accelerator
    if torch.is_tensor(element):
      element = element.detach().cpu().numpy()
    # flatten() copies, so the stored array does not alias the input
    return np.asarray(element).flatten()

  flattened = {
      layer: [_to_flat_array(element) for element in context_embeddings_dict[layer]]
      for layer in ('layer_12', 'layer_11', 'layer_10')
  }

  context_data = pd.DataFrame({
      'text': context_tokens,
      'embeddings_layer_12': flattened['layer_12'],
      'embeddings_layer_11': flattened['layer_11'],
      'embeddings_layer_10': flattened['layer_10'],
      'sentence_no': token_sent_index,
  })

  return context_data

In [15]:
def get_target_embeddings(word,context_data,tokenized_word,sentences,alternate_spellings,max_rows=500): # word e.g. [chanson], tokenized_word = ['chan','##son']
    """
    Select only the embeddings for the target word for clustering.

    Two kinds of hits are collected:

    * whole-token hits: rows whose text is the word (or its alternate spelling),
      optionally followed by a plural 's' and/or a '</w>' suffix;
    * split-word hits (only when `tokenized_word` has more than one piece): runs of
      consecutive rows inside one sentence that spell out `tokenized_word` (each
      piece after the first may carry a trailing 's'). Such a hit is represented by
      the row of its first piece, with each layer's embedding replaced by the mean
      of the embeddings of ALL pieces.

    Parameters
    ----------
    word : str
        Target word.
    context_data : pandas.DataFrame
        Needs the columns 'text', 'sentence_no', 'embeddings_layer_12',
        'embeddings_layer_11' and 'embeddings_layer_10'. Rows are addressed by
        position. The frame and the arrays it stores are not modified.
    tokenized_word : list of str
        The target word as split by the tokenizer.
    sentences : list of str
        Original sentences; `sentence_no` values index into this list.
    alternate_spellings : dict
        Optional alternative spelling, looked up with `word` as the key.
    max_rows : int, optional
        Maximum number of rows returned (default 500).

    Returns
    -------
    pandas.DataFrame
        The selected rows plus an 'original' column with the source sentence,
        trimmed to the first `max_rows` rows.
    """
    layer_columns = ['embeddings_layer_12','embeddings_layer_11','embeddings_layer_10']

    # every spelling that counts as a whole-token hit
    stems = [word]
    if word in alternate_spellings: # if there is an alternative way to spell the word
        stems.append(alternate_spellings[word])
    surface_forms = set()
    for stem in stems:
        surface_forms.update((stem, stem+'</w>', stem+'s', stem+'s'+'</w>'))
    main_df = context_data[context_data['text'].isin(surface_forms)]

    if len(tokenized_word)>1: # if the word has been split to be tokenized
        token_texts = context_data['text'].tolist()
        sentence_ids = context_data['sentence_no'].tolist()
        n_pieces = len(tokenized_word)
        layer_positions = {layer: context_data.columns.get_loc(layer) for layer in layer_columns}

        matched_starts = []
        av_emb = {layer: [] for layer in layer_columns}

        for start, token in enumerate(token_texts):
            if token != tokenized_word[0]:
                continue
            # not enough rows left for the remaining pieces
            if start + n_pieces > len(token_texts):
                continue
            # every following piece must match and belong to the same sentence, so a
            # hit can never be stitched together from the end of one sentence and
            # the start of the next
            is_match = all(
                sentence_ids[start+j] == sentence_ids[start]
                and token_texts[start+j] in (tokenized_word[j], tokenized_word[j]+'s')
                for j in range(1, n_pieces)
            )
            if not is_match:
                continue

            matched_starts.append(start)
            for layer in layer_columns:
                pieces = [context_data.iloc[start+k, layer_positions[layer]] for k in range(n_pieces)]
                # mean over all pieces; np.mean builds a new array, so the arrays
                # stored in context_data are left as they are
                av_emb[layer].append(np.mean(pieces, axis=0))

        # copy so the averaged embeddings are written to an independent frame
        split_tokens = context_data.iloc[matched_starts].copy()
        for layer in layer_columns:
            split_tokens[layer] = av_emb[layer]
        w_df = pd.concat([main_df,split_tokens])
    else:
        w_df = main_df.copy()

    # adding original sentence to df
    sentence_dict = dict(enumerate(sentences))
    w_df['original'] = [sentence_dict[number] for number in w_df['sentence_no'].tolist()]

    # trimming the dataframe
    return w_df.iloc[:max_rows,:].copy()


## For loop for each word

In [18]:
from collections import OrderedDict

# For every target word: read its sentences, embed each sentence, collect one row per
# token (text, three layer embeddings, sentence number), keep the rows belonging to the
# target word and pickle them.
for word in words:

  # checking length and initiating variable of sentences
  # (explicit utf-8 so accented characters are read the same way on every platform)
  with open('Sentence_Datasets/'+word+'_'+language+'_unique.txt','r',encoding='utf-8') as final_list:
      sentences = final_list.readlines()
      sentences = sentences[:1000]

  tokenized_word = tokenizer.tokenize(word)
  print('Tokenized_word: ',tokenized_word)

  context_embeddings_dict = {'layer_10':[],
                            'layer_11':[],
                            'layer_12':[]}

  context_tokens = []
  token_sent_index = []

  # sentence -> index of its first occurrence; same value sentences.index(sentence)
  # would give, without rescanning the list for every sentence
  first_index = {}
  for position, sentence in enumerate(sentences):
    first_index.setdefault(sentence, position)

  skipped_sentences = 0

  for sentence in sentences:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
    try:
      layer_10, layer_11, layer_12 = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    except RuntimeError:
      # the model rejected this input (e.g. more tokens than it supports): leave the
      # sentence out, but keep count so the loss of data is visible
      skipped_sentences += 1
      continue

    sentence_index = first_index[sentence]

    # loop over the tokens of the sentence, leaving out the first and last (boundary)
    # tokens; enumerate from 1 so current_index is the token's position in
    # tokenized_text, i.e. its row in the layer tensors
    for current_index, token in enumerate(tokenized_text[1:-1], start=1):
      # save values; clone() stores just this token's vector instead of a view that
      # would keep the whole padded hidden-state tensor of the sentence in memory
      token_sent_index.append(sentence_index)
      context_tokens.append(token)
      context_embeddings_dict['layer_10'].append(layer_10[current_index].clone())
      context_embeddings_dict['layer_11'].append(layer_11[current_index].clone())
      context_embeddings_dict['layer_12'].append(layer_12[current_index].clone())

  if skipped_sentences:
    print('Sentences skipped because the model raised RuntimeError: ',skipped_sentences)

  context_data = make_dataframe_all(context_tokens,context_embeddings_dict,token_sent_index)

  w_df = get_target_embeddings(word,context_data,tokenized_word,sentences,alternate_spellings)

  print('Filtered data frame length: ',len(w_df)) #checking enough samples

  w_df.to_pickle('Results/Results_Pickles/'+word+'_'+language+'_'+model_name+'.pkl') # save results to pickle 


Tokenized_word:  ['auteur']


KeyboardInterrupt: 

## Clustering

In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import plotly.express as px
from csv import writer
from scipy.spatial.distance import pdist

layers = ['10','11','12']

# cluster counts explored for every word/layer
min_clusters = 2
max_silhouette_k = 10   # silhouette scores are computed for k = 2..10
max_wcss_k = 6          # within-cluster sums of squares are recorded for k = 2..6

# For every word and layer: project the embeddings to 2-D with PCA, run KMeans for a range
# of k, pick the k with the best silhouette score, plot the clusters, and append summary
# statistics to CSV files plus a per-word Excel sheet for qualitative analysis.
for word in words:

  # NOTE: unpickling can execute arbitrary code; only load pickles written by this notebook
  w_df = pd.read_pickle('Results/Results_Pickles/'+word+'_'+language+'_'+model_name+'.pkl') #N.B. pickle files not included due to file size

  # silhouette_score needs at least 2 and at most n_samples - 1 clusters
  largest_k = min(max_silhouette_k, len(w_df) - 1)
  if largest_k < min_clusters:
    print('Skipping '+word+': too few occurrences to cluster ('+str(len(w_df))+')')
    continue

  for layer in layers:
      X = np.array(w_df['embeddings_layer_'+layer].tolist()) # word_embeddings

      pca = PCA(2)
      df = pca.fit_transform(X)

      word_embeddings = df

      # One KMeans fit per k serves both the silhouette score and the WCSS value.
      # With a fixed random_state the fit is reproducible, so the labels can also be
      # reused for the final clustering instead of refitting.
      silhouette_scores = []
      wcss = []
      labels_by_k = {}

      for k in range(min_clusters, largest_k + 1):
          kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
          labels = kmeans.fit_predict(word_embeddings)
          labels_by_k[k] = labels
          silhouette_scores.append(silhouette_score(word_embeddings, labels))
          if k <= max_wcss_k:
              wcss.append(kmeans.inertia_)

      # saving the wcss to plot elbow plots
      # (file name follows `language` instead of a hard-coded 'en'; newline='' is what the
      # csv module asks for so no blank rows appear on platforms that translate newlines)
      with open('Results/Other_Results/'+layer+'_'+language+'.csv','a',newline='') as elbow_csv:
        el_writer = writer(elbow_csv)
        el_writer.writerow([word,wcss])


      # Create text annotations for each point
      annotations = w_df['original'].tolist()

      # scores are stored in order of k starting at min_clusters
      max_silhouette_clusters = silhouette_scores.index(max(silhouette_scores)) + min_clusters
      clusters = labels_by_k[max_silhouette_clusters]

      # Plotting
      # Creating a dataframe with embeddings and sentences
      data = {'x': df[:, 0], 'y': df[:, 1], 'cluster': clusters, 'word': annotations}
      sentence_df = pd.DataFrame(data)

      # Create a scatter plot with hover annotations using Plotly
      fig = px.scatter(sentence_df, x='x', y='y', color='cluster', hover_data=['word'], labels={'x': 'dim 1', 'y': 'dim 2'},width=1200, height=600)
      fig.update_traces(marker=dict(size=10))

      fig.update_layout(
          title="Clusters "+word+' Layer '+layer,
          title_font_size=24,  # You can adjust the font size if needed
          )

      # Display the plot
      fig.show()

      # computing cosine distances for the full (un-projected) embeddings
      distances = pdist(X, metric='cosine')

      # calculate the standard deviation
      std_dev = np.std(distances)

      # calculate mean distance
      av = np.mean(distances)

      # save results for each layer in different file, e.g. layer 10 one file, layer 11 another file etc., each csv
      # NOTE(review): `distances` is written through its string form, which NumPy abbreviates
      # for large arrays under its default print options -- confirm this column is needed
      with open('Results/Other_Results/'+language+'_'+model_name+'_'+layer+'.csv','a',newline='') as cluster_data:
        cluster_writer = writer(cluster_data)
        cluster_writer.writerow([word, silhouette_scores, wcss, distances, std_dev, av])

      # save cluster data to excel for qualitative analysis
      # (the 'cluster' column is overwritten on every layer; each layer gets its own file)
      w_df['cluster'] = clusters
      cluster_table = w_df.drop(columns = ['embeddings_layer_12','embeddings_layer_11','embeddings_layer_10','sentence_no'])
      cluster_table.to_excel('Results/Other_Results/'+word+'_'+language+model_name+layer+'_cl.xlsx', index=False)

