Install the required modules:

In [None]:
!pip install datasets
!pip install --upgrade gensim
!pip install langdetect
!pip install plotly

Import the required modules:

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import random
import re
from datasets import load_dataset
from gensim.models.word2vec import Word2Vec
from langdetect import detect
from pandas.core.common import flatten
from sklearn.manifold import TSNE

Load the dataset:

In [None]:
# Load the OpenAssistant Guanaco dataset by its Hugging Face Hub identifier.
# The result is indexed by split name ('train' / 'test') in the next cell.
dataset = load_dataset("timdettmers/openassistant-guanaco")

In [None]:
# Pool the raw conversation strings of both splits into a single corpus.
# list() keeps the concatenation working regardless of the container type that
# column access returns (a plain list or a lazy sequence-like column object,
# depending on the installed `datasets` release — NOTE(review): confirm).
docs = list(dataset['train']['text']) + list(dataset['test']['text'])

def filter_english(strings, detector=None):
  """Return the strings whose detected language is English.

  Args:
    strings: iterable of text documents.
    detector: optional callable mapping a string to a language code.
      Defaults to the `detect` function imported at the top of the notebook.

  Returns:
    A new list containing, in their original order, the strings for which the
    detector returned 'en'. Strings on which detection raises are reported
    on stdout and skipped.
  """
  if detector is None:
    detector = detect

  english_strings = []
  for string in strings:
    try:
      is_english = detector(string) == 'en'
    except Exception:
      # Best effort: a document the detector cannot handle is reported and
      # skipped. Catching Exception (not a bare except) lets
      # KeyboardInterrupt / SystemExit propagate, so a long run can be stopped.
      print(f"Error with {string}")
      continue
    if is_english:
      english_strings.append(string)
  return english_strings

# Keep only the conversations detected as English (rebinds `docs` to the filtered list).
docs = filter_english(docs)
# Preview the first five remaining conversations.
docs[:5]

Let's separate the sentences written by the human from those written by the assistant:

In [None]:
# Every conversation is a single string in which turns are introduced by the
# markers "### Human:" and "### Assistant:".

# Human turns: the text that follows each "### Human:" marker, up to the next
# "### Assistant:" marker. Turns that never received an answer are dropped.
human_questions = [
    turn.split("### Assistant:")[0].strip()
    for conversation in docs
    for turn in conversation.split("### Human:")[1:]
    if "### Assistant:" in turn
]

# Assistant turns: the text that follows each "### Assistant:" marker, up to
# the next "### Human:" marker (or the end of the conversation).
assistant_answers = [
    turn.split("### Human:")[0].strip()
    for conversation in docs
    for turn in conversation.split("### Assistant:")[1:]
]

print(len(human_questions))
print(len(assistant_answers))

Let's clean the datasets and split them into different sentences:

In [None]:
# Raw strings: '\.' and '\s' are regex escapes, not valid Python string escapes.
_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
_SENTENCE_BOUNDARY = re.compile(r'[?!.]\s')


def _clean_and_split(doc):
  """Normalise whitespace, strip e-mail addresses and split a document into sentences.

  Args:
    doc: one turn of a conversation, as a string.

  Returns:
    A list of sentence strings, split after '?', '!' or '.' followed by whitespace.
  """
  # Newlines and tabs become a space (not an empty string): otherwise the last
  # word of a line is glued to the first word of the next one, and a sentence
  # ending at a line break is no longer followed by whitespace to split on.
  doc = re.sub(r'[\n\t]', ' ', doc)
  doc = _EMAIL_PATTERN.sub('', doc)
  return _SENTENCE_BOUNDARY.split(doc)


# Each document becomes a list of sentences (a list of lists overall).
human_questions = [_clean_and_split(doc) for doc in human_questions]
assistant_answers = [_clean_and_split(doc) for doc in assistant_answers]

Flatten the list of lists and tokenize each word:

In [None]:
def _tokenize(sentence):
  """Lower-case a sentence and split it into tokens, treating every non-word character as a separator."""
  # Raw string: '\W' is a regex class, not a valid Python string escape.
  return re.sub(r'\W', ' ', sentence).lower().split()


# Flatten the per-document sentence lists into one list of sentences and
# tokenize each sentence. A nested comprehension does the one-level flattening
# without relying on pandas' private `pandas.core.common` helpers.
human_questions = [_tokenize(sentence) for doc in human_questions for sentence in doc]
assistant_answers = [_tokenize(sentence) for doc in assistant_answers for sentence in doc]

# We also compute a combination of the two texts
full_text = human_questions + assistant_answers

 The minimum length of each sentence is 5, so we don't need to remove sentences that are composed of only a few words.

 Now we train a Word2Vec model on the human questions, providing as input the tokenized sentences, the size of each embedding, the minimum number of occurrences for each word and the context window size:



In [None]:
# Train on the tokenized human sentences: 30-dimensional embeddings, words seen
# fewer than 5 times are ignored, context window of 10 tokens.
model_human = Word2Vec(human_questions, vector_size=30, min_count=5, window=10)

How many words do we have in our model?

In [None]:
# Number of words that received an embedding in the human-question model.
len(model_human.wv)

How is each embedding vector made?

In [None]:
# Embedding vector learned for one word (length 30, as set by vector_size).
# NOTE(review): the lookup presumably fails if the word did not reach min_count — confirm.
term = 'house'
model_human.wv[term]

What are the most similar words to the word "short"?

In [None]:
# Words whose embeddings are closest to 'short' in the human-question model.
term = 'short'
model_human.wv.most_similar(term)

Now, let's do the same thing but with the assistant answers:

In [None]:
# Same hyper-parameters as the human model (30 dimensions, min_count 5,
# window 10), trained on the tokenized assistant sentences instead.
model_assistant = Word2Vec(assistant_answers, vector_size=30, min_count=5, window=10)

Its vocabulary size is more than 4 times the human one:

In [None]:
# Number of words that received an embedding in the assistant-answer model.
len(model_assistant.wv)

Let's see an example of embedding:

In [None]:
# Embedding of the same word as before, this time from the assistant-answer model.
term = 'house'
model_assistant.wv[term]

And the most similar words to the word "short":

In [None]:
# Words whose embeddings are closest to 'short' in the assistant-answer model.
term = 'short'
model_assistant.wv.most_similar(term)

Finally, let's perform the same operations with the full_text at our disposal:

In [None]:
# Same hyper-parameters again, trained on human and assistant sentences together.
model_full = Word2Vec(full_text, vector_size=30, min_count=5, window=10)

The size of the vocabulary isn't much different from that of the assistant-only model:

In [None]:
# Number of words that received an embedding in the combined model.
len(model_full.wv)

Let's look again at an example of an embedding:

In [None]:
# Embedding vector of an example word in the combined model.
term = 'plant'
model_full.wv[term]

And once again the most similar words to the word "short":

In [None]:
# Words whose embeddings are closest to 'short' in the combined model.
term = 'short'
model_full.wv.most_similar(term)

Has the model understood the relation between these words?

In [None]:
# Analogy arithmetic on the raw embeddings: king + (woman - man).
# The classic expectation for this test is a vector close to 'queen'.
vec = model_full.wv['king'] + (model_full.wv['woman'] - model_full.wv['man'])
vec

Seems it does not:

In [None]:
# Ask for the analogy through the keyed-vectors API instead of a hand-summed
# raw vector: positive/negative keys are combined on normalised vectors and the
# query words themselves are left out of the results, so 'king' or 'woman'
# cannot trivially come back as their own nearest neighbours.
model_full.wv.most_similar(positive=['king', 'woman'], negative=['man'])

And this kind of relation?

In [None]:
# Country/capital analogy on the raw embeddings: france + (rome - italy).
# The expectation is a vector close to the capital of France.
vec = model_full.wv['france'] + (model_full.wv['rome'] - model_full.wv['italy'])
vec

It seems to have a better idea of what we are talking about, but it still hasn't fully understood the relation:

In [None]:
# Ask for the analogy through the keyed-vectors API instead of a hand-summed
# raw vector: positive/negative keys are combined on normalised vectors and the
# query words themselves are left out of the results, so 'france' or 'rome'
# cannot trivially come back as their own nearest neighbours.
model_full.wv.most_similar(positive=['france', 'rome'], negative=['italy'])

# Visualizing the embedding vectors using t-SNE

We'll now produce a graphical representation of a subset of the embeddings. To reduce the time required for the computation, we'll limit our representation to 500 random samples per model:

In [None]:
SAMPLE_SIZE = 500    # upper bound on the number of words projected per model
SAMPLING_SEED = 42   # fixed seed so the same words are drawn on every run


def _sample_words(model, rng, k=SAMPLE_SIZE):
  """Draw up to k distinct vocabulary words from a trained Word2Vec model.

  Args:
    model: a trained Word2Vec model (its `wv.key_to_index` is the vocabulary).
    rng: a `random.Random` instance used for the draw.
    k: maximum number of words to return.

  Returns:
    A list of at most k distinct words.
  """
  vocabulary = list(model.wv.key_to_index)
  # random.sample raises ValueError when asked for more items than exist,
  # so cap the request at the vocabulary size.
  return rng.sample(vocabulary, min(k, len(vocabulary)))


_sampling_rng = random.Random(SAMPLING_SEED)

random_samples_human = _sample_words(model_human, _sampling_rng)
word_vectors_human = model_human.wv[random_samples_human]

random_samples_assistant = _sample_words(model_assistant, _sampling_rng)
word_vectors_assistant = model_assistant.wv[random_samples_assistant]

random_samples_full = _sample_words(model_full, _sampling_rng)
word_vectors_full = model_full.wv[random_samples_full]

We'll provide the vectors to the t-SNE algorithm, to fit a model and obtain a 3-dimensional representation:

In [None]:
# scikit-learn renamed TSNE's `n_iter` argument to `max_iter` (the old name was
# deprecated and later removed), so use whichever keyword the installed
# version exposes.
_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

# 3 output dimensions, 2000 optimisation iterations; random_state pins the
# otherwise stochastic layout so the figures are reproducible across runs.
tsne = TSNE(n_components=3, random_state=42, **{_iter_kwarg: 2000})

# fit_transform refits from scratch on each call, so the three projections are independent.
tsne_embeddings_human = tsne.fit_transform(word_vectors_human)
tsne_embeddings_assistant = tsne.fit_transform(word_vectors_assistant)
tsne_embeddings_full = tsne.fit_transform(word_vectors_full)

We transpose each matrix so that every row of the result holds one dimension, and unpack the rows to get the coordinates of each point:

In [None]:
# Each projection has one row per word and one column per t-SNE dimension;
# transposing puts the dimensions on the rows so they unpack into x, y and z.
x_human, y_human, z_human = tsne_embeddings_human.T
x_assistant, y_assistant, z_assistant = tsne_embeddings_assistant.T
x_full, y_full, z_full = tsne_embeddings_full.T

We draw the plots (we also reduce the number of samples shown, for a better representation):

In [None]:
# 3D scatter of the first 150 sampled words of the human-question model, each
# point labelled with its word. The title tells this figure apart from the two
# otherwise identical-looking plots that follow.
fig = px.scatter_3d(
    x=x_human[:150],
    y=y_human[:150],
    z=z_human[:150],
    text=random_samples_human[:150],
    title="t-SNE of word embeddings - human questions",
)
fig.update_traces(marker=dict(size=3, line=dict(width=2)))
fig.show()

In [None]:
# 3D scatter of the first 150 sampled words of the assistant-answer model, each
# point labelled with its word. The title tells this figure apart from the
# otherwise identical-looking plots of the other models.
fig = px.scatter_3d(
    x=x_assistant[:150],
    y=y_assistant[:150],
    z=z_assistant[:150],
    text=random_samples_assistant[:150],
    title="t-SNE of word embeddings - assistant answers",
)
fig.update_traces(marker=dict(size=3, line=dict(width=2)))
fig.show()

In [None]:
# 3D scatter of the first 150 sampled words of the combined model, each point
# labelled with its word. The title tells this figure apart from the otherwise
# identical-looking plots of the other models.
fig = px.scatter_3d(
    x=x_full[:150],
    y=y_full[:150],
    z=z_full[:150],
    text=random_samples_full[:150],
    title="t-SNE of word embeddings - full text",
)
fig.update_traces(marker=dict(size=3, line=dict(width=2)))
fig.show()