In [4]:
 from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/NLP_progetto_2024

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/NLP_progetto_2024


In [5]:
#Installing gensim
#!pip install --upgrade gensim

Here to import useful libraries

In [6]:
import json
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec

## 1) Preprocess Data

In [7]:
# Load the dataset
file_json = open("medical_meadow_wikidoc_medical_flashcards.json")
data = json.load(file_json)

In [8]:
# Converting the json file to a pandas dataframe
pd_dataset = pd.DataFrame.from_dict(data)
# Explore the dataframe
print(pd_dataset.head())
print(pd_dataset.columns)
pd_dataset.describe()

                       instruction  \
0  Answer this question truthfully   
1  Answer this question truthfully   
2  Answer this question truthfully   
3  Answer this question truthfully   
4  Answer this question truthfully   

                                               input  \
0  What is the relationship between very low Mg2+...   
1  What leads to genitourinary syndrome of menopa...   
2  What does low REM sleep latency and experienci...   
3  What are some possible causes of low PTH and h...   
4  How does the level of anti-müllerian hormone r...   

                                              output  
0  Very low Mg2+ levels correspond to low PTH lev...  
1  Low estradiol production leads to genitourinar...  
2  Low REM sleep latency and experiencing halluci...  
3  PTH-independent hypercalcemia, which can be ca...  
4  The level of anti-müllerian hormone is directl...  
Index(['instruction', 'input', 'output'], dtype='object')


Unnamed: 0,instruction,input,output
count,33955,33955.0,33955.0
unique,1,33289.0,33335.0
top,Answer this question truthfully,,
freq,33955,408.0,402.0


In [9]:
# We observe that the column "instruction" has only one unique value, so we are going to drop it
pd_dataset = pd_dataset.drop(columns=['instruction'])

In [10]:
# Split the dataset into two different dataframes, one for questions and one for answers
pd_dataset_questions = pd_dataset['input']
pd_dataset_answers = pd_dataset['output']

In [11]:
# Convert the dataframe to a numpy array
np_dataset_answers = pd_dataset_answers.to_numpy().flatten()
np_dataset_questions = pd_dataset_questions.to_numpy().flatten()

In [12]:
# Tokenize the answers dataset
np_dataset_answers_tokens = [re.sub('\W', ' ', sentence).lower().split() for sentence in np_dataset_answers]

In [13]:
print(np_dataset_answers_tokens[7])

['insulinoma', 'or', 'sulfonylurea', 'drugs', 'can', 'cause', 'low', 'glucose', 'and', 'high', 'c', 'peptide', 'levels']


In [14]:
# We remove only ',' and '.' since other symbols may be useful (like + for chemical formulations) and lower case
final_dataset_tokens = [re.sub(r'[.,()]', '', str(sentence)).lower() for w in np_dataset_answers_tokens for sentence in w]

In [15]:
print(final_dataset_tokens[12])

results


In [16]:
# We encode 'utf-8' all the tokens
# Necessary to train Word2Vec
final_encoded = [s.encode('utf-8').split() for s in final_dataset_tokens]

In [17]:
print(final_encoded[23])

[b'syndrome']


We are not removing stopwords right now since we are going to do word2vec
Now we have our dataset, we are going to work on the word2vec implementation

In [18]:
word2vec = Word2Vec(final_encoded, vector_size=50, min_count=10, window=20)

In [19]:
len(word2vec.wv)

8721

In [20]:
# Visualize the vector for the word 'blood'
term = 'blood'
word2vec.wv[term.encode('utf-8')]

array([[-2.34196894e-03, -4.74080583e-03, -3.63711128e-03,
        -2.35471013e-03, -1.30342217e-02, -7.08977226e-03,
        -6.39467500e-03,  1.54445888e-02,  1.17338225e-02,
        -1.78850407e-03, -9.63312667e-03,  1.05846096e-02,
        -1.76752694e-02,  3.29897879e-03,  8.02200288e-03,
        -3.89539008e-03, -9.04865004e-03, -1.64553877e-02,
        -1.65748280e-02,  1.42547628e-02,  7.27420347e-03,
        -7.72836211e-04,  1.94214899e-02, -1.70882717e-02,
         1.17854599e-03, -8.88593204e-04, -1.61409322e-02,
        -1.81996748e-02,  1.43775251e-02, -7.26578711e-03,
        -1.58020929e-02,  1.59053560e-02, -1.25085544e-02,
        -6.24371273e-03,  3.55124241e-03, -6.70282589e-03,
        -1.59294344e-02, -7.84253608e-03, -1.80282425e-02,
         2.79551512e-03, -9.25740227e-03, -4.01663783e-05,
         5.44829853e-03, -4.74613905e-03,  1.76363345e-02,
         1.40850879e-02, -1.10177754e-03, -1.27002003e-03,
         1.70069456e-03,  1.35006597e-02],
       [-1.57

In [21]:
word2vec.wv.n_similarity(['blood'.encode('utf-8')], ['glucose'.encode('utf-8')])
# Visualize the most similar words to glucose
word2vec.wv.similar_by_word('glucose'.encode('utf-8'))

[(b'headache', 0.4783393442630768),
 (b'rays', 0.4653876721858978),
 (b'lowering', 0.451872318983078),
 (b'becker', 0.4438175857067108),
 (b'injected', 0.44271421432495117),
 (b'temperature', 0.43988871574401855),
 (b'zoster', 0.42546725273132324),
 (b'subperiosteal', 0.4164178967475891),
 (b'right', 0.41351082921028137),
 (b'weakens', 0.41124093532562256)]

In [22]:
import random

# Select 500 random samples
sample = random.sample(list(word2vec.wv.key_to_index), 500)
sample_decoded = []
for s in sample:
  sample_decoded.append(s.decode('utf-8'))


In [23]:
word_vectors = word2vec.wv[sample]
word_vectors

array([[ 0.01303462,  0.00832977, -0.01951368, ..., -0.01242318,
         0.00497906, -0.00171659],
       [-0.00693002,  0.00723173,  0.00424167, ..., -0.00481276,
         0.01101673,  0.01202074],
       [ 0.01263914, -0.01871311,  0.01410609, ...,  0.01615741,
        -0.01637337,  0.01876849],
       ...,
       [-0.01896856,  0.01443628,  0.01813582, ..., -0.00124633,
         0.0017246 , -0.00249093],
       [-0.00190813, -0.00689055,  0.01365748, ...,  0.00874928,
         0.01238842,  0.00601917],
       [-0.01941793,  0.01155131,  0.01036901, ...,  0.01717287,
         0.0033099 ,  0.00465026]], dtype=float32)

In [24]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3, n_iter=2000)

tsne_embedding = tsne.fit_transform(word_vectors)

In [25]:
x, y, z = np.transpose(tsne_embedding)

In [26]:
#!pip install plotly



In [27]:
import plotly.express as px

# Visualize the samples extracted before
fig = px.scatter_3d(x=x, y=y, z=z)
fig.update_traces(marker=dict(size=3,line=dict(width=2)))
fig.show()

In [28]:
# Visualize samples with the label
fig = px.scatter_3d(x=x[:200],y=y[:200],z=z[:200],text=sample_decoded[:200])
fig.update_traces(marker=dict(size=3,line=dict(width=2)),textfont_size=10)
fig.show()

In [29]:
# Add some specific terms to sample:
words = ['blood', 'heart']
words_encoded = []
for w in words:
  words_encoded.append(w.encode('utf-8'))

word_vectors = word2vec.wv[words_encoded+sample]

tsne = TSNE(n_components=3)
tsne_embedding = tsne.fit_transform(word_vectors)

x, y, z = np.transpose(tsne_embedding)

In [31]:
import plotly.express as px

# Visualize the position of the terms specified before

r = (-200,200)
fig = px.scatter_3d(x=x, y=y, z=z, range_x=r, range_y=r, range_z=r, text=words + [None] * 500)
fig.update_traces(marker=dict(size=3,line=dict(width=2)),textfont_size=10)
fig.show()

## References


*   Tutorials of Natural Language Processing course (A.Y. 2023/2024) at PoliMi


