## Exercise: Train custom word embeddings on a healthcare dataset


Dataset credits: https://www.kaggle.com/datasets/jpmiller/layoutlm

In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
from string import punctuation
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import fasttext
import re

In [2]:
# Read the dataset file medquad.csv into a pandas DataFrame.
df_hc = pd.read_csv("datasets_nlp/medquad.csv")

In [3]:
# Show the DataFrame's shape as (rows, columns).
df_hc.shape

(16412, 4)

In [4]:
# Display the top 5 rows for a quick look at the columns and their content.
df_hc.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [5]:
# Count the missing values in each column.

df_hc.isna().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

## Preprocessing tasks

- drop rows with missing (NA) values
- reset the index to keep it consistent
- convert all text to lowercase
- remove punctuation




In [6]:
# Drop every row that contains a missing value.
# The result is re-assigned instead of using inplace=True, which keeps the
# step explicit and chainable.
df_hc = df_hc.dropna()

In [7]:
# Reset the index to maintain consistency after rows were dropped;
# drop=True discards the old index instead of inserting it as a new column.
df_hc = df_hc.reset_index(drop=True)

In [8]:
# Extract the 'answer' column on its own.
# Selecting a single column yields a pandas Series, not a DataFrame.
# .copy() makes it independent of df_hc, so cleaning the text later cannot
# write back into the original frame or trigger a SettingWithCopyWarning.
df_ans = df_hc['answer'].copy()

In [9]:
# Function that lowercases a text and removes punctuation.
# It processes a single string (one row) at a time.
def preprocess(text):
    """Normalise one text so it can be used as an embedding-training line.

    Steps, in order:
      1. lowercase the text;
      2. turn hyphens into spaces, so hyphenated terms split into words;
      3. delete every remaining character listed in ``string.punctuation``;
      4. collapse every run of whitespace (spaces, tabs, line breaks) into a
         single space and trim both ends.

    Parameters
    ----------
    text : str
        The raw text of a single row.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but punctuation and
        whitespace was present.
    """
    text = text.lower().replace("-", " ")
    # One C-level pass instead of building the result character by character.
    text = text.translate(str.maketrans("", "", punctuation))
    # Whitespace is normalised last because removing punctuation can itself
    # leave double spaces behind (e.g. "a , b").
    return " ".join(text.split())
            

In [10]:
# preprocess() handles a single row at a time, so apply it to every answer
# and keep the cleaned texts in df_ans.
# Series.map returns a new Series, so this avoids element-by-element
# assignment and does not depend on the index being exactly 0..n-1.
df_ans = df_ans.map(preprocess)

In [11]:
# Save the cleaned answers as a .txt file using to_csv;
# header=False and index=False keep the column name and row labels out of the file.
# NOTE(review): to_csv presumably still applies CSV quoting rules, so a value
# containing a line break may end up wrapped in quote characters — confirm the
# written file contains only the plain text.
df_ans.to_csv("datasets_nlp/healthcare.txt",header=False,index=False)

In [12]:
# Create custom word embeddings with fasttext.train_unsupervised, giving it
# the path of the saved .txt file. Only the path is passed, so the library's
# default settings are used for everything else.
model_hc = fasttext.train_unsupervised("datasets_nlp/healthcare.txt")

Read 3M words
Number of words:  14242
Number of labels: 0
Progress: 100.0% words/sec/thread:   51996 lr:  0.000000 avg.loss:  1.652157 ETA:   0h 0m 0s


In [13]:
# Use model.get_nearest_neighbors to list the words most similar to "ecg";
# the result is a list of (similarity score, word) pairs.
model_hc.get_nearest_neighbors("ecg")

[(0.8263229131698608, 'echocardiogram'),
 (0.8148790597915649, 'echocardiography'),
 (0.7939037084579468, 'electrocardiogram'),
 (0.791140615940094, 'ekg'),
 (0.7846466302871704, 'echo'),
 (0.7412909269332886, 'ekgs'),
 (0.7244842052459717, 'bnp'),
 (0.7211816310882568, 'kar'),
 (0.7004911303520203, 'og'),
 (0.676946222782135, 'doppler')]

In [14]:
# List the words most similar to "ligament" in the trained embeddings.
model_hc.get_nearest_neighbors("ligament")

[(0.9238018989562988, 'ligaments'),
 (0.7782121896743774, 'tendons'),
 (0.6682389974594116, 'tendon'),
 (0.6544951796531677, 'cartilage'),
 (0.6522432565689087, 'joints'),
 (0.6520083546638489, 'resurfacing'),
 (0.6491118669509888, 'extending'),
 (0.6475667357444763, 'compress'),
 (0.6425794363021851, 'widening'),
 (0.6418549418449402, 'tailbone')]

### Reference: https://fasttext.cc/docs/en/unsupervised-tutorial.html

In [15]:
# Following the fastText documentation, get the word vector of the word
# "brain"; the result is displayed as a numeric array.
model_hc.get_word_vector("brain")

array([ 0.02016333,  0.30255845,  0.04084903,  0.11208713,  0.10345652,
        0.79780525,  0.3647665 , -0.02003851,  0.57160765, -0.34668088,
        0.1862476 , -0.03828843, -0.23584977, -0.18024597,  0.21052791,
        0.21451059,  0.3546918 ,  0.0197646 ,  0.30479044,  0.16199633,
       -0.26654622,  0.27713412,  0.12995963, -0.29214403, -0.7431378 ,
       -0.3986335 , -0.2762654 ,  0.08387363,  0.44675967,  0.37873188,
        0.03612946, -0.1457945 , -0.15453799,  0.3425216 , -0.04356172,
       -0.07204971, -0.48246548,  0.4327433 ,  0.5472697 , -0.10391239,
        0.33897513, -0.15688512, -0.4303833 ,  0.2605495 , -0.06738608,
       -0.1000694 ,  0.08408111,  0.11826676, -0.67635757,  0.2858772 ,
        0.2585256 ,  0.18417582,  0.44063672, -0.33223882, -0.31031257,
       -0.1642105 ,  0.15793689,  0.2839121 ,  0.56134087,  0.37980518,
       -0.0189843 ,  0.7617114 ,  0.41815984,  0.3020979 , -0.42336625,
       -0.22726956,  0.19179755, -0.04257245, -0.17210221,  0.03