## Working with the CSV file


In [None]:
# Notebook setup: pandas for tabular data, plus access to Google Drive.
import pandas as pd
# Let pandas display up to 100 rows before truncating printed frames.
pd.set_option('display.max_rows', 100)

# google.colab is imported here, so this cell is written for the Colab runtime.
from google.colab import drive
# Mount Drive at /content/drive; the dataset paths used in later cells live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the dataset folder; relative paths used further
# down (e.g. the 'icd_embedded.csv' export) resolve against it.
%cd '/content/drive/MyDrive/softpr_sose23/Datasets'

/content/drive/MyDrive/softpr_sose23/Datasets


In [None]:
# Load the ICD definitions table. The path is absolute, so this read does not
# depend on the current working directory.
df = pd.read_csv('/content/drive/MyDrive/softpr_sose23/Datasets/icd_defs_modified.csv')
# Preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the output looks like an index saved by an
# earlier export — confirm whether it should be dropped or read as the index.
df.head()

Unnamed: 0,Disease,Description,Essential_Features
0,Disorders of intellectual development,Disorders of intellectual development are a gr...,The presence of significant limitations in int...
1,Developmental speech or language disorders,Developmental speech or language disorders ari...,Persistent difficulties in understanding or pr...
2,Autism spectrum disorder,Autism spectrum disorder is characterised by p...,Persistent deficits in initiating and sustaini...
3,Developmental learning disorder,Developmental learning disorder is characteris...,The presence of significant limitations in lea...
4,Developmental motor coordination disorder,Developmental motor coordination disorder is c...,Significant delay in the acquisition of gross ...


In [None]:
# Collect the disease names as a plain Python list, in DataFrame row order.
# Series.tolist() replaces the manual append loop and yields the same elements.
disease_names = df["Disease"].tolist()

In [None]:
# Show the last collected disease name as a quick sanity check of the list.
disease_names[-1]

'Gender incongruence of childhood'

## Pre-processing

Remove punctuation.

Convert all the texts to lowercase.

Tokenize the texts into individual words.

Remove stop words, such as "and", "the", "a", etc.

Perform lemmatization on the remaining words to convert them to their base form.

Filter out any words that occur infrequently in the corpus to reduce the dimensionality of the data.

Create word embeddings of the texts.

In [None]:
# pre-processing the descriptions for further work
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources needed below: the stop-word lists and the WordNet data.
nltk.download("stopwords")
nltk.download('wordnet')

# English stop words as a set (fast membership tests) and the WordNet lemmatizer;
# both are used by the cleaning step in the next cell.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Join description and essential features into one text per row (missing values
# become empty strings), then split on whitespace into a list of raw tokens.
df["Description_words"] = (
    df["Description"].fillna('') + ' ' + df["Essential_Features"].fillna('')
).str.split()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Translation table that deletes every ASCII punctuation character. Built once
# instead of once per word.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _clean_tokens(tokens):
    """Normalise one row of tokens and return them as a single space-joined string.

    Each token is lower-cased and stripped of ASCII punctuation. A token is dropped
    when it is a stop word (checked both before and after stripping, so "the," and
    contractions such as "don't" are both caught), when nothing is left after
    stripping, or when it contains a digit. Surviving tokens are lemmatized.

    Accepts either a list of tokens or an already-joined string, so re-running the
    cell does not iterate over the characters of a previously cleaned string.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()

    cleaned = []
    for token in tokens:
        lowered = token.lower()
        word = lowered.translate(_PUNCT_TABLE)
        # Empty after stripping means the token was punctuation only; keeping it
        # would leave doubled spaces in the joined text.
        if not word:
            continue
        if lowered in stop_words or word in stop_words:
            continue
        if any(char.isdigit() for char in word):
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return ' '.join(cleaned)


df['Description_words'] = df['Description_words'].apply(_clean_tokens)

In [None]:
# Display the cleaned text of the first row (index label 0) to eyeball the result.
df.loc[0, 'Description_words']

'disorder intellectual development group etiologically diverse condition originating developmental period characterised significantly average intellectual functioning adaptive behaviour approximately two standard deviation mean approximately le percentile based appropriately normed individually administered standardized test appropriately normed standardized test available diagnosis disorder intellectual development requires greater reliance clinical judgment based appropriate assessment comparable behavioural indicator presence significant limitation intellectual functioning across various domain perceptual reasoning working memory processing speed verbal comprehension often substantial variability extent domain affected individual whenever possible performance measured using appropriately normed standardized test intellectual functioning found approximately standard deviation mean ie approximately le percentile situation appropriately normed standardized test available assessment int

# Obtaining the text embeddings

Training an embedding model from scratch requires a neural network and a large corpus of text data. However, pre-trained models are readily available online, so we use one of them in this project to embed each pre-processed description as a whole. See a complete list of pre-trained models [here](https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md).

In [None]:
!pip3 install tueplots==0.0.5
!pip3 install transformers
!pip install torch --upgrade
!pip3 install sentence-transformers==2.2.2

Collecting tueplots==0.0.5
  Downloading tueplots-0.0.5-py3-none-any.whl (18 kB)
Installing collected packages: tueplots
Successfully installed tueplots-0.0.5
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the pre-trained model handed to SentenceTransformer.
# NOTE(review): `name` is reused as a loop variable in a later cell, so after that
# cell runs it no longer holds the model identifier.
name = "stsb-bert-large"
# Instantiate the model (presumably fetches the weights on first use — confirm).
model = SentenceTransformer(name)

In [None]:
# One cleaned text per row, in DataFrame order.
sentences = df["Description_words"].tolist()
# Encode all texts in one call; the result feeds the Embeddings column and the PCA plots.
disorder_embeddings = model.encode(sentences)

In [None]:
# Number of embeddings produced; it has to equal the number of rows in df for the
# column assignment in the next cell to work.
len(disorder_embeddings)

90

In [None]:
# Store each embedding as a plain Python list in a new 'Embeddings' column.
df['Embeddings'] = disorder_embeddings.tolist()

In [None]:
# Preview the frame, now including the Description_words and Embeddings columns.
df.head()

Unnamed: 0,Disease,Description,Essential_Features,Description_words,Embeddings
0,Disorders of intellectual development,Disorders of intellectual development are a gr...,The presence of significant limitations in int...,disorder intellectual development group etiolo...,"[0.1327546238899231, -0.5305650234222412, 0.24..."
1,Developmental speech or language disorders,Developmental speech or language disorders ari...,Persistent difficulties in understanding or pr...,developmental speech language disorder arise d...,"[0.5160753130912781, -0.642192006111145, -0.37..."
2,Autism spectrum disorder,Autism spectrum disorder is characterised by p...,Persistent deficits in initiating and sustaini...,autism spectrum disorder characterised persist...,"[0.25560277700424194, -0.8141465187072754, -0...."
3,Developmental learning disorder,Developmental learning disorder is characteris...,The presence of significant limitations in lea...,developmental learning disorder characterised ...,"[0.8489938974380493, -0.2814409136772156, 0.57..."
4,Developmental motor coordination disorder,Developmental motor coordination disorder is c...,Significant delay in the acquisition of gross ...,developmental motor coordination disorder char...,"[0.17629142105579376, -0.07963546365499496, -0..."


In [None]:
# Write the enriched table to 'icd_embedded.csv' in the current working directory,
# without the DataFrame index.
# NOTE(review): the Embeddings column holds Python lists, which CSV stores as their
# text representation — readers will presumably have to parse them back; confirm
# with whoever consumes this file.
df.to_csv('icd_embedded.csv', index=False)

# Projection of the embeddings onto 3D space



In [None]:
# Disease names in DataFrame row order, used to label the points in the plots below.
# Series.tolist() replaces the manual append loop and yields the same elements.
disease_names = df["Disease"].tolist()

In [None]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA

# Project the embeddings onto their first three principal components.
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(disorder_embeddings)

# Table for plotly: one row per disorder with its three coordinates and its name.
data_df = pd.DataFrame(
    embeddings_3d, columns=['PC1', 'PC2', 'PC3']
).assign(Disease=disease_names)

# Interactive 3D scatter plot, each point labelled with its disease name.
fig = px.scatter_3d(data_df, x='PC1', y='PC2', z='PC3', text='Disease')
fig.show()

# Projection of the embeddings onto 2D space

In [None]:
# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(disorder_embeddings)

# Table for plotly: one row per disorder with its two coordinates and its name.
data_df = pd.DataFrame(
    embeddings_2d, columns=['PC1', 'PC2']
).assign(Disease=disease_names)

# Interactive 2D scatter plot, each point labelled with its disease name.
fig = px.scatter(data_df, x='PC1', y='PC2', text='Disease')
fig.show()
