## Working with the csv files


In [1]:
# Core libraries: tabular data, arrays and regular expressions.
import pandas as pd
# Show up to 100 rows whenever a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
# Colab-only: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the character-embeddings folder on the mounted Drive.
%cd '/content/drive/MyDrive/softpr_sose23/Datasets/characters_embeddings'

/content/drive/MyDrive/softpr_sose23/Datasets/characters_embeddings


In [3]:
def similarity_top(char_emb, disorder_embs, names=None, top_k=5):
  """Return the disorders whose embeddings are most similar to one character.

  Parameters
  ----------
  char_emb : array-like
      Embedding of a single character; it is flattened to one row before
      the comparison.
  disorder_embs : array-like
      One disorder embedding per row, in the same order as ``names``.
  names : sequence of str, optional
      Disorder labels, one per row of ``disorder_embs``. When omitted, the
      notebook-level ``disease_names`` list is used, so existing
      two-argument calls keep working.
  top_k : int, optional
      Number of best matches to return (default 5).

  Returns
  -------
  list of (str, float)
      ``(disease_name, similarity_score)`` tuples sorted by descending
      cosine similarity, at most ``top_k`` long.

  Raises
  ------
  ValueError
      If the number of labels differs from the number of embeddings, which
      would otherwise silently pair scores with the wrong names.
  """
  if names is None:
    # Fallback for legacy calls: relies on the notebook global being defined.
    names = disease_names
  names = list(names)

  # reshaping the character embedding to a single row so it can be compared
  # against every disorder row at once
  char_emb = np.asarray(char_emb).reshape(1, -1)
  # cosine_similarity returns one column per character row; keep that column
  similarity_scores = cosine_similarity(disorder_embs, char_emb)[:, 0]

  if len(names) != len(similarity_scores):
    raise ValueError(
        f"got {len(names)} disease names for {len(similarity_scores)} disorder embeddings"
    )

  # sorted() is stable, so disorders with equal scores keep their input order
  ranked = sorted(zip(names, similarity_scores), key=lambda pair: pair[1], reverse=True)
  return ranked[:top_k]


def convert_string_to_numpy_array(s):
    '''Convert the text form of a numeric vector into a 1-D float64 NumPy array.

    Every number found in ``s`` is extracted in order of appearance. Plain
    decimals ("0.5"), integers ("3") and scientific notation ("-1.25e-05")
    are all recognised; the previous pattern required a decimal point and
    ignored exponents, so "1.25e-05" was read as 1.25.

    Digits that are glued to a preceding letter (for example the "32" in
    "dtype=float32") are not treated as numbers.

    Parameters:
        s: string such as "[0.13, -0.53, 2.4e-05]".

    Returns:
        np.ndarray of dtype float64; empty when ``s`` contains no numbers.
    '''
    # optional sign, then (not preceded by a word character or dot) a decimal
    # or integer mantissa, then an optional exponent
    number_pattern = r'[-+]?(?<![\w.])(?:\d+\.\d*|\.\d+|\d+)(?:[eE][-+]?\d+)?'
    numbers_list = re.findall(number_pattern, s)
    return np.array([float(token) for token in numbers_list], dtype=np.float64)

In [4]:
# Load the ICD disorder table; its 'Embeddings' column holds each vector as text.
df_icd = pd.read_csv('/content/drive/MyDrive/softpr_sose23/Datasets/icd_data/icd_embedded.csv')
# Preview the first rows to check that the columns loaded as expected.
df_icd.head()

Unnamed: 0,Disease,Description,Essential_Features,Description_words,Embeddings
0,Disorders of intellectual development,Disorders of intellectual development are a gr...,The presence of significant limitations in int...,disorder intellectual development group etiolo...,"[0.1327546238899231, -0.5305650234222412, 0.24..."
1,Developmental speech or language disorders,Developmental speech or language disorders ari...,Persistent difficulties in understanding or pr...,developmental speech language disorder arise d...,"[0.5160753130912781, -0.642192006111145, -0.37..."
2,Autism spectrum disorder,Autism spectrum disorder is characterised by p...,Persistent deficits in initiating and sustaini...,autism spectrum disorder characterised persist...,"[0.25560277700424194, -0.8141465187072754, -0...."
3,Developmental learning disorder,Developmental learning disorder is characteris...,The presence of significant limitations in lea...,developmental learning disorder characterised ...,"[0.8489938974380493, -0.2814409136772156, 0.57..."
4,Developmental motor coordination disorder,Developmental motor coordination disorder is c...,Significant delay in the acquisition of gross ...,developmental motor coordination disorder char...,"[0.17629142105579376, -0.07963546365499496, -0..."


In [5]:
# Load the character table for one author (here: the Oscar Wilde file).
# NOTE(review): the frame is named df_dost although the file is oscar_wilde_embedded.csv —
# presumably a leftover name from another author's run; confirm before renaming.
df_dost = pd.read_csv('/content/drive/MyDrive/softpr_sose23/Datasets/characters_embeddings/oscar_wilde_embedded.csv')
# Preview the first rows to check that the columns loaded as expected.
df_dost.head()

Unnamed: 0,name,novel,publishing_year,gender,description,description_words,Embeddings
0,Dorian Gray,The picture of Dorian Gray,1890,m,Dorian Gray is the central character in Oscar ...,dorian gray central character oscar wilde nove...,"[0.6452938914299011, -0.5137581825256348, 0.91..."
1,Lord Henry Wotton,The picture of Dorian Gray,1890,m,Lord Henry Wotton is a central character in Os...,lord henry wotton central character oscar wild...,"[0.9790019392967224, -0.5674798488616943, 1.04..."
2,Basil Hallward,The picture of Dorian Gray,1890,m,Basil Hallward is a significant character in O...,basil hallward significant character oscar wil...,"[0.6276613473892212, -0.14477911591529846, 0.6..."
3,Sibyl Vane,The picture of Dorian Gray,1890,f,Sibyl Vane is a pivotal character in Oscar Wil...,sibyl vane pivotal character oscar wilde novel...,"[0.4086354076862335, -0.3449292778968811, -0.3..."
4,James Vane,The picture of Dorian Gray,1890,m,James Vane is a character in Oscar Wilde's nov...,james vane character oscar wilde novel the pic...,"[0.6833420991897583, 0.16091173887252808, 0.79..."


In [6]:
# Parse the text in each table's 'Embeddings' column into float arrays,
# stored in a new 'numpy_array' column (disorders first, then characters).
for embedded_frame in (df_icd, df_dost):
    embedded_frame['numpy_array'] = embedded_frame['Embeddings'].apply(convert_string_to_numpy_array)

## Cosine similarity
**Higher cosine similarity score:** A higher cosine similarity score indicates a stronger resemblance between the compared vectors. When the score is close to 1, the vectors point in nearly the same direction in the multi-dimensional space; in other words, they are closely aligned in semantic meaning or context.

**Lower cosine similarity score:** A lower cosine similarity score indicates less similarity between the compared vectors. When the score is close to 0, the vectors are nearly orthogonal, i.e. they point in unrelated directions in the multi-dimensional space, which implies little semantic similarity or contextual resemblance. (Scores can also be negative, down to -1, when the vectors point in opposite directions.)

In [7]:
# Disorder labels as a plain list, in the same row order as df_icd.
disease_names = list(df_icd["Disease"])

In [8]:
# Collect the per-row arrays into a single NumPy array of disorder embeddings.
# Assumes every parsed embedding has the same length, so the result is 2-D — TODO confirm.
icd_embeddings = np.array(df_icd["numpy_array"].tolist())

In [9]:
# Spot-check: the parsed embedding of the character at index label 0.
df_dost["numpy_array"][0]

array([ 0.64529389, -0.51375818,  0.91574383, ...,  0.02556077,
        1.0349133 , -0.39464551])

In [10]:
# Map every character name to its best-matching disorders.
disorders_characters = {}

# Pair each character directly with its own embedding. The previous loop took
# its index range from the number of ICD embeddings, so the number of
# characters processed depended on the size of an unrelated table.
# Note: the dict is keyed by name, so a repeated name keeps only its last result.
for name, char_embedding in zip(df_dost["name"], df_dost["numpy_array"]):
    disorders_characters[name] = similarity_top(char_embedding, icd_embeddings)

In [11]:
# Attach each character's ranked disorder list as a new column; a character
# without an entry in disorders_characters gets an empty list.
df_dost['similarity_scores'] = df_dost['name'].map(
    lambda character_name: disorders_characters.get(character_name, [])
)

In [12]:
# Preview the frame with the new 'numpy_array' and 'similarity_scores' columns.
df_dost.head()

Unnamed: 0,name,novel,publishing_year,gender,description,description_words,Embeddings,numpy_array,similarity_scores
0,Dorian Gray,The picture of Dorian Gray,1890,m,Dorian Gray is the central character in Oscar ...,dorian gray central character oscar wilde nove...,"[0.6452938914299011, -0.5137581825256348, 0.91...","[0.6452938914299011, -0.5137581825256348, 0.91...",[(Secondary obsessive-compulsive or related sy...
1,Lord Henry Wotton,The picture of Dorian Gray,1890,m,Lord Henry Wotton is a central character in Os...,lord henry wotton central character oscar wild...,"[0.9790019392967224, -0.5674798488616943, 1.04...","[0.9790019392967224, -0.5674798488616943, 1.04...","[(Secondary impulse control syndrome, 0.458828..."
2,Basil Hallward,The picture of Dorian Gray,1890,m,Basil Hallward is a significant character in O...,basil hallward significant character oscar wil...,"[0.6276613473892212, -0.14477911591529846, 0.6...","[0.6276613473892212, -0.14477911591529846, 0.6...","[(Personality disorder, 0.4932267215779592), (..."
3,Sibyl Vane,The picture of Dorian Gray,1890,f,Sibyl Vane is a pivotal character in Oscar Wil...,sibyl vane pivotal character oscar wilde novel...,"[0.4086354076862335, -0.3449292778968811, -0.3...","[0.4086354076862335, -0.3449292778968811, -0.3...","[(Separation anxiety disorder, 0.4440922725660..."
4,James Vane,The picture of Dorian Gray,1890,m,James Vane is a character in Oscar Wilde's nov...,james vane character oscar wilde novel the pic...,"[0.6833420991897583, 0.16091173887252808, 0.79...","[0.6833420991897583, 0.16091173887252808, 0.79...","[(Separation anxiety disorder, 0.4913517119684..."


In [13]:
# Inspect the ranked (disorder, score) pairs stored for the character at index label 0.
df_dost['similarity_scores'][0]

[('Secondary obsessive-compulsive or related syndrome', 0.5271600225308122),
 ('Kleptomania', 0.5005715213903825),
 ('Secondary impulse control syndrome', 0.49317024242540636),
 ('Secondary personality change', 0.4842144130164494),
 ('Disorders due to addictive behaviours', 0.48222430416700784)]

In [14]:
# Drop the raw text embeddings now that the parsed 'numpy_array' column exists.
# errors="ignore" keeps the cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
df_dost = df_dost.drop(columns=["Embeddings"], errors="ignore")

In [15]:
# Number of character rows in the frame.
len(df_dost)

43

In [None]:
# saving the dataframe as a CSV file
# The save is currently disabled: the statements below sit inside a string literal
# and are not executed.
# NOTE(review): the target file name mentions eta_hoffmann while the file loaded
# above is oscar_wilde_embedded.csv — confirm the intended name before re-enabling.
"""
file_path = '/content/drive/MyDrive/softpr_sose23/eta_hoffmann_disorders.csv'
df_dost.to_csv(file_path, index=False)
"""

In [None]:
# Name of the character at index label 0.
# NOTE(review): the recorded output of this cell ('Bernard') does not match row 0 of the
# Oscar Wilde preview above ('Dorian Gray') — looks like stale output from a different
# dataset; re-run the notebook top to bottom to refresh it.
df_dost["name"][0]

'Bernard'

# 3D projection - character and disorders


In [None]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA

# performing dimensionality reduction using PCA
# A fixed random_state keeps the 3D projection reproducible between runs:
# scikit-learn may select a randomized SVD solver for large inputs
# (assumption: the embeddings have more than 500 dimensions — TODO confirm).
pca = PCA(n_components=3, random_state=42)
disease_embeddings_3d = pca.fit_transform(icd_embeddings)

# creating a DataFrame for disease embeddings plot
disease_data_df = pd.DataFrame(disease_embeddings_3d, columns=['PC1', 'PC2', 'PC3'])
disease_data_df['Type'] = 'Disease'
disease_data_df['Name'] = disease_names

# projecting the first character with the PCA fitted on the disorders, so
# both kinds of points share the same axes
char_embedding_2d = df_dost['numpy_array'][0].reshape(1, -1)
char_embedding_3d = pca.transform(char_embedding_2d)
# creating a DataFrame for character embedding plot
character_data_df = pd.DataFrame(char_embedding_3d, columns=['PC1', 'PC2', 'PC3'])
character_data_df['Type'] = 'Character'
character_data_df['Name'] = df_dost["name"][0]

# concatenating the two DataFrames
combined_data_df = pd.concat([disease_data_df, character_data_df], ignore_index=True)

# creating an interactive 3D scatter plot; colour and symbol both encode the point type
fig = px.scatter_3d(combined_data_df, x='PC1', y='PC2', z='PC3', text='Name', color='Type', symbol='Type', width=800, height=800)
fig.show()

In [None]:
# Export the current figure as an HTML file in the working directory.
fig.write_html("plotly_author.html")

# 3D projection of all the characters with a drop-down menu

In [None]:
!pip3 install tueplots==0.0.5
!pip3 install transformers
!pip install torch --upgrade
!pip3 install sentence-transformers==2.2.2

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model by name.
# NOTE(review): `name` is also used as a loop variable in other cells of this
# notebook; a more specific variable name would avoid accidental reuse.
name = "stsb-bert-large"
model = SentenceTransformer(name)

# getting the description input and convert it into an embedding
# input() blocks until text is typed, so this cell cannot run unattended.
description = input()
# encode() receives a one-element list, i.e. a batch containing only this description.
description_embedding = model.encode([description])

In [None]:
!pip install dash

Collecting dash
  Downloading dash-2.13.0-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting Werkzeug<2.3.0 (from dash)
  Downloading Werkzeug-2.2.3-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Collecting ansi2html (from dash)
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, W

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
from sklearn.decomposition import PCA

# creating a Dash app
app = dash.Dash(__name__)

# defining the layout of the web page:
# an (empty) heading, a dropdown to pick a character, and the 3D graph
app.layout = html.Div([
    html.H1(''),
    dcc.Dropdown(
        id='character-dropdown',
        # each option shows the character name; its value is the row position in df_dost
        options=[{'label': name, 'value': i} for i, name in enumerate(df_dost["name"])],
        # start with the first character selected
        value=0
    ),
    # the figure of this graph is filled in by the update_plot callback
    dcc.Graph(id='scatter-plot')
])

@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('character-dropdown', 'value')]
)
def update_plot(selected_character_index):
    """Redraw the 3D scatter plot for the character chosen in the dropdown.

    Parameters
    ----------
    selected_character_index : int or None
        Value of the selected dropdown option (the character's row index in
        ``df_dost``), or ``None`` when the dropdown has been cleared.

    Returns
    -------
    plotly.graph_objs.Figure
        All disorders (blue) plus the selected character (red), projected
        with the notebook-level ``pca``. When nothing is selected,
        ``dash.no_update`` is returned so the current figure stays as it is.
    """
    # A cleared dropdown sends None; looking that up in df_dost would fail.
    if selected_character_index is None:
        return dash.no_update

    # performing PCA for the selected character embedding
    char_embedding_2d = df_dost['numpy_array'][selected_character_index].reshape(1, -1)
    char_embedding_3d = pca.transform(char_embedding_2d)

    # creating a DataFrame for character embedding plot
    character_data_df = pd.DataFrame(char_embedding_3d, columns=['PC1', 'PC2', 'PC3'])
    character_data_df['Type'] = 'Character'
    character_data_df['Name'] = df_dost["name"][selected_character_index]

    # building an interactive 3D scatter plot with one trace per point type
    # (instead of one trace per point), which keeps the figure small
    fig = go.Figure()
    for points, point_type, colour in (
        (disease_data_df, 'Disease', 'blue'),
        (character_data_df, 'Character', 'red'),
    ):
        fig.add_trace(
            go.Scatter3d(
                x=points['PC1'],
                y=points['PC2'],
                z=points['PC3'],
                mode='markers+text',
                text=points['Name'],
                textposition='top center',
                marker=dict(size=5, color=colour),
                name=point_type
            )
        )

    fig.update_layout(
        title='',
        scene=dict(
            xaxis_title='PC1',
            yaxis_title='PC2',
            zaxis_title='PC3'
        ),
        showlegend=False
    )

    return fig

if __name__ == '__main__':
    # `run_server` is the legacy entry point; current Dash releases expose `run`.
    # Prefer `run` when it exists and only fall back to `run_server` on older
    # versions. The `or` short-circuits, so `run_server` is not even looked up
    # when `run` is available.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)


# Alternative to Dash

In [16]:
# Switch the working directory to the plots folder; files written with relative paths land here.
%cd '/content/drive/MyDrive/softpr_sose23/Datasets/characters_plots'

/content/drive/MyDrive/softpr_sose23/Datasets/characters_plots


In [None]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import json
import plotly

# performing dimensionality reduction using PCA
# A fixed random_state keeps the 3D projection reproducible between runs:
# scikit-learn may select a randomized SVD solver for large inputs
# (assumption: the embeddings have more than 500 dimensions — TODO confirm).
pca = PCA(n_components=3, random_state=42)
disease_embeddings_3d = pca.fit_transform(icd_embeddings)

# creating a DataFrame for disease embeddings plot
disease_data_df = pd.DataFrame(disease_embeddings_3d, columns=['PC1', 'PC2', 'PC3'])
disease_data_df['Type'] = 'Disease'
disease_data_df['Name'] = disease_names

# projecting all character embeddings in one call with the PCA fitted on the
# disorders, so both kinds of points share the same axes
character_embeddings = np.array(df_dost['numpy_array'].tolist())
character_embeddings_3d = pca.transform(character_embeddings)

# creating a DataFrame for character data (same column order as the disease frame)
character_data_df = pd.DataFrame(character_embeddings_3d, columns=['PC1', 'PC2', 'PC3'])
character_data_df['Type'] = 'Character'
# a plain list avoids any index alignment with df_dost
character_data_df['Name'] = df_dost['name'].tolist()

# combining character and disease data
combined_data_df = pd.concat([disease_data_df, character_data_df], ignore_index=True)

# hover shows only the name and the type, not the raw coordinates
hover_data = {
    'PC1': False,
    'PC2': False,
    'PC3': False,
    'Name': True,
    'Type': True
}

# creating an interactive 3D scatter plot
fig = px.scatter_3d(combined_data_df, x='PC1', y='PC2', z='PC3', color='Type', symbol='Type', hover_data=hover_data, width=1000, height=800)


"""
# adding a transparent sphere around the character embedding point
sphere_radius_pixels = 20
sphere_trace = go.Scatter3d(
    x=[0], y=[0], z=[0],
    mode='markers',
    marker=dict(size=sphere_radius_pixels, color='rgba(0, 128, 255, 0.7)', opacity=0.5),
    name='Character Sphere'
)
fig.add_trace(sphere_trace)
"""

# Load the plotly.js release that matches the installed plotly.py. The
# "plotly-latest" CDN alias is no longer updated and serves an old release,
# which can render figure JSON from newer plotly.py versions incorrectly.
from plotly.offline import get_plotlyjs_version
plotlyjs_cdn_url = f"https://cdn.plot.ly/plotly-{get_plotlyjs_version()}.min.js"

# HTML piece
html_template = f"""
<!DOCTYPE html>
<html>
<head>
    <script src="{plotlyjs_cdn_url}"></script>
    <script src="https://cdn.plot.ly/plotly-locale-en-latest.min.js"></script>
</head>
<body>

<div id="scatter-plot"></div>

<script>
    var characterData = {character_data_df.to_json(orient='records')};

    var data = {json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)};
    var plotDiv = document.getElementById('scatter-plot');
    Plotly.newPlot(plotDiv, data);
</script>

</body>
</html>
"""

# saving the HTML file; the explicit encoding makes the output independent
# of the platform's default
with open('oscar_plot.html', 'w', encoding='utf-8') as f:
    f.write(html_template)

fig.show()
