# **Clustering over Medical Linked Data**


In [84]:
# Environment setup: installs matplotlib, the GEOS system libraries, Basemap and pyproj.
# NOTE(review): none of the cells visible in this notebook use Basemap — confirm
# these installs are still required before keeping them.
!pip install matplotlib

# NOTE(review): in the recorded output apt could not locate `libgeos-3.5.0`
# ("E: Unable to locate package"), while `libgeos-dev` was already present
# at version 3.6.2 — the first command is likely obsolete on this image.
!apt-get install libgeos-3.5.0
!apt-get install libgeos-dev
# Basemap is built from the archive of the `master` branch, i.e. it is not pinned
# to a release; the recorded output shows a 1.2.2+dev wheel being built.
!pip install https://github.com/matplotlib/basemap/archive/master.zip

# pyproj is pinned to 1.9.6.
# NOTE(review): presumably for compatibility with Basemap — confirm.
!pip install pyproj==1.9.6

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package libgeos-3.5.0
E: Couldn't find any package by glob 'libgeos-3.5.0'
E: Couldn't find any package by regex 'libgeos-3.5.0'
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libgeos-dev is already the newest version (3.6.2-1build2).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Collecting https://github.com/matplotlib/basemap/archive/master.zip
  Using cached https://github.com/matplotlib/basemap/archive/master.zip
Building wheels for collected packages: basemap
  Building wheel for basemap (setup.py) ... [?25l[?25hdone
  Created wheel for basemap: filename=basemap-1.2.2+dev-cp37-cp37m-linux_x86_64.whl size=121759581 sha256=e468e44eb540ca07272e0c4a420273deb85ed3f5202def394433be439a338d83
  Stored in directory: /tmp/pip-ephem-wheel-cache-6mxxlup0/wheels/98/4a/fc/ce719b75d97e646645c225f3332b1b21753610031

In [85]:
import requests, pprint

In [86]:
!pip install rdflib

from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDFS, FOAF



In [87]:
import pandas as pd, urllib
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans,AgglomerativeClustering,AffinityPropagation
from sklearn.mixture import GaussianMixture

## Clustering number of deaths and Omim for Oncology diseases 

OMIM - Online Mendelian Inheritance in Man    
Phenotype MIM number = 603956 | CERVICAL CANCER   
Omim.org:    
https://www.omim.org/entry/603956?search=603956&highlight=603956

In [88]:
# Load the `parse` submodule explicitly. A bare `import urllib` does not
# import `urllib.parse`; the attribute access below would otherwise only work
# when another library has already imported the submodule as a side effect.
import urllib.parse

# SPARQL query: for every resource whose dbp:field is dbr:Oncology, select its
# name, number of deaths and OMIM value, keeping only rows whose death count is
# typed as xsd:integer. The dbp:/dbr:/xsd: prefixes are not declared here —
# presumably they are predefined by the endpoint.
query = """
select distinct ?Thing ?Name ?Deaths   ?Omim
where {  ?Thing  dbp:field dbr:Oncology ;
                            dbp:name ?Name ;
                            dbp:deaths ?Deaths;
                            dbp:omim ?Omim .
 filter(datatype(?Deaths ) = xsd:integer)
}


"""
endpoint = "http://dbpedia.org/sparql"

# We need to encode the query string for the HTTP request.
# `format` asks the endpoint to serialise the result set as CSV.
param = urllib.parse.urlencode({'default-graph-uri': 'http://dbpedia.org',
                                'query': query,
                                'format': 'text/csv'})

We read the data from the SPARQL endpoint, using it as a REST service. The result is returned as CSV, as requested through the `format` parameter of the query string, and is loaded directly using pandas.

In [89]:
# Build the full request URL and let pandas download and parse the CSV response.
request_url = f"{endpoint}?{param}"
data = pd.read_csv(request_url)
print(data)

                                                Thing  ...    Omim
0             http://dbpedia.org/resource/Lung_cancer  ...  211980
1          http://dbpedia.org/resource/Bladder_cancer  ...  109800
2           http://dbpedia.org/resource/Breast_cancer  ...  114480
3         http://dbpedia.org/resource/Cervical_cancer  ...  603956
4            http://dbpedia.org/resource/Liver_cancer  ...  114550
5        http://dbpedia.org/resource/Kaposi's_sarcoma  ...  148000
6       http://dbpedia.org/resource/Pancreatic_cancer  ...  260350
7         http://dbpedia.org/resource/Prostate_cancer  ...  176807
8       http://dbpedia.org/resource/Testicular_cancer  ...  273300
9          http://dbpedia.org/resource/Stomach_cancer  ...  137215
10  http://dbpedia.org/resource/Acute_myeloid_leuk...  ...  602439
11      http://dbpedia.org/resource/Colorectal_cancer  ...  114500
12          http://dbpedia.org/resource/Neuroblastoma  ...  256700
13     http://dbpedia.org/resource/Endometrial_cancer  ...  60

We create a method for [K-Means clustering](https://en.wikipedia.org/wiki/K-means_clustering), to simply wrap the process in a single method call.

In [90]:
def doKmeans(X, nclust=2, random_state=None):
    """Cluster the rows of ``X`` with K-Means.

    Parameters
    ----------
    X : array-like or DataFrame
        Samples to cluster; passed unchanged to ``KMeans.fit`` and
        ``KMeans.predict``.
    nclust : int, default 2
        Number of clusters to form.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KMeans``. Pass an integer to make the centroid
        initialisation, and therefore the labels, reproducible between
        runs. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    tuple
        ``(clust_labels, cent)``: the predicted cluster label of every row
        of ``X`` and the fitted cluster centres.
    """
    model = KMeans(n_clusters=nclust, random_state=random_state)
    model.fit(X)
    clust_labels = model.predict(X)
    cent = model.cluster_centers_
    return (clust_labels, cent)

In [91]:
# Keep only the two numeric columns that are fed to the clustering models.
# NOTE(review): 'Omim' holds OMIM catalogue numbers (identifiers) and neither
# column is scaled — confirm that clustering on these raw values is intended.
feature_columns = ['Deaths', 'Omim']
datashort = data[feature_columns]
print(datashort)

     Deaths    Omim
0   1700000  211980
1    200000  109800
2    627000  114480
3    311000  603956
4    782000  114550
5     20000  148000
6    411600  260350
7    359000  176807
8      9400  273300
9    783000  137215
10   147100  602439
11   551000  114500
12       15  256700
13    89900  608089
14   509000  133239
15    32400  156240


We perform the clustering, using 4 clusters.

In [92]:
# Cluster on the two feature columns only, so that re-running this cell never
# feeds a previously added label column back into the model.
features = datashort[['Deaths', 'Omim']]
clust_labels, cent = doKmeans(features, 4)
kmeans = pd.DataFrame(clust_labels)

# We add the clustering labels to our DataFrame as the last column.
# `assign` returns a new frame and overwrites an existing 'kmeans' column,
# whereas `insert` raises ValueError when the cell is executed a second time.
datashort = features.assign(kmeans=clust_labels)
print(datashort)

     Deaths    Omim  kmeans
0   1700000  211980       1
1    200000  109800       3
2    627000  114480       0
3    311000  603956       2
4    782000  114550       0
5     20000  148000       3
6    411600  260350       0
7    359000  176807       0
8      9400  273300       3
9    783000  137215       0
10   147100  602439       2
11   551000  114500       0
12       15  256700       3
13    89900  608089       2
14   509000  133239       0
15    32400  156240       3


In [93]:
import plotly.graph_objects as go
import numpy as np

# Marker appearance: every point is coloured by its K-Means cluster label.
marker_style = dict(
    size=25,
    color=kmeans[0],      # cluster label drives the colour
    colorscale='BrBG',    # one of plotly's built-in colour scales
    showscale=True,
)

# One marker per disease: deaths on the x axis, OMIM number on the y axis.
scatter = go.Scatter(
    x=datashort['Deaths'],
    y=datashort['Omim'],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=scatter)
fig.update_xaxes(title_text='Deaths')
fig.update_yaxes(title_text='Omim')

fig.show()

Next, we create a method for [Agglomerative clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering).

In [94]:
def doAgglomerative(X, nclust=2):
    """Cluster the rows of ``X`` with Ward-linkage agglomerative clustering.

    Parameters
    ----------
    X : array-like or DataFrame
        Samples to cluster; passed unchanged to ``fit_predict``.
    nclust : int, default 2
        Number of clusters to find.

    Returns
    -------
    The cluster label assigned to every row of ``X``.
    """
    # Ward linkage works on Euclidean distances, which is also the estimator's
    # default, so the distance argument is left unset. Passing `affinity=`
    # explicitly fails on recent scikit-learn releases, where that keyword was
    # removed in favour of `metric=`.
    model = AgglomerativeClustering(n_clusters=nclust, linkage='ward')
    clust_labels1 = model.fit_predict(X)
    return (clust_labels1)

In [95]:
# Rebuild the two-column feature frame from the raw data, so this section
# starts without the label column added to the previous `datashort`.
feature_columns = ['Deaths', 'Omim']
datashort = data[feature_columns]
print(datashort)

     Deaths    Omim
0   1700000  211980
1    200000  109800
2    627000  114480
3    311000  603956
4    782000  114550
5     20000  148000
6    411600  260350
7    359000  176807
8      9400  273300
9    783000  137215
10   147100  602439
11   551000  114500
12       15  256700
13    89900  608089
14   509000  133239
15    32400  156240


We perform the clustering, using 4 clusters.

In [96]:
# Cluster on the two feature columns only, so that re-running this cell never
# feeds a previously added label column back into the model.
features = datashort[['Deaths', 'Omim']]
clust_labels1 = doAgglomerative(features, 4)
agglomerative = pd.DataFrame(clust_labels1)

# We add the clustering labels to our DataFrame as the last column.
# `assign` returns a new frame and overwrites an existing 'agglomerative'
# column, whereas `insert` raises ValueError when the cell is executed twice.
datashort = features.assign(agglomerative=clust_labels1)
print(datashort)

     Deaths    Omim  agglomerative
0   1700000  211980              2
1    200000  109800              0
2    627000  114480              1
3    311000  603956              3
4    782000  114550              1
5     20000  148000              0
6    411600  260350              0
7    359000  176807              0
8      9400  273300              0
9    783000  137215              1
10   147100  602439              3
11   551000  114500              1
12       15  256700              0
13    89900  608089              3
14   509000  133239              1
15    32400  156240              0


Now we plot the diseases on a scatter plot, adding color to represent the cluster each disease belongs to.

In [97]:
import plotly.graph_objects as go
import numpy as np

# Marker appearance: every point is coloured by its agglomerative cluster label.
marker_style = dict(
    size=25,
    color=agglomerative[0],    # cluster label drives the colour
    colorscale='Bluered_r',    # one of plotly's built-in colour scales
    showscale=True,
)

# One marker per disease: deaths on the x axis, OMIM number on the y axis.
scatter = go.Scatter(
    x=datashort['Deaths'],
    y=datashort['Omim'],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=scatter)
fig.update_xaxes(title_text='Deaths')
fig.update_yaxes(title_text='Omim')

fig.show()

We create a method for [Affinity propagation](https://en.wikipedia.org/wiki/Affinity_propagation).

In [98]:
def doAffinity(X):
    """Cluster the rows of ``X`` with affinity propagation.

    The estimator is configured with a damping factor of 0.9, at most 250
    iterations and Euclidean affinity. The number of clusters is not an
    input of this function.

    Parameters
    ----------
    X : array-like or DataFrame
        Samples to cluster; passed unchanged to ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(labels, centers)``: the predicted cluster label of every row of
        ``X`` and the fitted ``cluster_centers_``.
    """
    propagation = AffinityPropagation(affinity='euclidean',
                                      max_iter=250,
                                      damping=0.9)
    propagation.fit(X)
    labels = propagation.predict(X)
    centers = propagation.cluster_centers_
    return labels, centers

In [99]:
# Rebuild the two-column feature frame from the raw data, so this section
# starts without the label column added to the previous `datashort`.
feature_columns = ['Deaths', 'Omim']
datashort = data[feature_columns]
print(datashort)

     Deaths    Omim
0   1700000  211980
1    200000  109800
2    627000  114480
3    311000  603956
4    782000  114550
5     20000  148000
6    411600  260350
7    359000  176807
8      9400  273300
9    783000  137215
10   147100  602439
11   551000  114500
12       15  256700
13    89900  608089
14   509000  133239
15    32400  156240


We perform the clustering.

In [100]:
# Cluster on the two feature columns only, so that re-running this cell never
# feeds a previously added label column back into the model.
features = datashort[['Deaths', 'Omim']]
clust_labels2, cent2 = doAffinity(features)
affinity = pd.DataFrame(clust_labels2)

# We add the clustering labels to our DataFrame as the last column.
# `assign` returns a new frame and overwrites an existing 'affinity' column,
# whereas `insert` raises ValueError when the cell is executed a second time.
datashort = features.assign(affinity=clust_labels2)
print(datashort)

     Deaths    Omim  affinity
0   1700000  211980         0
1    200000  109800         0
2    627000  114480         0
3    311000  603956         0
4    782000  114550         0
5     20000  148000         0
6    411600  260350         0
7    359000  176807         0
8      9400  273300         0
9    783000  137215         0
10   147100  602439         0
11   551000  114500         0
12       15  256700         0
13    89900  608089         0
14   509000  133239         0
15    32400  156240         0


Now we plot the diseases on a scatter plot, adding color to represent the cluster each disease belongs to.

In [101]:
import plotly.graph_objects as go
import numpy as np

# Initial marker appearance: coloured by affinity-propagation label. No
# colorscale is given, so plotly's default scale is used.
marker_style = dict(
    size=50,
    color=affinity[0],    # cluster label drives the colour
    showscale=True,
)

# One marker per disease: deaths on the x axis, OMIM number on the y axis.
scatter = go.Scatter(
    x=datashort['Deaths'],
    y=datashort['Omim'],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=scatter)

# Restyle every 'markers' trace: this replaces the size of 50 set above with
# 12 and adds a dark outline around each point.
outline = dict(width=2, color='DarkSlateGrey')
fig.update_traces(marker=dict(size=12, line=outline),
                  selector=dict(mode='markers'))

fig.update_xaxes(title_text='Deaths')
fig.update_yaxes(title_text='Omim')

fig.show()

We create a method for [Gaussian mixtures](https://en.wikipedia.org/wiki/Mixture_model).

In [102]:
def doGMM(X, nclust=2, random_state=None):
    """Cluster the rows of ``X`` with a Gaussian mixture model.

    Parameters
    ----------
    X : array-like or DataFrame
        Samples to cluster; passed unchanged to ``fit`` and ``predict``.
    nclust : int, default 2
        Number of mixture components.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``GaussianMixture``. Pass an integer to make the
        k-means based initialisation, and therefore the labels,
        reproducible between runs. ``None`` keeps the previous, unseeded
        behaviour.

    Returns
    -------
    The component label predicted for every row of ``X``.
    """
    model = GaussianMixture(n_components=nclust, init_params='kmeans',
                            random_state=random_state)
    model.fit(X)
    clust_labels3 = model.predict(X)
    return (clust_labels3)

In [103]:
# Rebuild the two-column feature frame from the raw data, so this section
# starts without the label column added to the previous `datashort`.
feature_columns = ['Deaths', 'Omim']
datashort = data[feature_columns]
print(datashort)

     Deaths    Omim
0   1700000  211980
1    200000  109800
2    627000  114480
3    311000  603956
4    782000  114550
5     20000  148000
6    411600  260350
7    359000  176807
8      9400  273300
9    783000  137215
10   147100  602439
11   551000  114500
12       15  256700
13    89900  608089
14   509000  133239
15    32400  156240


We perform the clustering.

In [104]:
# Cluster on the two feature columns only, so that re-running this cell never
# feeds a previously added label column back into the model.
features = datashort[['Deaths', 'Omim']]
clust_labels3 = doGMM(features, 4)
gmm = pd.DataFrame(clust_labels3)

# We add the clustering labels to our DataFrame as the last column.
# `assign` returns a new frame and overwrites an existing 'gmm' column,
# whereas `insert` raises ValueError when the cell is executed a second time.
datashort = features.assign(gmm=clust_labels3)
print(datashort)

     Deaths    Omim  gmm
0   1700000  211980    2
1    200000  109800    3
2    627000  114480    0
3    311000  603956    1
4    782000  114550    0
5     20000  148000    1
6    411600  260350    3
7    359000  176807    3
8      9400  273300    1
9    783000  137215    0
10   147100  602439    1
11   551000  114500    0
12       15  256700    1
13    89900  608089    1
14   509000  133239    0
15    32400  156240    1


In [105]:
import plotly.graph_objects as go
import numpy as np

# Initial marker appearance: coloured by Gaussian-mixture component label.
marker_style = dict(
    size=50,
    color=gmm[0],          # component label drives the colour
    colorscale='picnic',   # one of plotly's built-in colour scales
    showscale=True,
)

# One marker per disease: deaths on the x axis, OMIM number on the y axis.
scatter = go.Scatter(
    x=datashort['Deaths'],
    y=datashort['Omim'],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=scatter)

# Restyle every 'markers' trace: this replaces the size of 50 set above with
# 12 and adds a dark outline around each point.
outline = dict(width=2, color='DarkSlateGrey')
fig.update_traces(marker=dict(size=12, line=outline),
                  selector=dict(mode='markers'))
fig.update_xaxes(title_text='Deaths')
fig.update_yaxes(title_text='Omim')

fig.show()