# Import libraries

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from pymongo import MongoClient
import calendar

# setup database

In [2]:
# Create a client for the MongoDB server on localhost and keep module-level
# handles on the `promed` database and its `posts` collection.
client = MongoClient(host='localhost', port=27017)
db = client.promed
posts = db['posts']

# Functions

In [3]:
def get_disease_list():
  """Return the distinct `subject.diseaseLabels` values of `posts`, sorted ascending."""
  labels = list(posts.distinct('subject.diseaseLabels'))
  labels.sort()
  return labels

In [9]:
def get_articles(disease, limit=100, time_scale=10000000.0):
  """Fetch geolocated articles for one disease and prepare them for clustering.

  Queries the `posts` collection for documents whose `zoomLat` and `zoomLon`
  are not null and whose `subject.diseaseLabels` contains `disease`. Every
  returned document is then modified in place:

  * `sourceDate` is replaced by a scaled UTC epoch timestamp
    (seconds / `time_scale`), computed from `sourceDate` or, when that field
    is absent or null, from `promedDate`.
  * `subject` is replaced by the first entry of `subject.diseaseLabels`.

  Args:
    disease: Disease label the articles must carry.
    limit: Value passed to the cursor's `limit()`.
    time_scale: Divisor applied to the epoch seconds so the timestamp's
      magnitude is closer to that of the latitude/longitude values.

  Returns:
    A list of the modified article dicts.

  Raises:
    Exception: Any error raised while converting an article is re-raised
      after the offending article and the error have been printed.
  """
  query = {
    'zoomLat': {'$ne': None},
    'zoomLon': {'$ne': None},
    # A dict literal keeps only the last value of a repeated key, so this
    # field must be constrained exactly once. `$in` already rules out
    # documents with an empty label array.
    'subject.diseaseLabels': {'$in': [disease]},
  }
  projection = {
    'subject.diseaseLabels': 1,
    'zoomLat': 1,
    'zoomLon': 1,
    'sourceDate': 1,
    'promedDate': 1,
  }
  articles = list(posts.find(query, projection).limit(limit))
  for article in articles:
    try:
      # Not all articles have a sourceDate; `.get` covers both an absent
      # field and a null value before falling back to promedDate.
      date = article.get('sourceDate') or article.get('promedDate')
      # Convert the date object to a number so DBSCAN can handle it.
      article['sourceDate'] = calendar.timegm(date.timetuple()) / time_scale
      # Collapse the disease labels array to a single disease name.
      article['subject'] = article['subject']['diseaseLabels'][0]
    except Exception as e:
      print("Problem parsing article:", article)
      print(e)
      raise

  return articles

# Questions:
How do I determine the optimal `eps` value here?

How should I take the third dimension (timestamp) into account when determining the `eps` value?  When it was only lat/long, the values were in similar ranges (-90 -> 90 and -180 -> 180), but the timestamp values are much larger (ex: `9433152000`), which makes me think that you will never get two timestamps in the same neighborhood if you have an eps value of, say, 2.  If I divide the timestamp value by 10,000,000 it falls into a range similar to lat/long - is this something I should consider doing?

Is it possible to specify dimensions for a spheroid to define the `eps`?  It seems like there are a lot of situations where you wouldn't want a perfectly symmetrical shape defining the neighborhood.

How would I go about visualizing 3d data?  4d?


In [7]:
def cluster_data(df, eps=.1, min_samples=1, plot=False):
    """Cluster articles with DBSCAN on longitude, latitude and scaled date.

    Prints the number of clusters found followed by a separator line.

    Args:
        df: DataFrame providing `zoomLon`, `zoomLat` and `sourceDate` columns.
        eps: DBSCAN `eps` value; the same radius applies to all three columns.
        min_samples: DBSCAN `min_samples` value.
        plot: When true, draw a longitude/latitude scatter plot with one
            colour per cluster.

    Returns:
        The array of cluster labels, one per row of `df` (`-1` marks noise).
        It is empty when `df` has no rows.
    """
    feature_columns = ['zoomLon', 'zoomLat', 'sourceDate']

    def plot_results(points, labels, cluster_count):
        unique_labels = set(labels)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        for k, col in zip(unique_labels, colors):
            # Black used for noise.
            face = 'k' if k == -1 else tuple(col)
            # Select only the rows that belong to this label; the mask must be
            # built from the per-row label array, not from the set of labels.
            members = points[labels == k]
            plt.plot(members[:, 0], members[:, 1], 'o', markerfacecolor=face,
                     markeredgecolor='k', markersize=14)

        plt.title('Estimated number of clusters: %d' % cluster_count)
        plt.show()

    if len(df) == 0:
        # DBSCAN cannot be fitted on zero samples, so report an empty result.
        cluster_labels = np.array([], dtype=int)
        num_clusters = 0
    else:
        # `.values` is used because DataFrame.as_matrix no longer exists in
        # current pandas releases.
        coordinates = df[feature_columns].values
        dbsc = DBSCAN(eps=eps, min_samples=min_samples,
                      algorithm='ball_tree').fit(coordinates)
        cluster_labels = dbsc.labels_
        # The noise label does not count as a cluster.
        num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
        if plot:
            plot_results(coordinates, cluster_labels, num_clusters)

    print('Number of clusters: {}'.format(num_clusters))
    print('***********')
    return cluster_labels


# Process articles

In [10]:
diseaseList = get_disease_list()
# For each disease, fetch its articles and cluster them.
for disease in diseaseList:
    articleList = get_articles(disease)
    print("{0} articles for {1}".format(len(articleList), disease))
    if not articleList:
        # A disease can have no articles with coordinates; an empty frame has
        # none of the columns clustering needs, so skip it.
        continue
    df = pd.DataFrame(articleList)
    cluster_data(df)

1 articles for African Swine Fever
Number of clusters: 1
***********
1 articles for Algae
Number of clusters: 1
***********
11 articles for Anthrax
Number of clusters: 8
***********
7 articles for Antibiotic resistance
Number of clusters: 6
***********
1 articles for Aspergillosis
Number of clusters: 1
***********
3 articles for Avian Influenza
Number of clusters: 2
***********
38 articles for BSE
Number of clusters: 21
***********
2 articles for Bluetongue
Number of clusters: 2
***********
8 articles for Botulism
Number of clusters: 4
***********
12 articles for Brucellosis
Number of clusters: 8
***********
27 articles for CJD
Number of clusters: 10
***********
7 articles for Campylobacter
Number of clusters: 5
***********
2 articles for Canine distemper
Number of clusters: 1
***********
5 articles for Cat Scratch Fever
Number of clusters: 5
***********
47 articles for Cholera
Number of clusters: 39
***********
2 articles for Chronic Wasting Disease
Number of clusters: 2
***********
3

# Questions:
What would be the next steps?  How do we determine which cluster future articles fall into?  Do I just re-run this with all the data?