# Import libraries

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from pymongo import MongoClient
import calendar

# setup database

In [2]:
# Connect to the MongoDB server on this machine and keep handles to the
# "promed" database and its "posts" collection for the cells below.
client = MongoClient(host='localhost', port=27017)
db = client['promed']
posts = db['posts']

# Functions

In [37]:
# Gets a unique list of diseases
def get_disease_list():
    """Return the disease labels that the notebook should process.

    The list is currently pinned to a single disease.
    NOTE(review): the full run presumably uses
    ``sorted(posts.distinct('subject.diseaseLabels'))`` instead -- confirm
    before widening the list.

    Returns
    -------
    list of str
        The disease labels to cluster.
    """
    diseases = ['Anthrax']
    return diseases

In [25]:
# Get all the articles that have the current disease in its "diseaseLabels" array
def get_articles(disease, limit=100):
    """Fetch geocoded posts tagged with ``disease`` and flatten them for clustering.

    Parameters
    ----------
    disease : str
        Label that must appear in a post's ``subject.diseaseLabels`` array.
    limit : int, optional
        Maximum number of posts read from the ``posts`` collection.
        Defaults to 100, the value that used to be hard-coded.

    Returns
    -------
    list of dict
        One dict per post. ``zoomLat`` / ``zoomLon`` are converted to float,
        ``sourceDate`` is overwritten with a scaled epoch timestamp and
        ``subject`` is overwritten with the post's first disease label.

    Raises
    ------
    Exception
        Any error raised while converting a post is re-raised after the
        offending post and the error have been printed.
    """
    # Epoch seconds are divided by this so the time axis has a magnitude
    # comparable to latitude/longitude when the three are clustered together.
    timestamp_scale = 10000000.0

    query = {
        'zoomLat': {'$ne': None},
        'zoomLon': {'$ne': None},
        # Only one condition per key survives in a dict literal.  The earlier
        # version also listed a "$not $size 0" clause under this same key,
        # which Python silently discarded; "$in" is the filter that was
        # actually sent, and it cannot match an empty array anyway.
        'subject.diseaseLabels': {'$in': [disease]},
    }
    projection = {
        'subject.diseaseLabels': 1,
        'zoomLat': 1,
        'zoomLon': 1,
        'sourceDate': 1,
        'promedDate': 1,
    }
    articles = list(posts.find(query, projection).limit(limit))

    for article in articles:
        try:
            # not all articles have a sourceDate so fall back to promedDate if missing.
            date = article.get('sourceDate') or article.get('promedDate')
            # convert date object to a (scaled) timestamp so DBSCAN can handle it
            article['sourceDate'] = calendar.timegm(date.timetuple()) / timestamp_scale
            # convert disease labels array to single disease name
            # NOTE(review): this keeps the FIRST label, which is not
            # necessarily ``disease`` when a post carries several labels.
            article['subject'] = article['subject']['diseaseLabels'][0]
            article['zoomLat'] = float(article['zoomLat'])
            article['zoomLon'] = float(article['zoomLon'])
        except Exception as e:
            print("Problem parsing article:", article)
            print(e)
            raise

    return articles

# Single step clustering on timestamp/lat/long

### Questions:
How do I determine the optimal `eps` value here?

How should I take into account the third dimension (timestamp) when determining the `eps` value?  When it was only lat/long the values were in similar ranges (-90 -> 90 and -180 -> 180), but the timestamp values are much larger (ex: `9433152000`), which makes me think that you will never get two timestamps in the same neighborhood if you have an eps value of, say, 2.  If I begin dividing the timestamp value by 10,000,000 it goes into a similar range for lat/long - is this something I should consider doing?

Is it possible to specify dimensions for a spheroid to define the `eps`?  It seems like there are a lot of situations where you wouldn't want a perfectly symmetrical shape defining the neighborhood.

How would I go about visualizing 3d data?  4d?


In [158]:
def cluster(data, fields, eps=2):
    """Group rows with DBSCAN and return the member rows of each cluster.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Samples to cluster.  A DataFrame is reduced to the columns named in
        ``fields``; an ndarray is used as-is and ``fields`` is ignored.
    fields : list of str
        Columns of ``data`` to cluster on (DataFrame input only).
    eps : float, optional
        DBSCAN neighbourhood radius, in the units of the selected columns.

    Returns
    -------
    pandas.Series
        One entry per cluster label ``0 .. k-1``; each entry is the ndarray
        of rows assigned to that label.  Rows labelled ``-1`` (noise) are
        not included.
    """
    if not isinstance(data, np.ndarray):
        # DataFrame.as_matrix() no longer exists in pandas >= 1.0; selecting
        # the columns and taking .values yields the same array on both old
        # and new pandas versions.
        data = data[fields].values
    dbsc = DBSCAN(eps=eps, min_samples=1, algorithm='ball_tree').fit(data)
    cluster_labels = dbsc.labels_
    # -1 marks noise and is not a cluster of its own.
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    clusters = pd.Series([data[cluster_labels == n] for n in range(num_clusters)])
    return clusters

In [202]:
def distance_cluster(data, fields, eps_km=1.5):
    """Cluster rows by great-circle distance between their coordinates.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to cluster.
    fields : list of str
        The two coordinate columns, in ``[latitude, longitude]`` order and
        expressed in degrees.
    eps_km : float, optional
        Neighbourhood radius in kilometres (default 1.5, the value that
        used to be hard-coded).

    Returns
    -------
    pandas.Series
        One entry per cluster label ``0 .. k-1``; each entry is the subset
        of ``data`` (original columns, coordinates still in degrees)
        assigned to that label.
    """
    kms_per_radian = 6371.0088
    # eps is an angle in radians because haversine distances are angular.
    eps = eps_km / kms_per_radian
    print("distance data:")
    print(data[fields])
    # The haversine metric works on radians.  Feeding it raw degrees (as was
    # done before) makes the eps radius meaningless, so convert first.
    coords = np.radians(data[fields].values.astype(float))
    dbsc = DBSCAN(eps=eps, min_samples=1, algorithm='ball_tree', metric='haversine').fit(coords)
    cluster_labels = dbsc.labels_
    # -1 marks noise and is not a cluster of its own.
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    clusters = pd.Series([data[cluster_labels == n] for n in range(num_clusters)])
    return clusters

In [49]:
def cluster_single_step(df):
    """Cluster articles on longitude, latitude and timestamp in one DBSCAN pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles with ``zoomLon``, ``zoomLat`` and ``sourceDate`` columns.

    Returns
    -------
    The clusters produced by ``cluster`` (previously they were computed and
    then thrown away, so the function always returned ``None``).
    """
    return cluster(df, ['zoomLon', 'zoomLat', 'sourceDate'])

# Two step temporal/spatial clustering
First cluster on times only.  After that cluster on lat/long great circle distances

In [223]:
def cluster_two_step(df, temporal_eps=.1):
    """Cluster articles by time first, then by location within each time group.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles with ``sourceDate``, ``zoomLat`` and ``zoomLon`` columns.
    temporal_eps : float, optional
        DBSCAN radius for the temporal pass, in units of the scaled
        ``sourceDate``.  With timestamps stored as epoch seconds divided by
        10,000,000, the default 0.1 corresponds to 1,000,000 s, i.e. roughly
        11.6 days.

    Returns
    -------
    list
        One item per temporal cluster: the result of ``distance_cluster``
        for the articles in that temporal cluster.  The same information is
        also printed, as before.
    """
    results = []
    # perform temporal clustering
    temporalClusters = cluster(df, ['sourceDate'], temporal_eps)
    # foreach temporal cluster
    for temporalCluster in temporalClusters:
        timestamps = temporalCluster.flatten()
        # Map the clustered timestamp values back to the rows that carry them.
        subSet = df['sourceDate'].isin(timestamps)
        # Single .loc call selects rows and columns together (no chained indexing).
        clusterArticles = df.loc[subSet, ['sourceDate', 'zoomLat', 'zoomLon']]
        print('clusterArticles')
        print(clusterArticles)
        # perform clustering on the articles in that cluster based on the lat/long great circle distance
        spacialTemporalCluster = distance_cluster(clusterArticles, ['zoomLat', 'zoomLon'])
        print('temporal cluster*******************************')
        print(timestamps)
        print('spatial/temporal cluster***********************')
        print(spacialTemporalCluster[:].values)
        print()
        print()
        results.append(spacialTemporalCluster)
    return results


# Process articles

In [224]:
# Driver: run both clustering approaches for every disease of interest.
diseaseList = get_disease_list()
# for each disease get a list of articles and cluster them
for disease in diseaseList:
    # Posts tagged with this disease, already flattened by get_articles.
    articleList = list(get_articles(disease))
    print("{0} articles for {1}".format(len(articleList), disease))
    # One row per article; the dict keys become the columns.
    df = pd.DataFrame(articleList)
    # Approach 1: a single clustering pass over lon / lat / timestamp together.
    cluster_single_step(df)
    # Approach 2: temporal clustering first, then spatial clustering within
    # each temporal group (this call prints its results).
    cluster_two_step(df)
    

100 articles for Anthrax
clusterArticles
   sourceDate    zoomLat    zoomLon
0   135.27648  48.707531  19.491650
1   135.27648  49.202110  21.652161
distance data:
     zoomLat    zoomLon
0  48.707531  19.491650
1  49.202110  21.652161
temporal cluster*******************************
[ 135.27648  135.27648]
spatial/temporal cluster***********************
[   sourceDate    zoomLat   zoomLon
0   135.27648  48.707531  19.49165
    sourceDate   zoomLat    zoomLon
1   135.27648  49.20211  21.652161]


clusterArticles
   sourceDate    zoomLat     zoomLon
2   94.400886  23.873001  121.016998
6   94.408869   8.626220   39.616032
distance data:
     zoomLat     zoomLon
2  23.873001  121.016998
6   8.626220   39.616032
temporal cluster*******************************
[ 94.4008859  94.4088692]
spatial/temporal cluster***********************
[   sourceDate    zoomLat     zoomLon
2   94.400886  23.873001  121.016998
    sourceDate  zoomLat    zoomLon
6   94.408869  8.62622  39.616032]


clusterArticl

# Questions:
What would be the next steps?  How do we determine which cluster future articles fall into?  Do I just re-run this with all the data?