In [7]:
import pandas as pd
from tboc import sql
# Pull every London firm from the innovation database.
df = sql.innovation.sql_query("SELECT * FROM firms WHERE firms.city = 'London'")
# Keep a single snapshot: the second-to-last distinct collection date, in the
# order the dates appear in the query result.
date = df.collection_date.unique()[-2]
df = df[df.collection_date == date]
# Drop firms without a latitude; notna() is the idiomatic form of `isnull() == False`.
df = df[df.latitude.notna()]
# Save the filtered snapshot (without the index) as the test dataset.
df.to_csv('test_data.csv', index = False)

# DB Scan Clustering
This Jupyter Notebook demonstrates the use of my DB Scan script. The script can be used to identify 
clusters according to criteria defined by the user, i.e. a point is only included in a cluster if the cluster itself 
reaches a critical mass of points, and the user can set a maximum distance beyond which a point
can no longer be considered a part of the same cluster.

The test dataset used below is a dataframe of innovative companies in London from Crunchbase. We will perform DB Scan clustering to identify where clusters of these firms exist in London.

In [8]:
# Load the test data from CSV and preview three randomly chosen rows.
# No random_state is passed to sample(), so the preview differs on every run.
df = pd.read_csv('test_data.csv')
df.sample(3)

Unnamed: 0,iid,organization_name,description,cb_rank_(company),categories,headquarters_location,city,number_of_employees,coord,latitude,...,collection_date,founded_date,founded_date_precision,website,funding_status,last_funding_type,last_funding_amount_currency_(in_usd),similarweb__global_traffic_rank,similarweb__average_visits_(6_months),ipo_status
2252,30596,Warren Creative - Brand Design Agency,Warren Creative is an award-winning specialist...,441513.0,"Advertising, Brand Marketing, Graphic Design, ...","London, England, United Kingdom",London,,"[51.520939, -0.09863399999999999]",51.520939,...,2019-08-01,,,,,,,,,
2152,30426,UBM Plc,UBM is a global events-led marketing and commu...,44474.0,"B2B, Digital Marketing, Events, Information Se...","London, England, United Kingdom",London,,"[51.5066188, -0.1040966]",51.506619,...,2019-08-01,,,,,,,,,
1659,29591,PAT Testing Nationwide,PAT Testing Company UK,698203.0,,"London, England, United Kingdom",London,,"[51.523464, -0.1027328]",51.523464,...,2019-08-01,,,,,,,,,


In [9]:
# Import the clustering function from the DB Scan script and print its documentation
from db_scan import db_scan_clustering
help(db_scan_clustering)

Help on function db_scan_clustering in module db_scan:

db_scan_clustering(df, lat_col, long_col, max_distance, min_sample)
    Identifes clusters from a sample of points. Clusters are based on closest distance, however
    what defines a cluster is qualified below. The user can specify a minimum number of points 
    needed to form a cluster, as well as specifying the maximum distance which should seperate
    one cluster from another.
    
    df              Your dataframe of points, including longitude and latitude coordinates.
    long_col        The column containing longitude coordinates
    latitude_col    The column containing latitude coordinates
    max distance    The maximum distance (in meters) between a cluster and a point in order
                    for it to be considered a part of the same cluster.
    min sample      The minimum number of points needed to form a cluster
    
    Returns a series of cluster labels. Points which recieve a cluster label of '0' do not q

In [10]:
# Find all clusters of 8 or more companies and store each firm's label in a new
# 'cluster' column. Per the function's help text, 600 is max_distance (in meters)
# and 8 is min_sample (the minimum number of points needed to form a cluster).
df['cluster'] = db_scan_clustering(df, 'latitude', 'longitude', 600, 8)

In [11]:
# Use my plot folium script to quickly plot the points on an exploratory map
# Note - the map will not display inside the GitHub preview. You will need to download
# and open the notebook.

from plot_folium import get_folium_map
import warnings # Used to turn off warnings (e.g. deprecation warnings)
warnings.simplefilter("ignore")  # no category is given, so ALL warnings are ignored


# Only rows whose cluster label is not -1 are passed to the map.
# NOTE(review): the db_scan_clustering help text says unclustered points receive
# the label '0' - confirm which label actually marks "no cluster".
m = get_folium_map(df[df.cluster != -1], 'latitude', 'longitude', 
                   colour_by = 'cluster', 
                   popup_name_col = 'organization_name')

# Leave the map object as the last expression so the notebook renders it
m