In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Mount Google Drive into the Colab runtime; the station CSV loaded below is
# read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Station list with per-station entry/exit counts, read from the mounted Drive.
slist = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/Data Links/stations_entry_exit.csv",
)

# Separate DataFrame handle over the same station data; the matplotlib
# scatter plots further down read their "full set" points from `slist_df`.
slist_df = pd.DataFrame(data=slist)

In [4]:
# Preview the first rows of the station table.
slist.head()

Unnamed: 0,STATION_ID,NAME,lat,lon,ENTRY_COUNT,EXIT_COUNT
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504
1,MSTN_002,Archives-Navy Meml,38.893673,-77.021917,210151,206475
2,MSTN_003,Benning Road,38.890983,-76.938367,58776,52299
3,MSTN_004,Brookland-CUA,38.933219,-76.994537,138135,136010
4,MSTN_005,Capitol South,38.88507,-77.005142,159656,173799


# Entry, Exit Common

In [5]:
# Weight stations by traffic: every station row is repeated once per full
# 1000 recorded entries (ENTRY_COUNT // 1000), so the coordinate sample fed
# to the clustering cells contains busier stations more often.
entry_data_slist = slist.loc[
    slist.index.repeat(slist["ENTRY_COUNT"].floordiv(1000))
]

In [6]:
# Preview the expanded table; repeated rows keep their original index label.
entry_data_slist.head()

Unnamed: 0,STATION_ID,NAME,lat,lon,ENTRY_COUNT,EXIT_COUNT
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504
0,MSTN_001,Anacostia,38.862971,-76.995373,150542,154504


In [7]:
# NOTE: DataFrame.size is the total cell count (rows x columns), not the
# number of rows; use len(entry_data_slist) if a row count is wanted.
entry_data_slist.size

92556

In [8]:
# Same traffic weighting as for entries, driven by exits: every station row
# is repeated once per full 1000 recorded exits (EXIT_COUNT // 1000).
exit_data_slist = slist.loc[
    slist.index.repeat(slist["EXIT_COUNT"].floordiv(1000))
]

In [9]:
# NOTE: DataFrame.size is the total cell count (rows x columns), not the
# number of rows; use len(exit_data_slist) if a row count is wanted.
exit_data_slist.size

92592

# **Evaluation**

In [10]:
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.cluster import KMeans, OPTICS
from sklearn.metrics import silhouette_samples, silhouette_score
from geopy.distance import great_circle

# Latitude/longitude (in that column order) of the entry-weighted sample;
# this is the input to every clustering evaluation below.
coords = entry_data_slist[["lat", "lon"]]

In [11]:
from sklearn.cluster import KMeans

In [12]:
#https://realpython.com/k-means-clustering-python/

 **Affinity Propagation**

In [13]:
from sklearn.cluster import AffinityPropagation

In [1]:
# Affinity Propagation fitted on the coordinates converted to radians.
fa = AffinityPropagation(random_state=2)
fa.fit(X=np.radians(coords))

# Mean silhouette of the resulting labels, scored against the unconverted
# (degree) coordinates and rounded to two decimals.
fa_silhouette = np.round(silhouette_score(coords, fa.labels_), 2)


NameError: ignored

In [None]:
# Display the Affinity Propagation silhouette score computed above.
fa_silhouette

**DBSCAN**

In [None]:
# DBSCAN over great-circle (haversine) distance. The haversine metric works
# on [lat, lon] in radians, so the neighbourhood radius is expressed as
# kilometres divided by the Earth radius in kilometres.
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian  # 1.5 km radius, in radians
dbscan_coords_rad = np.radians(coords.to_numpy())  # columns: [lat, lon]

db = cluster.DBSCAN(eps=epsilon, min_samples=370, algorithm='ball_tree', metric='haversine')
db.fit(dbscan_coords_rad)

# Score with the same metric and units the model was fitted with; the
# previous version scored Euclidean distance on degrees instead.
# DBSCAN marks noise with label -1. Noise is not a cluster, so it is left
# out of the silhouette; the score is undefined with fewer than two clusters.
dbscan_labels = db.labels_
dbscan_clustered = dbscan_labels != -1
if np.unique(dbscan_labels[dbscan_clustered]).size >= 2:
    dbscan_silhouette = np.round(
        silhouette_score(
            dbscan_coords_rad[dbscan_clustered],
            dbscan_labels[dbscan_clustered],
            metric='haversine',
        ),
        2,
    )
else:
    dbscan_silhouette = float('nan')

In [None]:
# Display the DBSCAN silhouette score computed above.
dbscan_silhouette

**Kmeans**

In [None]:
# K-Means with 14 clusters on the raw lat/lon values. The seed is fixed so
# the centroid initialisation, and therefore the reported score, is the same
# on every run (the Entry/Exit K-Means cells below also use random_state=1).
km = KMeans(n_clusters=14, random_state=1)
km.fit(coords)

# Mean silhouette of the K-Means labels, rounded to two decimals.
kmeans_silhouette = np.round(silhouette_score(coords, km.labels_), 2)

In [None]:
# Display the K-Means (k=14) silhouette score computed above.
kmeans_silhouette

**OPTICS**

In [None]:
# OPTICS over great-circle distance, mirroring the DBSCAN setup above.
kms_per_radian = 6371.0088
epsilon = 1.5 / kms_per_radian  # 1.5 km radius, in radians

# `epsilon` is a distance, so it belongs in `max_eps` (the largest
# neighbourhood radius considered). It was previously passed as
# `min_cluster_size`, which is a sample count or a fraction of the data set.
# The input is in radians, so the haversine metric is used for distances.
clust = OPTICS(min_samples=370, max_eps=epsilon, metric='haversine', algorithm='ball_tree')

clust.fit(np.radians(coords))

In [None]:
# Mean silhouette of the OPTICS labels, scored against the unconverted
# (degree) coordinates and rounded to two decimals.
Optics_silhouette = np.round(silhouette_score(coords, clust.labels_), 2)

In [None]:
# Display the OPTICS silhouette score computed above.
Optics_silhouette

**Agglomerative**



In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative clustering with the library's default settings, fitted on
# the raw lat/lon values.
clustering = AgglomerativeClustering()
clustering = clustering.fit(coords)

In [None]:
# Mean silhouette of the agglomerative labels, rounded to two decimals.
Agglomerative_silhouette = np.round(silhouette_score(coords, clustering.labels_), 2)


In [None]:
# Display the agglomerative-clustering silhouette score computed above.
Agglomerative_silhouette

K-Means Evaluation Using Elbow Method

In [None]:
# Fit K-Means for a range of k and record two diagnostics per k:
#   * sse             - inertia (within-cluster sum of squared distances),
#                       the quantity an elbow plot is read from
#   * silhouette_by_k - mean silhouette score, rounded to two decimals
# The previous version stored silhouette scores in `sse` and plotted them
# under the label "Sum of squared distance".
sse = []
silhouette_by_k = []
list_k = list(range(2, 20))

for k in list_k:
    # Loop-local names keep the `km` / `kmeans_silhouette` results of the
    # k=14 cell above intact; the seed makes the curves reproducible.
    km_for_k = KMeans(n_clusters=k, random_state=1).fit(coords)
    sse.append(km_for_k.inertia_)
    silhouette_by_k.append(np.round(silhouette_score(coords, km_for_k.labels_), 2))

# Left: elbow curve (inertia vs k). Right: silhouette score vs k.
fig, (ax_sse, ax_sil) = plt.subplots(1, 2, figsize=(12, 6))
ax_sse.plot(list_k, sse, '-o')
ax_sse.set_xlabel(r'Number of clusters *k*')
ax_sse.set_ylabel('Sum of squared distance')
ax_sil.plot(list_k, silhouette_by_k, '-o')
ax_sil.set_xlabel(r'Number of clusters *k*')
ax_sil.set_ylabel('Mean silhouette score')
fig.tight_layout();

In [None]:
# As we can see in the above elbow plot
# there is no elbow, hence k means not good for our analysis

K-Means Evaluation Using Silhouette Analysis

In [None]:
# For each candidate k, draw a silhouette plot (left) next to a map-style
# scatter of the clustered coordinates with the centroids marked (right).
for k in [4, 5, 6, 7, 8]:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Run the Kmeans algorithm (seeded so the figures are reproducible)
    km = KMeans(n_clusters=k, random_state=1)
    labels = km.fit_predict(coords)
    centroids = km.cluster_centers_

    # Get silhouette samples
    silhouette_vals = silhouette_samples(coords, labels)

    # Silhouette plot: one horizontal band per cluster, stacked bottom-up.
    # The loop variable must NOT be called `cluster`: at notebook top level
    # that rebinds the imported `sklearn.cluster` module to an integer and
    # breaks later `cluster.KMeans(...)` / `cluster.DBSCAN(...)` calls.
    y_lower, y_upper = 0, 0
    for band_number, cluster_id in enumerate(np.unique(labels), start=1):
        cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster_id])
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(band_number))
        y_lower += len(cluster_silhouette_vals)

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);

    # Scatter plot of data colored with labels; centroid columns follow the
    # [lat, lon] column order of `coords`, hence index 1 on the x axis.
    ax2.scatter(coords['lon'], coords['lat'], c=labels)
    ax2.scatter(centroids[:, 1], centroids[:, 0], marker='*', c='r', s=200)
    ax2.set_xlabel('Longitude')
    ax2.set_ylabel('Latitude')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}', fontsize=16, fontweight='semibold', y=1.05);

# Entry K-Means

In [None]:
# K-Means (7 clusters) on the entry-weighted station coordinates.
coords_for_entry = entry_data_slist.loc[:, ["lat", "lon"]]

# Use the directly imported `KMeans` class rather than `cluster.KMeans`:
# the name `cluster` is reused as a loop variable in the silhouette-analysis
# cell, so on a top-to-bottom run it may no longer refer to `sklearn.cluster`.
k_means = KMeans(n_clusters=7, max_iter=10, random_state=1)
k_means.fit(coords_for_entry)
cluster_labels = k_means.labels_
num_clusters = len(set(cluster_labels))
# One DataFrame of member coordinates per cluster label.
clusters = pd.Series([coords_for_entry[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))
# The cluster centres serve as the representative ("centermost") points.
centermost_points = centroids = k_means.cluster_centers_

# Print centroids
centermost_points

In [None]:
# Split the (lat, lon) centroid rows into two tuples, then tabulate them with
# longitude as the first column and latitude as the second.
lats, lons = (tuple(column) for column in centermost_points.T)
rep_points_entry = pd.DataFrame(dict(lon=lons, lat=lats))

In [None]:
# Overlay the entry centroids (large green markers) and every station
# (small black markers) on one longitude/latitude scatter.
fig, ax = plt.subplots(figsize=(10, 6))
rs_scatter = ax.scatter(
    x=rep_points_entry['lon'],
    y=rep_points_entry['lat'],
    c='#99cc99',
    edgecolor='None',
    alpha=0.7,
    s=120,
)
df_scatter = ax.scatter(x=slist_df['lon'], y=slist_df['lat'], c='k', alpha=0.9, s=3)
ax.set_title('Full data set vs KMeans reduced set')
ax.set(xlabel='Longitude', ylabel='Latitude')
ax.legend(
    handles=[df_scatter, rs_scatter],
    labels=['Full set', 'Reduced set'],
    loc='upper right',
)
plt.show()

In [None]:
# Check whether the chart_studio package is installed.
# `sys.modules` only lists modules that have already been imported in this
# kernel, so it reports False for a package that is installed but not yet
# imported. `importlib.util.find_spec` looks the package up on the import
# path without importing it.
import importlib.util
import sys  # no longer used here; kept in case later cells rely on `sys`

chart_studio_installed = importlib.util.find_spec('chart_studio') is not None
chart_studio_installed
# False == not installed

In [None]:
!pip install chart_studio

In [None]:
import os
from getpass import getpass

import chart_studio.plotly as py
from plotly.tools import FigureFactory as ff
from chart_studio import tools as tl

# Credentials must not live in the notebook. The API key is read from the
# PLOTLY_API_KEY environment variable, with an interactive prompt as
# fallback. The key that was previously hardcoded here should be treated as
# leaked and regenerated in the Chart Studio account settings.
plotly_username = os.environ.get('PLOTLY_USERNAME', 'aryapriyank')
plotly_api_key = os.environ.get('PLOTLY_API_KEY') or getpass('Chart Studio API key: ')
tl.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

# Reorder the centroid columns to (lat, lon) and publish them as a table.
newdata = rep_points_entry.iloc[0:, [1,0]]
table = ff.create_table(newdata)
py.iplot(table, filename='Plot station points on gmap')

In [None]:
import os
from getpass import getpass

import plotly.graph_objs as go

# The Mapbox token must not live in the notebook. It is read from the
# MAPBOX_ACCESS_TOKEN environment variable, with an interactive prompt as
# fallback. The token that was previously hardcoded here should be treated
# as leaked and rotated in the Mapbox account. Later cells reuse this name.
mapbox_access_token = os.environ.get('MAPBOX_ACCESS_TOKEN') or getpass('Mapbox access token: ')

site_lat = rep_points_entry.lat
site_lon = rep_points_entry.lon
# locations_name = slist.NAME

# Two marker layers over the same entry centroids: a small opaque red dot
# and a larger translucent green dot.
data_for_entry_map = [
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker=dict(
            size=3,
            color='rgb(255, 0, 0)',
            opacity=1
        ),
        # text=locations_name,
        hoverinfo='text'
    ),
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker=dict(
            size=8,
            color='rgb(34, 139, 34)',
            opacity=0.8
        ),
        hoverinfo='none'
    )]

# Map layout: 'light' style, centred on (38.9, -77.1) at zoom level 10.
layout_for_entry_map = go.Layout(
    title='station plots',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=38.9,
            lon=-77.1
        ),
        pitch=0,
        zoom=10,
        style='light'
    ),
)

# Generate the figure using the iplot function

fig_entry = dict(data=data_for_entry_map, layout=layout_for_entry_map)
py.iplot(fig_entry, filename='Plot station points on gmap')

# Exit K-Means

In [None]:
# K-Means (6 clusters) on the exit-weighted station coordinates.
import matplotlib.pyplot as plt
from sklearn import cluster
from geopy.distance import great_circle

coords_for_exit = exit_data_slist.loc[:,["lat","lon"]]

k_means = cluster.KMeans(n_clusters=6, max_iter=10, random_state=1)
# Fit on the exit sample. The previous version built `coords_for_exit` but
# then fitted and sliced `coords`, which holds the entry-weighted sample, so
# the "exit" centroids were really entry centroids.
k_means.fit(coords_for_exit)
cluster_labels = k_means.labels_
num_clusters = len(set(cluster_labels))
# One DataFrame of member coordinates per cluster label.
clusters = pd.Series([coords_for_exit[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))
# The cluster centres serve as the representative ("centermost") points.
centermost_points = centroids = k_means.cluster_centers_

# Print centroids
centermost_points

In [None]:
# Split the (lat, lon) centroid rows into two tuples, then tabulate them with
# longitude as the first column and latitude as the second.
lats, lons = (tuple(column) for column in centermost_points.T)
rep_points_exit = pd.DataFrame(dict(lon=lons, lat=lats))

In [None]:
# Overlay the exit centroids (large green markers) and every station
# (small black markers) on one longitude/latitude scatter.
fig, ax = plt.subplots(figsize=(10, 6))
rs_scatter = ax.scatter(
    x=rep_points_exit['lon'],
    y=rep_points_exit['lat'],
    c='#99cc99',
    edgecolor='None',
    alpha=0.7,
    s=120,
)
df_scatter = ax.scatter(x=slist_df['lon'], y=slist_df['lat'], c='k', alpha=0.9, s=3)
ax.set_title('Full data set vs KMeans reduced set')
ax.set(xlabel='Longitude', ylabel='Latitude')
ax.legend(
    handles=[df_scatter, rs_scatter],
    labels=['Full set', 'Reduced set'],
    loc='upper right',
)
plt.show()

In [None]:
# Swap the two centroid columns (second column first) and publish the
# result as a Chart Studio table.
newdata = rep_points_exit.iloc[:, [1, 0]]
table = ff.create_table(newdata)
py.iplot(
    table,
    filename='Plot station points on gmap',
)

In [None]:
# Exit centroids to be drawn on the map.
site_lat, site_lon = rep_points_exit.lat, rep_points_exit.lon
# locations_name = slist.NAME

# Two marker layers over the same exit centroids: a small opaque red dot
# and a larger translucent orange-red dot.
data_for_exit_map = [
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker={'size': 3, 'color': 'rgb(255, 0, 0)', 'opacity': 1},
        # text=locations_name,
        hoverinfo='text',
    ),
    go.Scattermapbox(
        lat=site_lat,
        lon=site_lon,
        mode='markers',
        marker={'size': 8, 'color': 'rgb(242, 68, 55)', 'opacity': 0.8},
        hoverinfo='none',
    ),
]

# Map layout: 'light' style, centred on (38.9, -77.1) at zoom level 10.
layout_for_exit_map = go.Layout(
    title='station plots',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    mapbox={
        'accesstoken': mapbox_access_token,
        'bearing': 0,
        'center': {'lat': 38.9, 'lon': -77.1},
        'pitch': 0,
        'zoom': 10,
        'style': 'light',
    },
)

# Bundle traces and layout, then render through Chart Studio.
fig_exit = {'data': data_for_exit_map, 'layout': layout_for_exit_map}
py.iplot(fig_exit, filename='Plot station points on gmap')

# Scatter Plot K-Means

In [None]:
import plotly.express as px
# px.set_mapbox_access_token(open(".mapbox_token").read())
px.set_mapbox_access_token(mapbox_access_token)

# Entry view: one marker per station, sized by ENTRY_COUNT. The previous
# version sized by EXIT_COUNT, which made this figure a duplicate of the
# exit figure in the next cell.
fig_entry_sc = px.scatter_mapbox(slist, lat="lat", lon="lon", color="STATION_ID", size="ENTRY_COUNT",
                  color_continuous_scale=px.colors.cyclical.IceFire, size_max=15, zoom=10)
fig_entry_sc.show()

In [None]:
# Exit view: one marker per station, coloured by STATION_ID and sized by
# EXIT_COUNT.
fig_exit_sc = px.scatter_mapbox(
    data_frame=slist,
    lat="lat",
    lon="lon",
    color="STATION_ID",
    size="EXIT_COUNT",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)
fig_exit_sc.show()