In [1]:
# In order to run this code, you will need to create a plotly account and get a plotly api key
# Create plotly_cred.py in working directory which should contain plotly_username and plotly_api_key variables
# nlp.py needs to be run before this program unless the /data folder already contains dbscan_recent_years_pc.pkl
# Supporting write-up: https://medium.com/@rchen1990/clustering-and-tracking-aviation-accidents-over-time-3406ac63028e

import json
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
import pickle as pk
from gensim.models import KeyedVectors
from sklearn.cluster import DBSCAN
import time
from datetime import datetime
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import plotly


# Switch cufflinks and plotly to offline notebook mode (the Series.iplot call further down goes through cufflinks).
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of embedding it in the notebook — confirm.
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)



In [2]:
# ----- Change working directory to the data-scientist-exercise02 clone root if not already there
# The notebook is run from <clone>/src while the data path used below ('data/...') is relative
# to the clone root, so step up one level instead of relying on a machine-specific absolute path.
print(os.getcwd())
if os.path.basename(os.getcwd()) == 'src':
    os.chdir(os.pardir)
print(os.getcwd())

C:\Users\mh656tu\Documents\PDBD\Bio\DSRTI\data-scientist-exercise02\src
C:\Users\mh656tu\Documents\PDBD\Bio\DSRTI\data-scientist-exercise02


In [4]:
# ----- Set plotly credentials
import plotly_cred

In [5]:
# Hand the username / API key defined in plotly_cred.py to plotly's credentials helper
plotly.tools.set_credentials_file(
    username=plotly_cred.plotly_username,
    api_key=plotly_cred.plotly_api_key,
)

In [6]:
def wrap_manually(text, n):
    """
    Function to insert <br> to manually wrap text in plotly.

    The text is split on whitespace and a <br> is placed between every
    group of n words. No <br> is added after the last group, so the
    wrapped text does not end with an empty line.

    Parameters:
    -----------
    text: string to manually wrap
    n: number of words to wrap after (must be a positive integer)

    Returns:
    --------
    ret: string with <br> inserted; '' when text contains no words

    Raises:
    -------
    ValueError: if n is less than 1
    """
    if n < 1:
        # A non-positive chunk size cannot produce a meaningful wrap
        raise ValueError('n must be a positive integer, got %r' % (n,))

    words = text.split()
    chunks = (' '.join(words[i:i + n]) for i in range(0, len(words), n))
    return '<br>'.join(chunks)

In [10]:
# ----- Load the pickled results DataFrame written by nlp.py and add a hover-text column
# 'pc_plotly' holds each probable cause wrapped every 10 words for display in plotly
df = pd.read_pickle('data/dbscan_recent_years_pc.pkl')
df['pc_plotly'] = df['probable_cause'].map(lambda cause: wrap_manually(cause, 10))

In [11]:
# ----- Sanity-check the loaded DataFrame: dimensions, column names, then the first rows
for summary in (df.shape, df.columns):
    print(summary)
df.head(n=5)

(5433, 39)
Index(['EventId', 'label', 'narrative', 'probable_cause', 'Investigation Type',
       'Accident Number', 'Event Date', 'Location', 'Country', 'Latitude',
       'Longitude', 'Airport Code', 'Airport Name', 'Injury Severity',
       'Aircraft Damage', 'Aircraft Category', 'Registration Number', 'Make',
       'Model', 'Amateur Built', 'Number of Engines', 'Engine Type',
       'FAR Description', 'Schedule', 'Purpose of Flight', 'Air Carrier',
       'Total Fatal Injuries', 'Total Serious Injuries',
       'Total Minor Injuries', 'Total Uninjured', 'Weather Condition',
       'Broad Phase of Flight', 'Report Status', 'Publication Date',
       'Event Year Month', 'Event Year', 'Event Date DT Format', '_merge',
       'pc_plotly'],
      dtype='object')


Unnamed: 0,EventId,label,narrative,probable_cause,Investigation Type,Accident Number,Event Date,Location,Country,Latitude,...,Total Uninjured,Weather Condition,Broad Phase of Flight,Report Status,Publication Date,Event Year Month,Event Year,Event Date DT Format,_merge,pc_plotly
0,20131113X11809,0,NTSB investigators used data provided by vario...,The pilot's failure to see and avoid wires whi...,Accident,WPR14CA052,10/28/2013,"Milo, OR",United States,42.953611,...,,VMC,,Probable Cause,07/06/2015,2013-10,2013,2013-10-28,both,The pilot's failure to see and avoid wires whi...
1,20110523X52725,1,NTSB investigators used data provided by vario...,The pilot's loss of directional control while ...,Accident,CEN11CA353,05/12/2011,"Guymon, OK",United States,36.686111,...,4.0,VMC,LANDING,Probable Cause,06/08/2017,2011-05,2011,2011-05-12,both,The pilot's loss of directional control while ...
2,20110912X00553,2,NTSB investigators may not have traveled in su...,The loss of engine power following takeoff for...,Accident,CEN11LA636,09/11/2011,"Kokomo, IN",United States,40.528056,...,,VMC,CLIMB,Probable Cause,11/07/2012,2011-09,2011,2011-09-11,both,The loss of engine power following takeoff for...
3,20141202X05625,-1,NTSB investigators used data provided by vario...,The pilot's inadvertent selection of an incorr...,Accident,ERA15CA065,11/25/2014,"Hilton Head Island, SC",United States,32.224445,...,1.0,IMC,APPROACH,Probable Cause,01/22/2015,2014-11,2014,2014-11-25,both,The pilot's inadvertent selection of an incorr...
4,20120821X31159,3,NTSB investigators used data provided by vario...,The pilot’s improper use of the brakes during ...,Accident,WPR12CA363,08/19/2012,"Hillsboro, OR",United States,45.428334,...,1.0,VMC,LANDING,Probable Cause,12/05/2012,2012-08,2012,2012-08-19,both,The pilot’s improper use of the brakes during ...


In [12]:
# ----- Histogram of how many records fall under each cluster label
label_series = df['label']
label_series.iplot(
    kind='hist',
    bins=243,
    xTitle='Clusters',
    yTitle='Count',
    title='Cluster Distribution',
)

In [15]:
# ----- Print the record count of every cluster label
cluster_counts = df['label'].value_counts()
# Lift pandas' display limits for this print only so the listing is not truncated
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(cluster_counts)

-1      3583
 5       293
 2       218
 12      165
 9       160
 6        74
 7        71
 11       35
 63       27
 19       21
 70       20
 41       20
 82       19
 86       16
 22       16
 29       16
 80       12
 65       12
 32       12
 53       12
 3        11
 48       11
 15       10
 25        9
 88        9
 57        9
 23        7
 149       7
 17        6
 145       6
 24        6
 167       6
 73        6
 112       6
 126       5
 166       5
 35        5
 188       5
 46        5
 111       5
 28        5
 179       5
 92        5
 72        5
 76        5
 125       4
 183       4
 216       4
 52        4
 196       4
 1         4
 147       4
 180       4
 143       4
 168       4
 206       4
 51        4
 38        4
 4         4
 124       4
 75        4
 177       4
 116       4
 204       3
 109       3
 45        3
 108       3
 155       3
 142       3
 107       3
 113       3
 59        3
 37        3
 114       3
 71        3
 40        3
 26        3

In [16]:
# ----- Re-index the records by event date so the date can serve as the time-series x axis
df2 = df.set_index(keys='Event Date DT Format')

In [17]:
# ----- Build the per-trace 'visible' settings used by the default visualization
# Position i of show_no_show belongs to cluster label i - 1 (position 0 is the -1 label),
# so the chosen cluster labels are shifted by one before being looked up.
indexes_prep = [2, 3, 5, 7, 16]
indexes = [cluster_label + 1 for cluster_label in indexes_prep]
show_no_show = [
    True if position in indexes else 'legendonly'
    for position in range(243)
]

In [18]:
# Visualization of probable cause clusters over time


def _cluster_trace(label):
    """
    Build the plotly scatter trace dict for a single cluster label.

    Parameters:
    -----------
    label: integer cluster label to select from df2['label']

    Returns:
    --------
    dict describing a markers-only trace with the rows of df2 whose label
    equals `label`: index values on x, the label on y, and the wrapped
    probable cause ('pc_plotly') as hover text
    """
    # Filter once per label and reuse the result for x, y and text
    cluster = df2[df2['label'] == label]
    return {
        'x': cluster.index,
        'y': cluster['label'],
        'text': cluster['pc_plotly'],
        'hoverinfo': 'y+text',
        'hoverlabel': dict(namelength=-1,
                           font=dict(size=11),
                           align='left'),
        'name': 'Cluster #: '+str(label),
        'mode': 'markers',
        # show_no_show is offset by one because the labels start at -1
        'visible': show_no_show[label+1]
    }


py.iplot(
        {
            'data': [_cluster_trace(label) for label in range(-1, 242, 1)],
            'layout': {
                    'title': 'Probable Causes by Cluster over Time',
                    'xaxis': {'title': 'Date'},
                    'yaxis': {'title': 'Cluster #',
                              'range': [-1, 242]},
                    'hovermode': 'closest'
                    }
        }, filename='scatter')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~chenrocky/0 or inside your plot.ly account where it is named 'scatter'
