In [1]:
# using plotly, visualize our pandas dataframe for our lung nodules

# for plotly to work locally the following modules must be imported
import plotly.express as px
import plotly.graph_objects as go
import chart_studio.plotly as py
import cufflinks as cf 
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot,   iplot
# Set up plotly so figures render inside the notebook.
# NOTE(review): connected=True presumably makes plotly.js load from a CDN
# instead of being embedded in the notebook -- confirm against the
# plotly.offline documentation.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode.
# NOTE(review): presumably so no Chart Studio connection is needed -- confirm.
cf.go_offline()


# other imports
import numpy as np 
import pandas as pd 

%matplotlib inline

In [2]:
# Load the candidate table from disk, using the 'Unnamed: 0' column as the index.
df_data = pd.read_csv(
    'Candidates_All_mhd_Paths_Ubuntu.csv',
    index_col='Unnamed: 0',
)

# Preview the first few rows.
df_data.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class,diameter_mm,File_Paths
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0,0.0,/media/e_quitee/Data Drive/LUNASet/subset8/sub...
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0,0.0,/media/e_quitee/Data Drive/LUNASet/subset8/sub...
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0,0.0,/media/e_quitee/Data Drive/LUNASet/subset8/sub...
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0,0.0,/media/e_quitee/Data Drive/LUNASet/subset8/sub...
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0,0.0,/media/e_quitee/Data Drive/LUNASet/subset8/sub...


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_data.describe()

Unnamed: 0,coordX,coordY,coordZ,class,diameter_mm
count,551065.0,551065.0,551065.0,551065.0,551065.0
mean,-2.005825,8.944721,-157.857852,0.002452,0.014954
std,71.711619,89.428035,277.21232,0.049453,0.42255
min,-201.1135,-317.33,-895.22,0.0,0.0
25%,-62.126786,-35.780058,-224.79,0.0,0.0
50%,-17.271143,19.61,-172.405402,0.0,0.0
75%,61.87,59.08,-120.132968,0.0,0.0
max,217.433291,292.38,1906.140031,1.0,32.27


In [4]:
# 3D scatter of the rows labelled class == 1, positioned by their
# coordX / coordY / coordZ values; marker colour encodes diameter_mm.
is_nodule = df_data['class'] == 1

# Appearance settings, kept apart from the data mapping below.
scatter_style = dict(
    color='diameter_mm',
    color_continuous_scale='Viridis',
    opacity=0.6,
    template='plotly_dark',
    width=800,
    height=800,
)

fig = px.scatter_3d(
    df_data.loc[is_nodule],
    x='coordX',
    y='coordY',
    z='coordZ',
    title='3d Scatter of Lung Nodule Locations',
    **scatter_style,
)
fig.show()

In [6]:
# Histogram of nodule diameters, restricted to rows labelled class == 1.
#
# A histogram works on a single variable: it bins the values and counts the
# occurrences in each bin, so only the diameter column is handed to plotly.
nodule_diameters = df_data.loc[df_data['class'] == 1, 'diameter_mm']

fig = px.histogram(
    nodule_diameters,
    x='diameter_mm',
    nbins=15,
    marginal='violin',
    title='Diameter Distribution for Lung Nodules',
    color_discrete_sequence=['orange'],
    opacity=0.7,
    template='plotly_dark',
    width=1000,
    height=500,
)

# Left as the cell's last expression: update_layout hands back the figure,
# which is what makes the notebook render it.
fig.update_layout(
    bargap=0.2,
    showlegend=False,
    yaxis_title_text='Frequency_Counts',
)

In [71]:
# Side-by-side scatter plots of nodule diameter against each spatial
# coordinate (coordX / coordY / coordZ), sharing one y-axis.

# Filter once instead of re-filtering the full frame for every trace.
nodules = df_data.loc[df_data['class'] == 1]

# (coordinate column, marker colour) for each panel, left to right.
panels = [('coordX', 'orange'), ('coordY', 'yellow'), ('coordZ', 'red')]

# Plotly renders text with a small HTML subset, so a line break in a title
# must be written as <br>; a literal '\n' is not rendered as a new line.
fig = make_subplots(
    rows=1,
    cols=len(panels),
    subplot_titles=['Diameter vs<br>' + coord for coord, _ in panels],
    shared_yaxes=True,
)

for col, (coord, colour) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(
            x=nodules[coord],
            y=nodules['diameter_mm'],
            mode='markers',  # markers only: no lines joining the points
            opacity=0.4,
            marker_color=colour,
            showlegend=False,
        ),
        row=1,
        col=col,
    )
    fig.update_xaxes(title_text=coord, row=1, col=col)

fig.update_layout(
    template='plotly_dark',
    title='Lung Nodule Diameter vs. Coordinates',
    font=dict(
        family='Monospace',
        size=18,
    ),
    yaxis_title='Diameter (mm)',
)

fig.show()

In [38]:
# Kernel density estimate (KDE) of nodule diameters, built with plotly's
# figure factory. A KDE is a smooth estimate of the probability density
# function underlying the data.
#
# ff.create_distplot() expects a *list* of datasets, one entry per group, and
# draws a histogram with a KDE curve for each of them. The per-group options
# (group_labels, bin_size, colors) are lists as well, where element i applies
# to dataset i. Only a single group is plotted here.
nodule_diameters = df_data.loc[df_data['class'] == 1, 'diameter_mm']

fig = ff.create_distplot(
    [nodule_diameters],
    group_labels=['sizes'],
    bin_size=[2],
    colors=['rgba(235, 221, 120, 0.5)'],  # colours are given as rgba() strings
)

fig.update_layout(
    template='plotly_dark',
    title='Kernel Density Estimation -- Nodule Size Distributions',
    xaxis_title='Diameter (mm)',
    # create_distplot normalises to a probability density by default, so the
    # y-axis shows densities rather than probabilities.
    yaxis_title='Probability Density',
    font=dict(
        family='Monospace',
    ),
)

fig.show()