In [2]:
import numpy as np
import plotly as py 
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
# Enable inline plotly rendering so the pyo.iplot() calls in later cells display in the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN (needs network) — confirm.
init_notebook_mode(connected=True)

In [3]:
# Fix NumPy's global random seed so the random data drawn in later cells is repeatable.
np.random.seed(42)

In [4]:
# 100 random integers drawn from 1..100 (the upper bound 101 is exclusive).
random_x = np.random.randint(low=1, high=101, size=100)

In [5]:
# 100 random integers drawn from 1..100 (the upper bound 101 is exclusive).
random_y = np.random.randint(low=1, high=101, size=100)

## SCATTER PLOTS 

In [6]:
# Trace: one marker per (random_x, random_y) pair.
scatter_trace = go.Scatter(x=random_x, y=random_y, mode='markers')
data = [scatter_trace]

# Layout: title and axis labels. Axis settings may be passed either as a dict
# literal or via dict(); both spellings are shown on purpose.
# hovermode='closest' copes with several points sharing the same x position.
layout = go.Layout(
    title='Hello First Plot',
    xaxis=dict(title='My X Axis'),
    yaxis={'title': 'MY Y AXIS'},
    hovermode='closest',
)

# Combine trace(s) and layout into a figure and render it inline.
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig, filename='scatter.html')

In [7]:
# Same scatter as before, now with customised markers:
# larger green pentagons with a 2px outline.
marker_style = {
    'size': 12,
    'color': 'rgb(51,204,153)',
    'symbol': 'pentagon',
    'line': dict(width=2),
}
data = [go.Scatter(x=random_x, y=random_y, mode='markers', marker=marker_style)]

# Layout: title and axis labels. Axis settings may be passed either as a dict
# literal or via dict(); both spellings are shown on purpose.
# hovermode='closest' copes with several points sharing the same x position.
layout = go.Layout(
    title='Hello First Plot',
    xaxis=dict(title='My X Axis'),
    yaxis={'title': 'MY Y AXIS'},
    hovermode='closest',
)

# Combine trace(s) and layout into a figure and render it inline.
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig, filename='scatter.html')

## LINE CHARTS

In [8]:
# Line charts usually have an ordered x axis (time, for example).
# Here: 100 evenly spaced x values on [0, 1] and 100 standard-normal y values.
x_values = np.linspace(0, 1, num=100)
y_values = np.random.randn(100)

In [9]:
# Three traces over the same x values, shifted vertically by +5 / 0 / -5 so they
# do not overlap, each demonstrating a different `mode`.

# Markers only.
trace0 = go.Scatter(
    x=x_values,
    y=y_values + 5,
    mode='markers',
    name='markers',
)

# Lines only.
trace1 = go.Scatter(
    x=x_values,
    y=y_values,
    mode='lines',
    name='mylines',
)

# Lines and markers together.
trace2 = go.Scatter(
    x=x_values,
    y=y_values - 5,
    mode='lines+markers',
    name='Combined!',
)

# `data` is a list precisely so that several traces can share one figure.
data = [trace0, trace1, trace2]
layout = go.Layout(title='Line Charts')
fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

## Real World Data 

In [10]:
import pandas as pd 

In [11]:
# Load the population-estimates CSV (path is relative to the notebook's working directory).
df = pd.read_csv('nst-est2017-alldata.csv')

In [12]:
# Preview the first five rows.
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2015,RDOMESTICMIG2016,RDOMESTICMIG2017,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017
0,10.0,0,0,0.0,United States,308745538.0,308758105.0,309338421.0,311644280.0,313993272.0,...,0.0,0.0,0.0,2.7209,2.920371,2.883643,3.173228,3.516743,3.513394,3.423941
1,20.0,1,0,0.0,Northeast Region,55317240.0,55318350.0,55388349.0,55642659.0,55860261.0,...,-6.103092,-6.619089,-5.55957,1.46795,0.779137,0.605873,-0.082832,-0.903931,-1.307503,-0.28893
2,20.0,2,0,0.0,Midwest Region,66927001.0,66929794.0,66973360.0,67141501.0,67318295.0,...,-3.458531,-3.307295,-2.30464,-1.187519,-1.010696,-0.120354,-0.752477,-1.323952,-1.160735,-0.191323
3,20.0,3,0,0.0,South Region,114555744.0,114563024.0,114869241.0,116060993.0,117291728.0,...,3.788037,3.592695,2.900528,5.544289,5.831747,5.362083,6.31731,7.336162,7.113818,6.30401
4,20.0,4,0,0.0,West Region,71945553.0,71946937.0,72107471.0,72799127.0,73522988.0,...,1.61345,2.099001,1.475519,2.798796,3.521423,3.396627,4.163576,5.067452,5.488965,4.737979


In [13]:
# Keep only the rows whose DIVISION equals the string '1'.
# NOTE(review): the comparison is against a string, so DIVISION is presumably read as text — confirm the dtype.
df2 = df[df['DIVISION']=='1']

In [14]:
# Index the filtered rows by NAME so each row can later be looked up with df2.loc[name].
# Reassign instead of using inplace=True: df2 was produced by filtering df, and mutating
# such a derived frame in place is the pattern pandas advises against.
df2 = df2.set_index('NAME')

In [15]:
# Display the filtered frame, now indexed by NAME.
df2

Unnamed: 0_level_0,SUMLEV,REGION,DIVISION,STATE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,...,RDOMESTICMIG2015,RDOMESTICMIG2016,RDOMESTICMIG2017,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Connecticut,40.0,1,1,9.0,3574097.0,3574114.0,3580171.0,3591927.0,3597705.0,3602470.0,...,-8.376089,-8.106331,-6.206914,0.993851,-0.542726,-0.420268,-2.479362,-3.464252,-3.112421,-1.257548
Maine,40.0,1,1,23.0,1328361.0,1328362.0,1327568.0,1327968.0,1328101.0,1327975.0,...,-0.781424,1.807361,4.032798,0.685361,0.178459,0.392308,1.301528,0.389959,3.000731,5.216532
Massachusetts,40.0,1,1,25.0,6547629.0,6547808.0,6564943.0,6612178.0,6659627.0,6711138.0,...,-3.270088,-4.423353,-3.374712,4.364383,4.266338,5.21137,4.538697,3.276287,2.261905,3.24609
New Hampshire,40.0,1,1,33.0,1316470.0,1316460.0,1316700.0,1318345.0,1320923.0,1322622.0,...,-0.850002,1.333509,3.500622,-0.403029,0.632751,0.061281,3.793602,0.852258,3.037729,5.170643
Rhode Island,40.0,1,1,44.0,1052567.0,1052945.0,1053169.0,1052154.0,1052761.0,1052784.0,...,-4.21851,-4.093718,-3.640649,-2.170688,-0.991964,-1.139847,1.029624,0.368598,0.526146,0.891742
Vermont,40.0,1,1,50.0,625741.0,625741.0,625842.0,626210.0,625606.0,626044.0,...,-3.415672,-3.615938,-1.472321,-0.333852,-1.915617,0.131027,-1.313404,-1.932614,-2.115708,0.024058


In [16]:
# Pick out only the columns whose name starts with 'POP', using a vectorised
# string test on the column index; this avoids dealing with all 121 columns by hand.
is_pop_column = df2.columns.str.startswith('POP')
list_of_pop_columns = df2.columns[is_pop_column].tolist()

In [17]:
# Show the selected column names.
list_of_pop_columns

['POPESTIMATE2010',
 'POPESTIMATE2011',
 'POPESTIMATE2012',
 'POPESTIMATE2013',
 'POPESTIMATE2014',
 'POPESTIMATE2015',
 'POPESTIMATE2016',
 'POPESTIMATE2017']

In [18]:
# Narrow df2 down to the population-estimate columns only (all rows kept).
df2 = df2.loc[:, list_of_pop_columns]

In [19]:
# Inspect the row labels; these are iterated over below to build one trace per row.
df2.index

Index(['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
       'Rhode Island', 'Vermont'],
      dtype='object', name='NAME')

In [20]:
# Display the frame after column selection.
df2

Unnamed: 0_level_0,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Connecticut,3580171.0,3591927.0,3597705.0,3602470.0,3600188.0,3593862.0,3587685.0,3588184.0
Maine,1327568.0,1327968.0,1328101.0,1327975.0,1328903.0,1327787.0,1330232.0,1335907.0
Massachusetts,6564943.0,6612178.0,6659627.0,6711138.0,6757925.0,6794002.0,6823721.0,6859819.0
New Hampshire,1316700.0,1318345.0,1320923.0,1322622.0,1328684.0,1330134.0,1335015.0,1342795.0
Rhode Island,1053169.0,1052154.0,1052761.0,1052784.0,1054782.0,1055916.0,1057566.0,1059639.0
Vermont,625842.0,626210.0,625606.0,626044.0,625665.0,624455.0,623354.0,623657.0


In [21]:
# Build the Plotly traces automatically: one line per row label in df2.index,
# with the column names on the x axis and that row's values on the y axis.
data = []
for state in df2.index:
    state_trace = go.Scatter(
        x=df2.columns,
        y=df2.loc[state],
        mode='lines',
        name=state,
    )
    data.append(state_trace)

pyo.iplot(data)

## Example 2

In [22]:
# Load the Yuma, AZ temperature CSV (path is relative to the notebook's working directory).
# NOTE(review): a later cell reads '../Data/2010YumaAZ.csv' — confirm which location is intended.
df = pd.read_csv('2010YumaAZ.csv')

In [23]:
# Preview the first five rows.
df.head()

Unnamed: 0,LST_DATE,DAY,LST_TIME,T_HR_AVG
0,20100601,TUESDAY,0:00,25.2
1,20100601,TUESDAY,1:00,24.1
2,20100601,TUESDAY,2:00,24.4
3,20100601,TUESDAY,3:00,24.9
4,20100601,TUESDAY,4:00,22.8


In [24]:
# Day names in the order they should be plotted, starting from TUESDAY.
days = [
    'TUESDAY',
    'WEDNESDAY',
    'THURSDAY',
    'FRIDAY',
    'SATURDAY',
    'SUNDAY',
    'MONDAY',
]

In [25]:
# Sanity check: the T_HR_AVG values of the rows whose DAY is 'TUESDAY'.
df[df['DAY']=='TUESDAY']['T_HR_AVG']

0     25.2
1     24.1
2     24.4
3     24.9
4     22.8
5     19.8
6     18.8
7     21.2
8     24.2
9     27.1
10    29.3
11    30.6
12    32.6
13    34.0
14    34.0
15    34.9
16    34.6
17    33.8
18    33.5
19    32.8
20    31.0
21    29.0
22    27.6
23    26.3
Name: T_HR_AVG, dtype: float64

In [28]:
# Build one line trace per day listed in `days`.
data = []

for day in days:
    # Select this day's rows once and take BOTH axes from them. Previously x was the
    # whole LST_TIME column (every day in the file) while y held a single day's rows,
    # so the two sequences had different lengths.
    day_rows = df[df['DAY'] == day]
    trace = go.Scatter(
        x=day_rows['LST_TIME'],
        y=day_rows['T_HR_AVG'],
        mode='lines',
        name=day,
    )
    data.append(trace)

layout = go.Layout(title='Daily Temp Avgs')

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)


In [33]:
# Another way to do it, using a list comprehension of plain dict traces.
# groupby('DAY', sort=False) yields each day together with its own rows, in order of
# first appearance (the same order df['DAY'].unique() gives), so x and y are taken
# from the same rows. Previously x was the whole LST_TIME column while y was a single
# day's values, so their lengths did not match.
data = [
    {
        'x': day_rows['LST_TIME'],
        'y': day_rows['T_HR_AVG'],
        'name': day,
    }
    for day, day_rows in df.groupby('DAY', sort=False)
]

layout = go.Layout(title='Daily Temp Avgs')

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

## BAR CHARTS

In [34]:
# Load the 2018 Winter Olympics medal table.
df = pd.read_csv('../Data/2018WinterOlympics.csv')

In [35]:
# Preview the first five rows.
df.head()

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total
0,1,Norway,14,14,11,39
1,2,Germany,14,10,7,31
2,3,Canada,11,8,10,29
3,4,United States,9,8,6,23
4,5,Netherlands,8,6,6,20


In [36]:
# Basic bar chart: one bar per NOC, bar height = Total.
total_medals = go.Bar(x=df['NOC'], y=df['Total'])

data = [total_medals]
layout = go.Layout(title='Medals')
fig = go.Figure(layout=layout, data=data)
pyo.iplot(fig)

In [37]:
# Separate gold, silver and bronze into their own traces. With the default bar
# mode, the traces are drawn side by side for each NOC.
medal_colours = [
    ('Gold', '#FFD700'),
    ('Silver', '#9EA0A1'),
    ('Bronze', '#CD7F32'),
]
trace1, trace2, trace3 = [
    go.Bar(x=df['NOC'], y=df[medal], name=medal, marker={'color': colour})
    for medal, colour in medal_colours
]

data = [trace1, trace2, trace3]
layout = go.Layout(title='Medals')
fig = go.Figure(layout=layout, data=data)
pyo.iplot(fig)


In [38]:
# Same three medal traces, but stacked on top of each other: the only change is
# barmode='stack' in the layout.
countries = df['NOC']

trace1 = go.Bar(
    x=countries,
    y=df['Gold'],
    name='Gold',
    marker=dict(color='#FFD700'),
)
trace2 = go.Bar(
    x=countries,
    y=df['Silver'],
    name='Silver',
    marker=dict(color='#9EA0A1'),
)
trace3 = go.Bar(
    x=countries,
    y=df['Bronze'],
    name='Bronze',
    marker=dict(color='#CD7F32'),
)

data = [trace1, trace2, trace3]
layout = go.Layout(title='Medals', barmode='stack')
fig = go.Figure(layout=layout, data=data)
pyo.iplot(fig)

## Bar Chart Example 

In [39]:
# Load the mock survey results.
df = pd.read_csv('../Data/mocksurvey.csv')

In [40]:
# Inspect the column names (the first column has no header, so pandas names it 'Unnamed: 0').
df.columns

Index(['Unnamed: 0', 'Strongly Agree', 'Somewhat Agree', 'Neutral',
       'Somewhat Disagree', 'Strongly Disagree'],
      dtype='object')

In [41]:
# Use the unnamed first column as the row index so the remaining columns are
# exactly the answer categories. Reassign rather than mutate with inplace=True,
# which pandas guidance discourages and which does not chain.
df = df.set_index('Unnamed: 0')

In [42]:
# Preview the re-indexed frame.
df.head()

Unnamed: 0_level_0,Strongly Agree,Somewhat Agree,Neutral,Somewhat Disagree,Strongly Disagree
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Question 1,0.45,0.25,0.1,0.12,0.08
Question 2,0.12,0.07,0.48,0.18,0.15
Question 3,0.05,0.22,0.19,0.23,0.31


In [43]:
# Create one bar trace per answer column, with the row index on the x axis.
data = []
for answer in df.columns:
    data.append(go.Bar(x=df.index, y=df[answer], name=answer))

In [44]:
# Stack the answer bars on top of each other.
layout = go.Layout(title='Survey Results', barmode='stack')

In [45]:
# Vertical stacked bar plot of the survey answers.
fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

In [47]:
# Horizontal bar plot: swap x and y, then add orientation='h' to each trace.
data = [
    go.Bar(
        x=df[answer],
        y=df.index,
        name=answer,
        orientation='h',
    )
    for answer in df.columns
]
layout = go.Layout(title='Survey Results', barmode='stack')
fig = go.Figure(layout=layout, data=data)
pyo.iplot(fig)

## BUBBLE PLOTS 

In [48]:
# Load the mpg (car fuel economy) data set.
df = pd.read_csv('../Data/mpg.csv')

In [49]:
# Preview the first five rows.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [50]:
# Bubble chart: horsepower vs mpg, with marker size scaled by cylinder count.
# `text` sets what is shown when hovering over a marker; here, the car's name.
bubble_marker = {'size': 2 * df['cylinders']}
bubble_trace = go.Scatter(
    x=df['horsepower'],
    y=df['mpg'],
    text=df['name'],
    mode='markers',
    marker=bubble_marker,
)

data = [bubble_trace]
layout = go.Layout(
    title='Bubble Chart',
    xaxis=dict(title='horsepower'),
    yaxis={'title': 'MPG'},
)
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

In [65]:
# Bubble size now follows weight (scaled down by 200) and colour follows the
# cylinder count, with a colour bar shown next to the plot.
bubble_marker = {
    'size': df['weight'] / 200,
    'color': df['cylinders'],
    'colorscale': 'Jet',
    'showscale': True,
}
bubble_trace = go.Scatter(
    x=df['horsepower'],
    y=df['mpg'],
    text=df['name'],
    mode='markers',
    marker=bubble_marker,
)
data = [bubble_trace]

layout = go.Layout(
    title='Bubble Chart',
    xaxis=dict(title='horsepower'),
    yaxis={'title': 'MPG'},
    hovermode='x',
)

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

#### Bubble Chart Example 3

In [67]:
# Displacement vs acceleration; bubble size follows weight (scaled down by 400),
# and hovering shows the car's name.
bubble_trace = go.Scatter(
    x=df['displacement'],
    y=df['acceleration'],
    text=df['name'],
    mode='markers',
    marker={'size': df['weight'] / 400},
)
data = [bubble_trace]

layout = go.Layout(title='My Bubble Solution', hovermode='closest')

fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

## BOX PLOTS 

In [68]:
# A single series of values to summarise as a box plot.
y = [
    1, 14, 14, 15, 16, 18, 18, 19, 19, 20,
    20, 23, 24, 26, 27, 27, 28, 29, 33, 54,
]

box_trace = go.Box(y=y)
data = [box_trace]

pyo.iplot(data)


In [69]:
# Same series; boxpoints='all' draws every individual point next to the box.
y = [
    1, 14, 14, 15, 16, 18, 18, 19, 19, 20,
    20, 23, 24, 26, 27, 27, 28, 29, 33, 54,
]

box_trace = go.Box(y=y, boxpoints='all')
data = [box_trace]

pyo.iplot(data)

In [72]:
# Same series with all points shown, plus more formatting:
#   - jitter (0 to 1) spreads the points out left and right;
#   - pointpos offsets the points from the centre of the box: 0 is centred,
#     negative values move them left, positive values move them right.
#     Plotly accepts values from -2 to 2; 2 is used here.
y = [
    1, 14, 14, 15, 16, 18, 18, 19, 19, 20,
    20, 23, 24, 26, 27, 27, 28, 29, 33, 54,
]

box_trace = go.Box(
    y=y,
    boxpoints='all',
    jitter=0.2,
    pointpos=2,
)
data = [box_trace]

pyo.iplot(data)

In [73]:
# Same series; boxpoints='outliers' draws only the outlying points.
y = [
    1, 14, 14, 15, 16, 18, 18, 19, 19, 20,
    20, 23, 24, 26, 27, 27, 28, 29, 33, 54,
]

box_trace = go.Box(y=y, boxpoints='outliers')
data = [box_trace]

pyo.iplot(data)

#### Example 2

In [74]:
# Frequency of three-letter words in different publications by two authors;
# one box per author, drawn side by side for comparison.
snodgrass = [.209, .205, .196, .210, .202, .207, .224, .223, .220, .201]
twain = [.225, .262, .217, .240, .230, .229, .235, .217]

snodgrass_box = go.Box(y=snodgrass, name='Snodgrass')
twain_box = go.Box(y=twain, name='twain')
data = [snodgrass_box, twain_box]

pyo.iplot(data)

#### Example 3

In [76]:
# Load the abalone data set.
df = pd.read_csv('../Data/abalone.csv')

In [78]:
# (rows, columns) of the loaded frame.
df.shape

(4177, 9)

In [77]:
# Preview the first five rows.
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [79]:
# Draw two random samples of the 'rings' column, of different sizes (30 and 70).
# Each sample is drawn without replacement; the two draws are independent of each other.
rings = df['rings']
a = np.random.choice(rings, size=30, replace=False)
b = np.random.choice(rings, size=70, replace=False)

In [80]:
# One box per sample, side by side.
sample_boxes = [
    go.Box(y=a, name='A'),
    go.Box(y=b, name='B'),
]
data = sample_boxes

layout = go.Layout(title='2 random Abalone Samples')

fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

## HISTOGRAMS 

In [83]:
# Reload the mpg data and plot a histogram of the mpg column with default bins.
df = pd.read_csv('../Data/mpg.csv')

mpg_hist = go.Histogram(x=df['mpg'])
data = [mpg_hist]
layout = go.Layout(title='Histogram')
fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

In [84]:
# Same histogram with explicit bins: from 0 to 50 in steps of 10.
mpg_bins = {'start': 0, 'end': 50, 'size': 10}

data = [go.Histogram(x=df['mpg'], xbins=mpg_bins)]
layout = go.Layout(title='Histogram')
fig = go.Figure(layout=layout, data=data)

pyo.iplot(fig)

In [85]:
# Switch back to the abalone data set for the next histogram.
df = pd.read_csv('../Data/abalone.csv')

In [87]:
# Histogram of the length column, with bins from 0 to 1 in steps of 0.02.
length_bins = {'start': 0, 'end': 1, 'size': 0.02}

data = [go.Histogram(x=df['length'], xbins=length_bins)]
layout = go.Layout(title='abalone length')
fig = go.Figure(layout=layout, data=data)
pyo.iplot(fig)

## DISTPLOT

In [88]:
# Need to import more fancy plotly plotting package

import plotly.figure_factory as ff 


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [89]:
# 1000 draws from a standard normal distribution.
x = np.random.randn(1000)

In [91]:
# create_distplot takes a list of data sets and a matching list of labels,
# so the single sample is wrapped in a one-element list.
group_labels = ['distplot']
hist_data = [x]

fig = ff.create_distplot(hist_data, group_labels)

pyo.iplot(fig)

In [92]:
# Four normal samples of 200 points each, shifted to be centred near -2, 0, 2 and 4.
x1 = np.random.randn(200) - 2
x2 = np.random.randn(200)
x3 = np.random.randn(200) + 2
x4 = np.random.randn(200) + 4

hist_data = [x1, x2, x3, x4]
group_labels = ['x1', 'x2', 'x3', 'x4']

# One bin size per group, in the same order as hist_data.
bin_sizes = [.2, .1, .3, .4]
fig = ff.create_distplot(hist_data, group_labels, bin_size=bin_sizes)

pyo.iplot(fig)

In [97]:
# Distplot for the Snodgrass / Twain word-frequency lists defined in the
# box-plot section, using the same bin size (.003) for both groups.
hist_data = [snodgrass, twain]
group_labels = ['snodgrass words', 'twain words']
bin_sizes = [.003, .003]

fig = ff.create_distplot(hist_data, group_labels, bin_size=bin_sizes)
pyo.iplot(fig)

In [98]:
# Load the iris data set.
df = pd.read_csv('../Data/iris.csv')

In [100]:
# List the distinct class labels; these are used to split the data below.
df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [101]:
# Largest petal length in the data set.
df['petal_length'].max()

6.9

In [103]:
# Split petal_length by class: one series per class.
species = df['class']
trace0 = df.loc[species == 'Iris-setosa', 'petal_length']
trace1 = df.loc[species == 'Iris-versicolor', 'petal_length']
trace2 = df.loc[species == 'Iris-virginica', 'petal_length']

# Labels are given in the same order as the series.
hist_data = [trace0, trace1, trace2]
group_labels = ['Iris-setosa', 'versicolor', 'virginica']

fig = ff.create_distplot(hist_data, group_labels)
pyo.iplot(fig)

## HEATMAP

In [105]:
# Load the Santa Barbara, CA temperature data.
df = pd.read_csv('../Data/2010SantaBarbaraCA.csv')

In [106]:
# Preview the first five rows.
df.head()

Unnamed: 0,LST_DATE,DAY,LST_TIME,T_HR_AVG
0,20100601,TUESDAY,0:00,12.7
1,20100601,TUESDAY,1:00,12.7
2,20100601,TUESDAY,2:00,12.3
3,20100601,TUESDAY,3:00,12.5
4,20100601,TUESDAY,4:00,12.7


In [117]:
# Heatmap with DAY on the x axis, LST_TIME on the y axis, and T_HR_AVG as the
# colour value of each cell.
temperature_map = go.Heatmap(
    x=df['DAY'],
    y=df['LST_TIME'],
    z=df['T_HR_AVG'].values,
    colorscale='Jet',
)
data = [temperature_map]

layout = go.Layout(title='SB CA Temps')

fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

In [120]:
# Plot multiple heatmaps: load one temperature file per city
# (Sitka AK, Santa Barbara CA, Yuma AZ).

from plotly import tools

city_files = (
    '../Data/2010SitkaAK.csv',
    '../Data/2010SantaBarbaraCA.csv',
    '../Data/2010YumaAZ.csv',
)
df1, df2, df3 = [pd.read_csv(path) for path in city_files]


In [121]:
# One heatmap per city. All three share the same colour scale and the same fixed
# colour range (zmin=5, zmax=40), so a given colour means the same value in every panel.
shared_style = dict(colorscale='Jet', zmin=5, zmax=40)

trace1 = go.Heatmap(x=df1['DAY'], y=df1['LST_TIME'], z=df1['T_HR_AVG'].values, **shared_style)
trace2 = go.Heatmap(x=df2['DAY'], y=df2['LST_TIME'], z=df2['T_HR_AVG'].values, **shared_style)
trace3 = go.Heatmap(x=df3['DAY'], y=df3['LST_TIME'], z=df3['T_HR_AVG'].values, **shared_style)

In [124]:
# Lay the three heatmaps out in a single row of three subplots sharing one y axis.
fig = tools.make_subplots(
    rows=1,
    cols=3,
    subplot_titles=['Sitka', 'SB CA', 'Yuma AZ'],
    shared_yaxes=True,
)

# Place each trace in row 1, columns 1..3, in order.
for column, city_trace in enumerate((trace1, trace2, trace3), start=1):
    fig.append_trace(city_trace, 1, column)

fig['layout'].update(title='Temps for three cities')

pyo.iplot(fig)