In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
from os import listdir
from os.path import isfile, join
import plotly.graph_objects as go


us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))


def remove_chars(input_):
    '''
    removes unwanted characters from a line read from a file
    input parameters:
        input_: A list containing a line read from a file
    output:
        output_list: A list containing the line with unwanted characters removed
    '''
    output_ = input_.strip('\n').split(',')
    output_list = [re.sub(r'^"|"$', '', i) for i in output_]
    return output_list

def clean_colony_data(file_):
    '''
    Reads in the USDA honey colony data files and outputs 2 cleaned dataframes
    One pertaining to the colony count data per state, and the second dataframe containing
    the colony diseases per state.
    
    input parameters: 
        file_: string containing file path 
        
    returns:
        colony_df: Dataframe containing data of colony counts per state
        disease_df: Dataframe containing the colony disease counts per state
    '''
    #Remove unwanted characters in file lines
    f = [remove_chars(i) for i in open(file_)]
    
    #looking at the excel table we know that colony data has 10 columns
    #disease data has nine columns, and that these rowtypes are classified
    #as data rows with character 'd'. Thus we subset these specific
    #data rows by their lengths.
    
    colony_data = [i for i in f if len(i) == 10 for j in i if j == 'd']
    disease_data = [i for i in f if len(i) == 9 for j in i if j == 'd']
    
    #The data for each quarter starts with Alabama and ends with the United States total
    #By getting these indexes, we can separate all quarters in the file
    colony_start_indexes = [colony_data.index(colony_data[i]) for i in range(len(colony_data)) for j in colony_data[i] if j == 'Alabama']
    disease_start_indexes = [disease_data.index(disease_data[i]) for i in range(len(disease_data)) for j in disease_data[i] if j == 'Alabama']
    
    colony_end_indexes = [colony_data.index(colony_data[i]) for i in range(len(colony_data)) for j in colony_data[i] if j == 'Wyoming']
    disease_end_indexes = [disease_data.index(disease_data[i]) for i in range(len(disease_data)) for j in disease_data[i] if j == 'Wyoming']
    
    #subset the data by with the index values collected
    colony_subsets = [colony_data[colony_start_indexes[i]: colony_end_indexes[i]] for i in range(len(colony_start_indexes))]    
    quarters = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8']
    
    #append the quarter labels
    for i in range(len(colony_subsets)):
        for j in colony_subsets[i]:
            j.append(quarters[i])
    
    cleaned_colony_data = [j for i in colony_subsets for j in i]
    
    #Subset the disease data and append the quarter labels
    disease_subsets = [disease_data[disease_start_indexes[i]: disease_end_indexes[i]] for i in range(len(disease_start_indexes))]    
    for i in range(len(disease_subsets)):
        for j in disease_subsets[i]:
            j.append(quarters[i])
    
    cleaned_disease_data = [j for i in disease_subsets for j in i]
    
    #Convert the cleaned data into dataframes
    colony_df = pd.DataFrame(cleaned_colony_data)
    colony_df.columns = ["table_no", "row_type", "state", "initial_count", "max", "lost", "lost_perc", "added", "renovated", "renovated_perc", "quarter"]
    
    disease_df = pd.DataFrame(cleaned_disease_data)
    disease_df.columns = ["table_no", "row_type", "state", "varroa_mites", "other_pests", "diseases", "pesticides", "other", "unknown", "quarter"]
    
    #drop unwanted columns and replace non numeric chars
    colony_df.drop(columns=['table_no', 'row_type'], inplace = True)
    disease_df.drop(columns=['table_no', 'row_type'], inplace = True)

    colony_df.replace(['(X)', '-'], "", inplace = True)
    colony_df.replace(['(Z)'], "0", inplace = True)

    disease_df.replace(['(X)', '-'], '', inplace = True)
    disease_df.replace(['(Z)'], '0', inplace = True)

    #Set column data types
    categoricals = ['state', 'quarter']
    for (columnName, columnData) in colony_df.iteritems():
        if(columnName not in categoricals):
            colony_df[columnName] = pd.to_numeric(colony_df[columnName], errors = 'coerce', downcast = 'float')
        else:
            colony_df[columnName] = colony_df[columnName].astype(str)


    for (columnName, columnData) in disease_df.iteritems():
        if(columnName not in categoricals):
            disease_df[columnName] = pd.to_numeric(disease_df[columnName], errors = 'coerce', downcast = 'float')
        else:
            disease_df[columnName] = disease_df[columnName].astype(str)
    
    colony_df = colony_df[colony_df.state != ""]
    disease_df = disease_df[disease_df.state != ""]
    
    colony_df.state = [abbrev_us_state[i] if i in list(abbrev_us_state.keys()) else i for i in colony_df.state]
    colony_df['state_code'] = [us_state_abbrev[i] if i in list(us_state_abbrev.keys()) else i for i in colony_df.state]
    
    disease_df.state = [abbrev_us_state[i] if i in list(abbrev_us_state.keys()) else i for i in disease_df.state]
    disease_df['state_code'] = [us_state_abbrev[i] if i in list(us_state_abbrev.keys()) else i for i in disease_df.state]
    
    return (colony_df, disease_df)  






def clean_production_data(file_):
    '''
    Reads in the USDA honey colony data files and outputs 2 cleaned dataframes
    One pertaining to the colony count data per state, and the second dataframe containing
    the colony diseases per state.
    
    input parameters: 
        file_: string containing file path 
        
    returns:
        colony_df: Dataframe containing data of colony counts per state
        disease_df: Dataframe containing the colony disease counts per state
    '''
    #Remove unwanted characters in file lines
    f = [remove_chars(i) for i in open(file_)]
    
    #looking at the excel table we know that colony data has 10 columns
    #disease data has nine columns, and that these rowtypes are classified
    #as data rows with character 'd'. Thus we subset these specific
    #data rows by their lengths.
    
    prod_data = [i for i in f if len(i) == 9 for j in i if j == 'd']
    
    #The data for each quarter starts with Alabama and ends with the United States total
    #By getting these indexes, we can separate all quarters in the file
    prod_start_indexes = [prod_data.index(prod_data[i]) for i in range(len(prod_data)) for j in prod_data[i] if j == 'Alabama' or j == 'AL'] 
    prod_end_indexes = [prod_data.index(prod_data[i]) for i in range(len(prod_data)) for j in prod_data[i] if j == 'Wyoming' or j == 'WY']
   
    #subset the data by with the index values collected
    prod_subsets = [prod_data[prod_start_indexes[i]: prod_end_indexes[i]] for i in range(len(prod_start_indexes))]    
    quarters = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8']
    
    #append the quarter labels
    for i in range(len(prod_subsets)):
        for j in prod_subsets[i]:
            j.append(quarters[i])
    
    cleaned_prod_data = [j for i in prod_subsets for j in i]
    
    
    #Convert the cleaned data into dataframes
    prod_df = pd.DataFrame(cleaned_prod_data)
    prod_df.columns = ["table_no", "row_type", "state", "honey_colonies", "yield_per_col", "production", 'stocks', 'avg_price_per_lb', 'prod_value', 'quarter']
    
    #drop unwanted columns and replace non numeric chars
    prod_df.drop(columns=['table_no', 'row_type'], inplace = True)

    prod_df.replace(['(X)', '-'], "", inplace = True)
    prod_df.replace(['(Z)'], "0", inplace = True)

    #Set column data types
    categoricals = ['state', 'quarter']
    for (columnName, columnData) in prod_df.iteritems():
        if(columnName not in categoricals):
            prod_df[columnName] = pd.to_numeric(prod_df[columnName], errors = 'coerce', downcast = 'float')
        else:
            prod_df[columnName] = prod_df[columnName].astype(str)
            
    prod_df = prod_df[prod_df.state != ""]
    
    prod_df.state = [abbrev_us_state[i] if i in list(abbrev_us_state.keys()) else i for i in prod_df.state]
    
    prod_df['state_code'] = [us_state_abbrev[i] if i in list(us_state_abbrev.keys()) else i for i in prod_df.state]
    
    return prod_df  

def get_data():
    
    #Data paths
    colony_path = '.\\colony_data'
    production_path = '.\\production_data'
    colony_files = [f for f in listdir(colony_path) if isfile(join(colony_path, f))]
    production_files = [f for f in listdir(production_path) if isfile(join(production_path, f))]
    
    colony_files = [colony_path + '\\' + i for i in colony_files]
    production_files = [production_path + '\\' + i for i in production_files]
    
    #---------------Production Data-----------------------
    all_prod_data = [clean_production_data(i) for i in production_files]
    #Get 2018 data from last index Q2
    prod_2018 = all_prod_data[17]
    prod_2018 = prod_2018[prod_2018.quarter == 'Q2'].copy()
    prod_2018['year'] = 2018

    #Get 2000-2017 year data from Q1
    all_prod_data = [i[i.quarter == 'Q1'].copy() for i in all_prod_data]
    years = list(range(2000, 2018))

    for i in range(len(years)):
        all_prod_data[i].loc[:,'year'] = years[i]

    all_prod_data.append(prod_2018)
    honey_prod = pd.concat(all_prod_data)
    honey_prod.drop(columns=['quarter'], inplace = True)
    
    
    #----------------Colony Data--------------------------
    
    #clean all separated data
    all_col_data = [clean_colony_data(i) for i in colony_files]
    all_disease_data = [i[1] for i in all_col_data]
    all_col_data = [i[0] for i in all_col_data]
    
    #remove overlapped data
    all_disease_data = [i[i.quarter != 'Q5'] for i in all_disease_data]
    all_disease_data = [i[i.quarter != 'Q6'] for i in all_disease_data]
    
    all_col_data = [i[i.quarter != 'Q5'] for i in all_col_data]
    all_col_data = [i[i.quarter != 'Q6'] for i in all_col_data]
    
    #Include year data
    col_years = list(range(2015,2019))
    for i in range(len(all_disease_data)):
        all_disease_data[i].loc[:,'year'] = col_years[i] 
        all_col_data[i].loc[:,'year'] = col_years[i]
    
    #Combine dataframe lists 
    diseases_df = pd.concat(all_disease_data, ignore_index = True)
    col_df = pd.concat(all_col_data, ignore_index = True)
    
    colony_data = pd.concat([diseases_df, col_df], axis = 1)
    
    colony_data = colony_data.loc[:,~colony_data.columns.duplicated()]
    colony_data['period'] = colony_data["year"].map(str) + colony_data["quarter"]
    
    return (honey_prod, colony_data)




def generate_map_object2(input_, period_, category_):
    '''
    Returns a plotly chloropleth graph object

    input: 
        input_: A dataframe of honey_data containing
        relevant production data
        
        period_: A string value containing the year and quarter of 
                 the data to be displayed. Ex: '2015Q1'
        
        category_: The variable to be used for density on the map.
                   Can be any of the following: 
                   - varroa_mites 
                   - diseases 
                   - other 
                   - unknown 
                   - pesticides
                   - other_pests

    returns:
        fig: A chloropleth graph object

    ''' 
    
    df = input_[input_['period'] == period_]
    
    
    
    fig = go.Figure(data=go.Choropleth(
       
        locations=df.state_code,
        z=df[category_],
        zmin = 0,
        zmax = 70,
        locationmode='USA-states',
        colorscale='Reds',
        autocolorscale=False,
        #text=test_df['text'], # hover text
        marker_line_color='white', # line markers between states
        colorbar_title="population %",
        #coloraxis = dict(cmin = 0, cmax = 100),
     
    ))

    fig.update_layout(
        #color_axis=dict(color_axis = dict(colorbar=dict(len=)))
        height=900,
        width=1200,
        title_text=str(period_) + ' '+ category_ + ' Stressor<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=go.layout.geo.Projection(type = 'albers usa'),
            showlakes=True, # lakes
            lakecolor='rgb(255, 255, 255)'),
    )
    
    return fig

## Production Data

In [4]:
all_data = get_data()
honey_df = all_data[0]
honey_df.shape

(760, 9)

In [5]:
honey_df.head()

Unnamed: 0,state,honey_colonies,yield_per_col,production,stocks,avg_price_per_lb,prod_value,state_code,year
0,Alabama,16.0,78.0,1248.0,187.0,59.0,736.0,AL,2000
1,Arizona,40.0,59.0,2360.0,1322.0,73.0,1723.0,AZ,2000
2,Arkansas,55.0,93.0,5115.0,3529.0,57.0,2916.0,AR,2000
3,California,440.0,70.0,30800.0,11396.0,59.0,18172.0,CA,2000
4,Colorado,29.0,60.0,1740.0,957.0,62.0,1079.0,CO,2000


## Colony and Disease Data

In 2015 the USDA began collecting more in-depth data on honey colony counts and health stressors that may affect them on a quarterly basis. These reports contain counts that are recorded by the thousands and stressor data that is recorded percentage wise to the populations per state.

In [6]:
colony_data = all_data[1]
colony_data.shape

(704, 18)

In [7]:
colony_data.head()

Unnamed: 0,state,varroa_mites,other_pests,diseases,pesticides,other,unknown,quarter,state_code,year,initial_count,max,lost,lost_perc,added,renovated,renovated_perc,period
0,Alabama,10.0,5.4,0.0,2.2,9.1,9.4,Q1,AL,2015,7000.0,7000.0,1800.0,26.0,2800.0,250.0,4.0,2015Q1
1,Arizona,26.9,20.5,0.1,0.0,1.8,3.1,Q1,AZ,2015,35000.0,35000.0,4600.0,13.0,3400.0,2100.0,6.0,2015Q1
2,Arkansas,17.6,11.4,1.5,3.4,1.0,1.0,Q1,AR,2015,13000.0,14000.0,1500.0,11.0,1200.0,90.0,1.0,2015Q1
3,California,24.700001,7.2,3.0,7.5,6.5,2.8,Q1,CA,2015,1440000.0,1690000.0,255000.0,15.0,250000.0,124000.0,7.0,2015Q1
4,Colorado,14.6,0.9,1.8,0.6,2.6,5.9,Q1,CO,2015,3500.0,12500.0,1500.0,12.0,200.0,140.0,1.0,2015Q1


Of the stressors recorded in the data, they include varroa mites, other pests, diseases, pesticides, other, and unknown. The counts and population data are recorded, where renovated_perc signifies the percentage of colonies that were renovated and the lost_perc signifies the percentage of colonies lost.

In [8]:
ca_df = colony_data[colony_data.state_code =='CA']
ca_df.head(16)

Unnamed: 0,state,varroa_mites,other_pests,diseases,pesticides,other,unknown,quarter,state_code,year,initial_count,max,lost,lost_perc,added,renovated,renovated_perc,period
3,California,24.700001,7.2,3.0,7.5,6.5,2.8,Q1,CA,2015,1440000.0,1690000.0,255000.0,15.0,250000.0,124000.0,7.0,2015Q1
47,California,50.900002,21.5,7.8,23.0,14.2,3.1,Q2,CA,2015,1040000.0,1050000.0,104000.0,10.0,170000.0,285000.0,27.0,2015Q2
91,California,39.900002,20.4,12.5,19.6,15.0,4.7,Q3,CA,2015,730000.0,800000.0,76000.0,10.0,40000.0,93000.0,12.0,2015Q3
135,California,37.299999,12.7,6.2,11.7,10.5,7.3,Q4,CA,2015,750000.0,1260000.0,149000.0,12.0,39000.0,75000.0,6.0,2015Q4
179,California,38.099998,15.2,9.0,13.5,8.4,4.4,Q1,CA,2016,1130000.0,1410000.0,200000.0,14.0,210000.0,139000.0,10.0,2016Q1
223,California,49.700001,14.1,11.2,14.2,15.3,2.6,Q2,CA,2016,1110000.0,1150000.0,108000.0,9.0,240000.0,185000.0,16.0,2016Q2
267,California,42.200001,17.799999,8.5,18.799999,10.4,6.9,Q3,CA,2016,740000.0,820000.0,82000.0,10.0,33000.0,78000.0,10.0,2016Q3
311,California,41.900002,13.7,8.0,9.1,9.6,3.5,Q4,CA,2016,770000.0,1330000.0,205000.0,15.0,40000.0,35000.0,3.0,2016Q4
355,California,37.700001,12.6,7.5,9.2,6.6,6.3,Q1,CA,2017,1170000.0,1470000.0,195000.0,13.0,137000.0,93000.0,6.0,2017Q1
399,California,38.0,7.7,3.1,13.0,4.8,3.4,Q2,CA,2017,980000.0,990000.0,90000.0,9.0,160000.0,220000.0,22.0,2017Q2


In [9]:
ca_df.shape

(16, 18)

In [12]:
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=ca_df.period, y=ca_df.lost_perc,
                    mode='lines+markers',
                    name='colonies lost'))
fig.add_trace(go.Scatter(x=ca_df.period, y=ca_df.varroa_mites,
                    mode='lines+markers',
                    name='varroa mites'))
fig.add_trace(go.Scatter(x=ca_df.period, y=ca_df.pesticides,
                    mode='lines+markers', name='pesticides'))

fig.add_trace(go.Scatter(x=ca_df.period, y=ca_df.other_pests,
                    mode='lines+markers', name='other pests'))

fig.add_trace(go.Scatter(x=ca_df.period, y=ca_df.diseases,
                    mode='lines+markers', name='diseases'))

fig.update_layout(title='Health Stressors to Bee Colonies Jan 2015 - Dec 2018',
                   xaxis_title='Quarter',
                   yaxis_title='Percentage of Colonies Affected')


fig.show()

In [39]:
mymap = generate_map_object2(colony_data, '2015Q1', 'varroa_mites')

In [40]:
mymap.show()

In [41]:
fig = go.Figure()
annotations = []
for i in colony_data.state.unique():
    #print(colony_data[colony_data.state == i].shape)
    x_=list(colony_data[colony_data.state == i].period)
    y_=list(colony_data[colony_data.state == i].varroa_mites)
    
    for j in y_:
        if j > 60:
            color_ = 'red'
            line_size = 4
            mode_size = 12
        else:
            color_ = 'rgb(189,189,189)'
            line_size = 2
            mode_size = 8
            
    fig.add_trace(go.Scatter(x=x_, y=y_, mode='lines',
        name=i,
        line=dict(color=color_, width=line_size),
        connectgaps=True,
    ))

    # endpoints
    fig.add_trace(go.Scatter(
        x=[x_[0], x_[-1]],
        y=[y_[0], y_[-1]],
        name=i,
        mode='markers',
        marker=dict(color=color_, size=mode_size)
    ))
    
    # labeling the left_side of the plot
    if color_ == 'red':
        annotations.append(dict(xref='paper', x=0.05, y=y_[0],
                                      xanchor='right', yanchor='middle',
                                      text=i,
                                      font=dict(family='Arial',
                                                size=16),
                                      showarrow=False))

        # labeling the right_side of the plot
        annotations.append(dict(xref='paper', x=0.95, y=y_[-1],
                                      xanchor='left', yanchor='middle',
                                      text='{}%'.format(int(y_[-1])),
                                      font=dict(family='Arial',
                                                size=16),
                                      showarrow=False))
        
    
fig.update_layout(
    width = 800,
    height = 800,
    xaxis=dict(
        showline=True,
        showgrid=False,
        showticklabels=True,
        linecolor='rgb(204, 204, 204)',
        linewidth=2,
        ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='rgb(82, 82, 82)',
        ),
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
    ),
    autosize=False,
    margin=dict(
        autoexpand=False,
        l=100,
        r=20,
        t=110,
    ),
    showlegend=False,
    plot_bgcolor='white'
)

# Title
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Varroa Mites',
                              font=dict(family='Arial',
                                        size=30,
                                        color='rgb(37,37,37)'),
                              showarrow=False))
# Source
annotations.append(dict(xref='paper', yref='paper', x=0.5, y=-0.2,
                              xanchor='center', yanchor='top',
                              text='Source: PewResearch Center & ' +
                                   'Storytelling with data',
                              font=dict(family='Arial',
                                        size=12,
                                        color='rgb(150,150,150)'),
                              showarrow=False))

fig.update_layout(annotations=annotations)

fig.show()

In [14]:
colony_data[colony_data.state=="California"]["max"]

3      1690000.0
47     1050000.0
91      800000.0
135    1260000.0
179    1410000.0
223    1150000.0
267     820000.0
311    1330000.0
355    1470000.0
399     990000.0
443     670000.0
487    1200000.0
531    1540000.0
575    1200000.0
619     700000.0
663    1200000.0
Name: max, dtype: float32

In [230]:
def generate_line_plot(input_, col_names, state_):
    '''
    Returns a multiline graph object of stressors for a specfic US state.
    
    input parameters:
        input_: DataFrame containing data
        col_names: Names of lines to be traced
        state_: Name of US State the
    
    output
    '''
    fig = go.Figure()
    annotations = []
    colors = ['crimson', 'LightSkyBlue', "MediumPurple", "green", "orange", "yellowgreen"]
    color_ix = 0
    for i in col_names:
        
        x_=list(input_[input_.state == state_].period)
        y_=list(input_[input_.state == state_][i])

        line_size = 4
        mode_size = 12
        color_ = colors[color_ix]

        fig.add_trace(go.Scatter(x=x_, y=y_, mode='lines',
            name=i,
            line=dict(color=color_, width=line_size),
            connectgaps=True,
            showlegend=True
        ))

        # endpoints
        max_val = max(y_)
        max_ix = x_[np.argmax(np.array(y_))]
        fig.add_trace(go.Scatter(
            x=[max_ix],
            y=[max_val],
            name=i,
            mode='markers+text',
            marker=dict(color=color_, size=mode_size),
            showlegend = False,
            text = '{}%'.format(round(max_val,2)),
            textposition = 'middle right'
        ))
        
        color_ix = color_ix + 1
        
    
    fig.update_layout(
        width = 800,
        height = 800,
        xaxis=dict(
            showline=True,
            showgrid=False,
            showticklabels=True,
            linecolor='rgb(204, 204, 204)',
            linewidth=2,
            ticks='outside',
            tickfont=dict(
                family='Arial',
                size=12,
                color='rgb(82, 82, 82)',
            ),
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showline=False,
            showticklabels=False,
        ),
        autosize=False,
        margin=dict(
            autoexpand=False,
            l=100,
            r=20,
            t=110,
        ),
        showlegend=False,
        plot_bgcolor='white'
    )

    # Title
    annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                                  xanchor='left', yanchor='bottom',
                                  text='Bee Colony Stressors in the State of ' + state_,
                                  font=dict(family='Arial',
                                            size=30,
                                            color='rgb(37,37,37)'),
                                  showarrow=False))
    # Source
    annotations.append(dict(xref='paper', yref='paper', x=0.5, y=-0.1,
                                  xanchor='center', yanchor='top',
                                  text='Source: United States Department of Agriculture (USDA)',
                                  font=dict(family='Arial',
                                            size=12,
                                            color='rgb(150,150,150)'),
                                  showarrow=False))
    
    tick_text = ["2015", "", "", "", "2016", "", "", "", "2017", "", "", "", "2018", "", "", ""]
    tick_vals = ["2015Q1", "2015Q2", "2015Q3", "2015Q4",
                 "2016Q1", "2016Q2", "2016Q3", "2016Q4", 
                 "2017Q1", "2017Q2", "2017Q3", "2017Q4", 
                 "2018Q1", "2018Q2", "2018Q3", "2018Q4"]
    fig.update_xaxes(ticktext=tick_text, tickvals = tick_vals, tickangle=0, tickfont=dict(family='Rockwell'))
    fig.update_layout(annotations=annotations, showlegend=True, legend_orientation='h', legend=dict(x=0, y=1.03))
    
    return fig

In [231]:
stressors = ["varroa_mites", "other_pests", "diseases", "pesticides", "other", "unknown"]
wow = generate_line_plot(colony_data, stressors, "California")

'top left', 'top center', 'top right', 'middle left',
            'middle center', 'middle right', 'bottom left', 'bottom
            center', 'bottom right']

In [232]:
wow.show()

# Notes:

- Dynamic chloropleth map with dropdown menu (inputs include the 6 stressors) and slider (quarter periods)

- Individual state exploration reveals line plot with each stressors mapped over time

- plot of renovated versus lost

In [236]:
def get_state_dropdown():
    dict_list= []
    for i in us_state_abbrev.keys():
        dict_list.append({'label': i, 'value': i})
    return dict_list

In [237]:
d = get_state_dropdown()

## USDA Sources
- [colony_data](https://usda.library.cornell.edu/concern/publications/rn301137d?locale=en)
- [honey_production_data](https://usda.library.cornell.edu/concern/publications/hd76s004z?locale=en)
- [pesticide_data_program](https://apps.ams.usda.gov/pdp)

In [42]:
colony_data.head()

Unnamed: 0,state,varroa_mites,other_pests,diseases,pesticides,other,unknown,quarter,state_code,year,initial_count,max,lost,lost_perc,added,renovated,renovated_perc,period
0,Alabama,10.0,5.4,0.0,2.2,9.1,9.4,Q1,AL,2015,7000.0,7000.0,1800.0,26.0,2800.0,250.0,4.0,2015Q1
1,Arizona,26.9,20.5,0.1,0.0,1.8,3.1,Q1,AZ,2015,35000.0,35000.0,4600.0,13.0,3400.0,2100.0,6.0,2015Q1
2,Arkansas,17.6,11.4,1.5,3.4,1.0,1.0,Q1,AR,2015,13000.0,14000.0,1500.0,11.0,1200.0,90.0,1.0,2015Q1
3,California,24.700001,7.2,3.0,7.5,6.5,2.8,Q1,CA,2015,1440000.0,1690000.0,255000.0,15.0,250000.0,124000.0,7.0,2015Q1
4,Colorado,14.6,0.9,1.8,0.6,2.6,5.9,Q1,CO,2015,3500.0,12500.0,1500.0,12.0,200.0,140.0,1.0,2015Q1


2/ Includes American and European foulbrood, chalkbrood, stonebrood, paralysis (acute and chronic), kashmir, deformed wing, sacbrood, IAPV, Lake Sinai II, etc.

1/ Tracheal mites, nosema, hive beetle, wax moths, etc.

3/ Includes weather, starvation, insufficient forage, queen failure, hive damage/destroyed, etc.

In [242]:
stressor_keys = {'varroa_mites': "Varroa Mites",
                 'pesticides': "Pesticides",
                 'other': 'Other Categories (Weather, Starvation, etc.)',
                 'unknown': 'Unknown Causes',
                 'other_pests': 'Other Pests (Tracheal Mites, Hive Beetles, Wax Moths, etc.)',
                'diseases': 'Diseases (Foulbrood, Chalkbrood, Stonebrood, Paralysis)',
                'lost_perc': "Colonies Lost"}

In [243]:
stressor_keys['varroa_mites']

'Varroa Mites'

In [244]:
honey_df.head()

Unnamed: 0,state,honey_colonies,yield_per_col,production,stocks,avg_price_per_lb,prod_value,state_code,year
0,Alabama,16.0,78.0,1248.0,187.0,59.0,736.0,AL,2000
1,Arizona,40.0,59.0,2360.0,1322.0,73.0,1723.0,AZ,2000
2,Arkansas,55.0,93.0,5115.0,3529.0,57.0,2916.0,AR,2000
3,California,440.0,70.0,30800.0,11396.0,59.0,18172.0,CA,2000
4,Colorado,29.0,60.0,1740.0,957.0,62.0,1079.0,CO,2000


In [245]:
honey_df[honey_df.state_code == "CA"]

Unnamed: 0,state,honey_colonies,yield_per_col,production,stocks,avg_price_per_lb,prod_value,state_code,year
3,California,440.0,70.0,30800.0,11396.0,59.0,18172.0,CA,2000
3,California,425.0,65.0,27625.0,7735.0,69.0,19061.0,CA,2001
3,California,470.0,50.0,23500.0,3525.0,132.0,31020.0,CA,2002
3,California,480.0,67.0,32160.0,6432.0,139.0,44702.0,CA,2003
3,California,390.0,45.0,17550.0,5792.0,101.0,17726.0,CA,2004
3,California,400.0,75.0,30000.0,9300.0,84.0,25200.0,CA,2005
3,California,380.0,52.0,19760.0,7706.0,99.0,19562.0,CA,2006
3,California,340.0,40.0,13600.0,3672.0,104.0,14144.0,CA,2007
3,California,360.0,51.0,18360.0,4039.0,139.0,25520.0,CA,2008
3,California,355.0,33.0,11715.0,2109.0,139.0,16284.0,CA,2009


In [252]:
honey_df[honey_df.year==2010].sort_values(by='honey_colonies', ascending = False).head(10)

Unnamed: 0,state,honey_colonies,yield_per_col,production,stocks,avg_price_per_lb,prod_value,state_code,year
28,North Dakota,510.0,91.0,46410.0,12995.0,150.0,69615.0,ND,2010
3,California,410.0,67.0,27470.0,6318.0,155.0,42579.0,CA,2010
33,South Dakota,265.0,58.0,15370.0,4765.0,151.0,23209.0,SD,2010
5,Florida,200.0,69.0,13800.0,1794.0,156.0,21528.0,FL,2010
22,Montana,157.0,74.0,11618.0,2905.0,158.0,18356.0,MT,2010
18,Minnesota,128.0,66.0,8448.0,1774.0,155.0,13094.0,MN,2010
35,Texas,100.0,72.0,7200.0,792.0,151.0,10872.0,TX,2010
8,Idaho,97.0,27.0,2619.0,1179.0,161.0,4217.0,ID,2010
17,Michigan,71.0,58.0,4118.0,1524.0,167.0,6877.0,MI,2010
39,Washington,71.0,37.0,2627.0,1077.0,157.0,4124.0,WA,2010


In [325]:
def generate_bubble_chart(input_, year_, n):
    fig = go.Figure()
    w_ = input_[input_.year==year_].sort_values(by='honey_colonies', ascending = False).head(n).state
    x_=input_[input_.year==year_].sort_values(by='honey_colonies', ascending = False).head(n).avg_price_per_lb
    y_=input_[input_.year==year_].sort_values(by='honey_colonies', ascending = False).head(n).yield_per_col
    z_ = input_[input_.year==year_].sort_values(by='honey_colonies', ascending = False).head(n).honey_colonies
    
    annotations = []
    for q,i,j,k in zip(w_,x_,y_,z_):
        fig.add_trace(go.Scatter(
            x= [i],
            y= [j],
            name = str(q),
            mode='markers',
            marker=dict(
                opacity=0.6,
                size=[int(k)/5],
            ),
            showlegend = True,
            text = q + '<br>' + 'No. of Colonies: {}'.format(k) + 'k',
            textposition = 'top center'
        ))
        
     # Title
    annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                                  xanchor='left', yanchor='bottom',
                                  text='Top ' + str(n) + " Honey Producing States In " + str(year_),
                                  font=dict(family='Arial',
                                            size=30,
                                            color='rgb(37,37,37)'),
                                  showarrow=False))
    # Source
    annotations.append(dict(xref='paper', yref='paper', x=0.5, y=-0.18,
                                  xanchor='center', yanchor='top',
                                  text='Source: United States Department of Agriculture (USDA)',
                                  font=dict(family='Arial',
                                            size=12,
                                            color='rgb(150,150,150)'),
                                  showarrow=False))
    
    fig.update_layout(annotations=annotations,
                     xaxis_title = "Avg. Price Per Pound ($US)",
                     yaxis_title = "Yield Per Colony (lbs.)",
                     plot_bgcolor = 'white')
    
    return fig

In [326]:
woo = generate_bubble_chart(honey_df, 2010, 15)

In [None]:
woo.show()