<a href="https://colab.research.google.com/github/difuse-dartmouth/BIOSAT_W23/blob/main/BioSat_DIFUSE_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages

In [None]:
#import relevant packages
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# statistics packages
from sklearn import preprocessing
from sklearn import metrics
from matplotlib.pyplot import figure
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from google.colab import widgets as g_widgets
from google.colab import output
from matplotlib import pylab
from mpl_toolkits.mplot3d import Axes3D
import ipywidgets as widgets 
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# mapping packages
!pip install geopandas
import geopandas as gpd
import shapely.geometry
import plotly.express as px
import plotly.graph_objs as go
from shapely.ops import transform
! pip install pyproj
import pyproj


from google.colab import data_table
# !pip install itables
# from itables import init_notebook_mode
from IPython.display import HTML, Markdown, display
import warnings 
warnings.filterwarnings('ignore')


# Load the county-level data set from the Git repo into `df`.
# NOTE(review): the URL carries a `?token=...` query parameter; this looks like a
# temporary access token for raw.githubusercontent.com and may stop working -- confirm
# whether it is still required.
url = 'https://raw.githubusercontent.com/difuse-dartmouth/BIOSAT_W23/main/completed_module/data/Bio_Sat_Module_2015_Data_DIFUSE_23W.csv?token=GHSAT0AAAAAAB5O72MYTV6R3TRYYZDB5C76ZAODMUA'
df = pd.read_csv(url, index_col=False)

# `df2` is `df` without the two text columns ('State', 'County') so that the
# plotting / scaling / PCA code below only ever sees the remaining columns.
df2 = df.drop(columns=['State','County'])

def printmd(string):
    """Render `string` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)


# =======
# Create data to use for borders: MASSACHUSETTS
# =======
# Produces three objects that draw_map() reads for this state:
#   ma_data      - county geometry joined with the data columns of `df`
#   MA_parishes  - county name -> geometry, used as the choropleth GeoJSON
#   scaled_ma    - the data columns rescaled to the range 2..50 (bubble sizes)

# County boundary file for the state
url15 = "https://raw.githubusercontent.com/difuse-dartmouth/BIOSAT_W23/main/completed_module/data/JSON%20Files/ms_json.json?token=GHSAT0AAAAAAB5O72MYLXSJY664PBSMLO4CZAODOAQ"
ma_counties = gpd.read_file(url15)

# Filter to MA counties and renumber the rows 0..n-1
df_ma = df.loc[df['State'] == "Massachusetts"]
df_ma = df_ma.reset_index(drop=True)

# Sort json data by county (alphabetical) and renumber the rows 0..n-1
ma_counties = ma_counties.sort_values("name10")
ma_counties = ma_counties.reset_index(drop=True)

# Join county data to geographical data.
# The join is on the row index, i.e. purely positional after the two resets above.
# NOTE(review): this assumes the MA rows of `df` are already in the same
# alphabetical county order as the boundary file -- verify against the CSV.
ma = ma_counties.join(df_ma)

# Drop unnecessary columns
ma_data = ma.drop(columns=["id","statefp10","countyns10","geoid10","namelsad10","lsad10","classfp10",
                           "mtfcc10","csafp10","cbsafp10","metdivfp10","funcstat10","aland10","awater10","intptlat10","intptlon10"])

ma_data.sort_values(by=['countyfp10'], inplace=True)

# Define centroids (used as the bubble positions on the map)
ma_data["center"] = ma_data.geometry.centroid
MA_parishes = ma_data[['name10','geometry']].set_index("name10")
map_variables_ma = list(ma_data.columns)

# Remove irrelevant fields for analysis; what is left in map_variables_ma are the
# columns that get scaled below
removed_cols = ['countyfp10','name10','geometry', 'County','State', 'center']
for item in removed_cols:
  map_variables_ma.remove(item)

# Scale data to use for bubble layer.
# scaled_ma gets a fresh default index; its rows are in the same order as ma_data.
data = ma_data[[m for m in map_variables_ma]] 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(2,50))
data_scaled = min_max_scaler.fit_transform(data)
scaled_ma = pd.DataFrame(data_scaled, columns=list(map_variables_ma))

# =======
# Create data to use for borders: MARYLAND
# =======
# Produces three objects that draw_map() reads for this state:
#   md_data      - county geometry joined with the data columns of `df`
#   MD_parishes  - county name -> geometry, used as the choropleth GeoJSON
#   scaled_md    - the data columns rescaled to the range 2..50 (bubble sizes)

# County boundary file for the state
url3 = 'https://raw.githubusercontent.com/difuse-dartmouth/BIOSAT_W23/main/completed_module/data/JSON%20Files/md_json.json?token=GHSAT0AAAAAAB5O72MYOUVGRKKSHTVEQXXWZAODNSQ'
md_counties = gpd.read_file(url3)

# Transform file to lon/lat: first override the declared CRS with EPSG:3857,
# then reproject to EPSG:4326
md_counties = md_counties.set_crs(crs = 3857, allow_override = True) # set it to the crs that it is (labeled wrong)
md_counties = md_counties.to_crs(4326)

# Filter to MD counties and renumber the rows 0..n-1
df_md = df.loc[df['State'] == "Maryland"]
df_md = df_md.reset_index(drop=True)

# Sort json data by county (alphabetical) and renumber the rows 0..n-1
md_counties = md_counties.sort_values("county")
md_counties = md_counties.loc[md_counties['county'] != "Baltimore City"] # remove baltimore city, baltimore county data from DIFUSE set contains baltimore city data
md_counties = md_counties.reset_index(drop=True)

# Join county data to geographical data.
# The join is on the row index, i.e. purely positional after the two resets above.
# NOTE(review): this assumes the MD rows of `df` are already in the same
# alphabetical county order as the boundary file -- verify against the CSV.
md = md_counties.join(df_md)

# Drop unnecessary columns
md_data = md.drop(columns=["OBJECTID","district","county_fip","creation_d","last_updat","shape_Leng","shape_Area"])
md_data.sort_values(by=['countynum'], inplace=True)

# Define centroids (used as the bubble positions on the map)
md_data["center"] = md_data.geometry.centroid
MD_parishes = md_data[['county','geometry']].set_index("county")
map_variables_md = list(md_data.columns)

# Remove irrelevant fields for analysis; what is left in map_variables_md are the
# columns that get scaled below
removed_cols = ['countynum','county','geometry', 'County','State', 'center']
for item in removed_cols:
  map_variables_md.remove(item)

# Scale data to use for bubble layer.
# scaled_md gets a fresh default index; its rows are in the same order as md_data.
data = md_data[[m for m in map_variables_md]] 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(2,50))
data_scaled = min_max_scaler.fit_transform(data)
scaled_md = pd.DataFrame(data_scaled, columns=list(map_variables_md))

# =======
# Create data to use for borders: CONNECTICUT
# =======
# Produces three objects that draw_map() reads for this state:
#   ct_data      - county geometry joined with the data columns of `df`
#   CT_parishes  - county name -> geometry, used as the choropleth GeoJSON
#   scaled_ct    - the data columns rescaled to the range 2..50 (bubble sizes)

# Read in spatial data for map from Git repo into geopandas dataframe
url2 = 'https://raw.githubusercontent.com/difuse-dartmouth/BIOSAT_W23/main/completed_module/data/JSON%20Files/ct_json.json?token=GHSAT0AAAAAAB5O72MYW66G4HIJ2JRE6BMCZAODNJA'
ct_counties = gpd.read_file(url2)
# NOTE(review): this ordering is replaced by the sort on NAME10 a few lines below
ct_counties.sort_values(by=['GEOID10'], inplace=True) #only one with county names

# Filter to CT counties and renumber the rows 0..n-1
df_ct = df.loc[df['State'] == "Connecticut"]
df_ct = df_ct.reset_index(drop=True)

# Sort json data by county (alphabetical) and renumber the rows 0..n-1
ct_counties = ct_counties.sort_values("NAME10")
ct_counties = ct_counties.reset_index(drop=True)

# Join county data to geographical data.
# The join is on the row index, i.e. purely positional after the two resets above.
# NOTE(review): this assumes the CT rows of `df` are already in the same
# alphabetical county order as the boundary file -- verify against the CSV.
ct = ct_counties.join(df_ct)

# Drop unnecessary columns
ct_data = ct.drop(columns=["STATEFP10","COUNTYFP10","COUNTYNS10","GEOID10","NAMELSAD10","LSAD10","CLASSFP10","MTFCC10","CSAFP10",
                           "METDIVFP10","FUNCSTAT10","ALAND10","AWATER10","geo_id_ff2","GeoID_AFF1"])

ct_data.sort_values(by=['CBSAFP10'], inplace=True)

# Define centroids (used as the bubble positions on the map)
ct_data["center"] = ct_data.geometry.centroid
CT_parishes = ct_data[['NAME10','geometry']].set_index("NAME10")
map_variables_ct = list(ct_data.columns)


# Remove irrelevant fields for analysis; what is left in map_variables_ct are the
# columns that get scaled below
removed_cols = ['CBSAFP10','NAME10','INTPTLAT10','INTPTLON10','geometry', 'County','State', 'center']
for item in removed_cols:
  map_variables_ct.remove(item)


# Scale data to use for bubble layer.
# scaled_ct gets a fresh default index; its rows are in the same order as ct_data.
data = ct_data[[m for m in map_variables_ct]] 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(2,50))
data_scaled = min_max_scaler.fit_transform(data)
scaled_ct = pd.DataFrame(data_scaled, columns=list(map_variables_ct))

# =======
# Create data to use for borders: NEW HAMPSHIRE
# =======
# Produces three objects that draw_map() reads for this state:
#   nh_data      - county geometry joined with the data columns of `df`
#   NH_parishes  - county name -> geometry, used as the choropleth GeoJSON
#   scaled_nh    - the data columns rescaled to the range 2..50 (bubble sizes)

# County boundary file for the state
url4 = 'https://raw.githubusercontent.com/difuse-dartmouth/BIOSAT_W23/main/completed_module/data/JSON%20Files/nh_json.json?token=GHSAT0AAAAAAB5O72MYAV63UWAVKZLDRQVYZAODOKQ'
nh_counties = gpd.read_file(url4)

# Filter to NH counties and renumber the rows 0..n-1
df_nh = df.loc[df['State'] == "New Hampshire"]
df_nh = df_nh.reset_index(drop=True)

# Sort json data by county (alphabetical) and renumber the rows 0..n-1
nh_counties = nh_counties.sort_values("county_nam")
nh_counties = nh_counties.reset_index(drop=True)

# Join county data to geographical data.
# The join is on the row index, i.e. purely positional after the two resets above.
# NOTE(review): this assumes the NH rows of `df` are already in the same
# alphabetical county order as the boundary file -- verify against the CSV.
nh = nh_counties.join(df_nh)

# Drop unnecessary columns
nh_data = nh.drop(columns=['permanent_', 'source_fea', 'source_dat', 'source_d_1', 'source_ori', 'loaddate'])
nh_data.sort_values(by=['stco_fipsc'], inplace=True)

# Define centroids (used as the bubble positions on the map)
nh_data["center"] = nh_data.geometry.centroid
NH_parishes = nh_data[['county_nam','geometry']].set_index("county_nam")
map_variables_nh = list(nh_data.columns)

# Remove irrelevant fields for analysis; what is left in map_variables_nh are the
# columns that get scaled below
removed_cols = ['fcode','state_fips','state_name','county_fip','county_nam','stco_fipsc','population','areasqkm',
                'gnis_id','gnis_name','shape_Leng','shape_Area','ObjectID','geometry','County', 'State', 'center']
for item in removed_cols:
  map_variables_nh.remove(item)

# Scale data to use for bubble layer.
# scaled_nh gets a fresh default index; its rows are in the same order as nh_data.
data = nh_data[[m for m in map_variables_nh]] 
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(2,50))
data_scaled = min_max_scaler.fit_transform(data)
scaled_nh = pd.DataFrame(data_scaled, columns=list(map_variables_nh))

# =======
# FOR ALL PLOTS
# =======

# Variable names offered in the map tab.
# variable_list2 feeds the "Lyme/Deer" dropdown, whose selection sizes the bubbles;
# variable_list1 feeds the "Variable" dropdown, whose selection shades the
# counties (choropleth).
variable_list2 = ['Lyme Disease Cases per 100,000 Residents',
                  'White-tailed Deer Population Estimate']
variable_list1 = ['Percent Open Water', 'Percent Developed Open Space', 'Percent Developed Low Intensity','Percent Developed High Intensity', 'Percent Developed Medium Intensity', 'Percent Barren Land', 'Percent Deciduous Forest',
                  'Percent Evergreen Forest','Percent Mixed Forest', 'Percent Shrub/Scrub', 'Percent Grassland/Herbaceous', 'Percent Pasture/Hay','Percent Cultivated Crops',
                  'Percent Woody Wetlands', 'Percent Emergent Herbaceous Wetlands', 'Precipitation (in)', 'Average Temperature (F)']

# Color bar options shown to the user; draw_map() translates these to Plotly
# colorscale names
color_scales = ["BlueYellow", "PurpleOrange", "Purples", "Reds", "RedBlue", "BlueRed"]

# Store column names of the dataset (with and without the State/County text columns)
feature_names = list(df)
feature_names2 = list(df2)

# Store the distinct states & counties
states= df["State"].unique()
county= df["County"].unique()

# Re-cast states and counties as shorter variables for point labels:
# s_abrev holds the two-letter abbreviation of each row's state
s_abrev = df["State"]
abrev = {'Maryland':'MD', 'Massachusetts':'MA', 'New Hampshire':'NH', 'Connecticut':'CT'}
s_abrev = s_abrev.replace(abrev)

# c_short starts out as the County column itself (not a copy); it is shortened below
c_short = df["County"]

def shorten(inputstring):
  """Return a county name without its trailing " County" suffix.

  Names that do not end in " County" (compared case-insensitively) are returned
  unchanged, so the function never chops unrelated characters off a name and
  is safe to apply more than once to the same value.

  Parameters:
    inputstring: the county name, e.g. "Grafton County".

  Returns:
    The name with the 7-character suffix removed, e.g. "Grafton".
  """
  suffix = " county"
  if inputstring.lower().endswith(suffix):
    return inputstring[:-len(suffix)]
  return inputstring

# Replace every entry of c_short with its shortened form, one label at a time.
# NOTE(review): c_short was taken straight from df["County"], so depending on the
# pandas version / copy-on-write setting this assignment may also rewrite the
# County column of `df` itself -- confirm whether that is intended.
for i in range(len(c_short)):
  c_short[i] = shorten(c_short[i])


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.12.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyproj>=2.6.1.post1
  Downloading pyproj-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch>=2.3.2
  Downloading munch-2.5.0-p



# Interactive Panel for Student Activities

In [None]:
#@title Main Tabs

# =======
# CREATE DROPDOWNS
# =======
#function for creating dropdowns using widget functionality in Colab
def create_dropdown(choices, first, des):
  """Build an enabled ipywidgets Dropdown.

  Parameters:
    choices: sequence of options to offer.
    first: index into `choices` of the option that is selected initially.
    des: label text; a trailing colon is appended.

  Returns:
    The configured widgets.Dropdown.
  """
  initial_value = choices[first]
  label = des + ":"
  dropdown = widgets.Dropdown(options=choices,
                              value=initial_value,
                              description=label,
                              disabled=False)
  return dropdown

# Create dropdowns for tabs. The second argument is the index of the option that
# is selected when the widget first appears.
# K-means tab: X / Y variable and number of clusters
widget1 = create_dropdown(feature_names2, 0, "X Variable")
widget2 = create_dropdown(feature_names2, 1, "Y Variable")
widget3 = create_dropdown([2,3,4,5,6], 0, "K equals")
# PCA tab: which principal component to show in the component plot
widget4 = create_dropdown([1,2],0,"PC number")
# Map tab: color scale, choropleth variable, bubble variable
widget5 = create_dropdown(color_scales, 2, "Colorbar")
widget6 = create_dropdown(variable_list1, 1, "Variable")
widget7 = create_dropdown(variable_list2, 0, "Lyme/Deer")
# Table tab: column to sort by
widget_s = create_dropdown(feature_names, 0, "Sort By")
# Map tab: state to display (index 1, "New Hampshire", is selected initially)
widget9 = create_dropdown(["Connecticut", "New Hampshire", "Maryland", "Massachusetts"], 1, "State")
# -------


# =======
# MAP FUNCTION
# =======
#Draw the interactive map with the counties (For now, let's try choropleth)
def draw_map(drop6, drop7, drop8, drop9):
  """Draw the interactive county map of one state and show it.

  The map has two layers: a choropleth that shades every county by one
  variable, and a bubble at each county centroid sized by a second variable.

  Parameters:
    drop6: column name whose scaled values set the bubble sizes.
    drop7: column name whose values shade the counties (choropleth).
    drop8: user-facing color scale name; looked up in `color_dict`
      (an unknown name yields None, i.e. no explicit colorscale is passed).
    drop9: state to display. "Connecticut", "Maryland" and "Massachusetts"
      are matched explicitly; any other value shows New Hampshire.

  Returns:
    None. The figure is displayed with fig.show().
  """
  fig = go.Figure()

  # Translate the user-facing color bar names into Plotly colorscale names
  color_dict = {'BlueYellow':'Viridis', 'PurpleOrange':'Inferno',
                'Purples':'Purples', 'Reds':'Reds', 'RedBlue':'Rdbu',
                'BlueRed':'IceFire'}

  # STATE SELECTION
  # each map needs its joined county data, geojson source, county-name column,
  # initial map position and zoom, and scaled data
  if drop9 == "Connecticut":
    state_data = ct_data
    parishes = CT_parishes
    name_col = "NAME10"
    map_lat = 41.5631
    map_lon = -72.4388
    scaled = scaled_ct
    start_zoom = 7.5

  elif drop9 == "Maryland":
    state_data = md_data
    parishes = MD_parishes
    name_col = "county"
    map_lat = 39.0805
    map_lon = -77.2193
    scaled = scaled_md
    start_zoom = 6.75

  elif drop9 == "Massachusetts":
    state_data = ma_data
    parishes = MA_parishes
    name_col = "name10"
    map_lat = 42.2153
    map_lon = -71.7508
    scaled = scaled_ma
    start_zoom = 6.75

  else:
    state_data = nh_data
    parishes = NH_parishes
    name_col = "county_nam"
    map_lat = 43.6939
    map_lon = -71.5724
    scaled = scaled_nh
    start_zoom = 6.75

  geoj_temp = parishes.__geo_interface__
  loc_temp = state_data[name_col]
  bubble_lat = state_data.center.y
  bubble_lon = state_data.center.x

  # Draw Counties.
  # z must come from the selected state's own rows: Plotly pairs locations[i]
  # with z[i], so taking the column from the full multi-state `df` would shade
  # the counties with values that belong to other rows.
  fig.add_trace(go.Choroplethmapbox(geojson=geoj_temp,
                                    locations=loc_temp,
                                    z=state_data[drop7],
                                    colorbar={'title': drop7},
                                    colorscale=color_dict.get(drop8)))

  # Add bubble layer: one marker per county centroid, sized by the scaled variable
  fig.add_trace(go.Scattermapbox(lat=bubble_lat,
                                 lon=bubble_lon, mode='markers',
                                 marker=go.scattermapbox.Marker(size=scaled[drop6],
                                                                color='Tomato'),
                                 name=drop6
                ))

  fig.update_layout(mapbox_style="carto-positron",
                    legend=dict(yanchor="top",y=0.99,xanchor="left",x=0.01),
                    showlegend=True,
                    mapbox_zoom=start_zoom,
                    mapbox_center={"lat": map_lat, "lon": map_lon},
                    width=1000, height=750, margin={"r":0,"t":0,"l":0,"b":0}
                    )
  fig.show()
# -------

# Turn off Colab's data_table formatter for DataFrames
data_table.disable_dataframe_formatter()

# Multi-select box for the Table View tab (passed to show_df as `columns`).
# It offers every non-text column and starts with 'Percent Open Water' selected.
select_columns = widgets.SelectMultiple(
    options=feature_names2,
    value=['Percent Open Water'],
    #rows=10,
    description='Columns:',
    layout= {'width':'400px', 'height':'200px'}
)

# =======
# TABLE FUNCTION
# =======
def show_df(columns, drop_s):
  """Display County, State and the selected columns of `df` as a table.

  Parameters:
    columns: iterable of column names to show after County and State.
    drop_s: column name to sort by (ascending). The table is only sorted when
      this column is actually shown, i.e. it is one of `columns` or is
      "State" / "County".
  """
  chosen = list(columns)

  # County and State always lead, followed by the picks in selection order
  pieces = [df[["County", "State"]]] + [df[name] for name in chosen]
  table = pd.concat(pieces, axis=1)

  # Sort only by a column that is part of the displayed table
  sortable = drop_s in chosen or drop_s in ("State", "County")
  if sortable:
    table = table.sort_values(by=[drop_s])

  display(table)
# -------    


# =======
# K MEANS FUNCTION
# =======
def plot_data_kmeans(drop1,drop2,drop3):
  """Scatter two variables against each other, grouped by k-means cluster.

  Rows with a missing value in either variable are left out. The remaining
  rows are standardized, clustered with k-means, plotted one series per
  cluster, and each point is labeled "<county>, <state abbreviation>".

  Parameters:
    drop1: column of `df2` for the x axis.
    drop2: column of `df2` for the y axis; must differ from `drop1`.
    drop3: number of clusters (k).

  Returns:
    None. Prints a message and returns early when drop1 == drop2;
    otherwise shows the figure.
  """
  if drop1 == drop2:
    print("X Variable cannot equal Y Variable") #error message for plotting something vs. itself
    return

  f, ax = plt.subplots(figsize = [12,10])
  plt.xlabel(drop1, fontsize = 20)
  plt.ylabel(drop2, fontsize = 20)

  # Work on an explicit copy without NA entries so that adding the cluster
  # column never writes into (a slice of) df2
  df3 = df2[[drop1, drop2]].dropna(axis='rows').copy()

  # run k-means on the standardized values
  df3_std = StandardScaler().fit_transform(df3)
  kmeans_df3 = KMeans(init="random", n_clusters = drop3, n_init=10, random_state=1)
  kmeans_df3.fit(df3_std)
  df3['cluster'] = kmeans_df3.labels_

  # plot command: one series per cluster
  for name, group in df3.groupby('cluster'):
    ax.plot(group[drop1], group[drop2], marker='o', linestyle='', markersize =14, label=name)

  # point labels: pick the county/state of exactly the rows that survived
  # dropna (matched by index), so labels stay attached to the right points
  kept_counties = c_short.loc[df3.index]
  kept_states = s_abrev.loc[df3.index]
  for x_val, y_val, c, s in zip(df3[drop1], df3[drop2], kept_counties, kept_states):
    label = f"{c}, {s}"
    ax.annotate(label, (x_val, y_val), textcoords = 'data', fontsize = 8)

  plt.grid(True)
  plt.yticks(fontsize=14)
  plt.xticks(fontsize=14)
  plt.show()
# -------


# =======
# PCA FUNCTION
# =======
# function for creating a plot of PCA results and Scree plot of variable weight in PCA
def plot_data_PCA(drop4):
  """Show the PCA scatter of all variables and a component plot for one PC.

  Two figures are produced: the rows of `df2` projected onto PC1/PC2 with
  county/state labels, and a bar chart of the absolute weight of each
  variable in the chosen principal component.

  Parameters:
    drop4: principal component to show in the bar chart; 1 selects the first
      component, any other value is titled as component 2 and indexes
      row `drop4 - 1` of the component matrix.
  """
  # Figures are created in this order: scatter first, component plot second
  scatter_fig, scatter_ax = plt.subplots(figsize = [12,10])
  bar_fig, bar_ax = plt.subplots(figsize = [12,10])

  scatter_ax.set_xlabel('PC1', fontsize = 20)
  scatter_ax.set_ylabel('PC2', fontsize = 20)
  scatter_ax.set_title('Data After PCA',fontsize = 20)

  bar_ax.set_xlabel('Variable', fontsize = 20)
  bar_ax.set_ylabel('Weight of Variable', fontsize = 20)
  # title based on selection
  bar_title = ('Component Plot of Principal Component 1' if drop4 == 1
               else 'Component Plot of Principal Component 2')
  bar_ax.set_title(bar_title, fontsize = 20)

  # standardize the data, then reduce it to two principal components
  standardized = StandardScaler().fit_transform(df2)
  pca = PCA(n_components = 2)
  scores = pca.fit_transform(standardized)
  pc1 = scores[:,0]
  pc2 = scores[:,1]

  # scatter of PCA result
  scatter_ax.plot(pc1, pc2, marker='o', linestyle='', markersize = 14)
  scatter_ax.grid(True)

  # point labels
  for px_val, py_val, c, s in zip(pc1, pc2, c_short, s_abrev):
    scatter_ax.annotate(f"{c}, {s}", (px_val, py_val), textcoords = 'data', fontsize = 8)

  # absolute weights of every variable in each component
  weights = abs( pca.components_ )
  positions = list(range(1, 20))  # bar positions 1..19, one per variable name below
  var_names = ['Open Water','Dev Open Space','Dev Low','Dev High','Dev Med','Barren Land',
               'Decidious Forest','Evergreen Forest','Mixed Forest','Shrub','Grassland','Pasture',
               'Crops','Woody Wetlands','Herbaceous Wetlands','Precipitation','Temperature',
               'Lyme Disease','Deer Population']
  chosen_weights = weights[drop4-1,:]
  bar_ax.bar(positions, chosen_weights, width = 0.8, bottom = None, tick_label = var_names) # barplot of weight
  bar_ax.tick_params(axis='x',labelrotation = 90)

  plt.show()
# -------


# =======
# CREATE TABS
# =======

#function for creating the different tabs and creating content in each tab
def create_tab(location):
  """Build the four-tab panel and wire each tab to its interactive function.

  Every tab prints a short description and then attaches the matching
  function to its widgets with widgets.interact:
  'Table View' -> show_df, 'Map Plots' -> draw_map,
  'PCA Results' -> plot_data_PCA, 'K-means Plots' -> plot_data_kmeans.

  Parameters:
    location: passed through as the `location` argument of the Colab TabBar.

  Returns:
    None.
  """
  tb = g_widgets.TabBar(['Table View','Map Plots','PCA Results',
                         'K-means Plots'], location=location)
  with tb.output_to('Table View'):
    printmd("**Description**")
    printmd("This tab displays a table of the data. The State and County are always displayed, "
    "and then you can select the additional columns you wish to see in the box below.")
    printmd("Select one variable just by clicking on it to highlight it. Select multiple variables by clicking while holding the 'shift' or 'command' key")
    printmd("Finally, a dropdown at the bottom of the selection box allows you to choose a variable to sort the data by (in ascending order). "
      "Note that if the variable you select to sort by is not in the columns you have selected, the data will not be sorted.")
    print("\n")
    widgets.interact(show_df, columns = select_columns, drop_s = widget_s)


  with tb.output_to('Map Plots'):
    printmd("**Description**")
    # Adjacent string literals are joined as-is, so every piece that is followed
    # by another sentence ends with a space.
    printmd("This tab displays a map of the data. "
      "The first dropdown selects the variable that you would like to display in the bubbles. "
      "The second dropdown selects the variable that you would like to shade the counties by. "
      "The third dropdown allows you to select a color scale to shade the states with. Try a few and see which you like best. "
      "The last dropdown selects the state that you want to view in the map. You can only view one state at a time.")
    printmd("Finally, the map is interactive and you can use the buttons in the upper right corner to zoom in, zoom out, and pan your view of the map. "
      "These buttons will appear when your mouse is over the map.")
    print("\n")
    # Note the mapping: the "Lyme/Deer" dropdown drives the bubbles (drop6) and
    # the "Variable" dropdown drives the county shading (drop7)
    widgets.interact(draw_map, drop6 = widget7, drop7 = widget6, drop8=widget5, drop9 = widget9)

  with tb.output_to('PCA Results'):
    printmd("**Description**")
    printmd("This tab displays the results of a PCA conducted on all variables.")
    printmd("It first shows a new scatter plot on the axes of PC1 and PC2 (the two most important principal components). "
    "All points are labeled with the county and state that they correspond to.")
    printmd("The dropdown allows you to select which principal component (PC) to display on the component plot below the PCA results. "
    "A component plot is a visualization of the contribution of each variable to the variation of the data, "
    "a higher value for a variable means that variable is more important in explaining variation.")
    print("\n")
    widgets.interact(plot_data_PCA, drop4=widget4)

  with tb.output_to('K-means Plots'):
    printmd("**Description**")
    printmd("In this tab, you can produce k-means plots. These plots are scatter plots with points colored according to their k-means cluster. "
    "The points are also labeled with the county and state that they correspond to.")
    printmd("The first 2 dropdowns allow you to select variables of interest as X and Y.")
    printmd("The final dropdown allows you to select the number of groups to create (the value of k).")
    print("\n")
    widgets.interact(plot_data_kmeans, drop1=widget1, drop2=widget2, drop3=widget3)
    

# Build and display the panel, passing 'top' as the tab bar location
create_tab('top')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Description**

This tab displays a table of the data. The State and County are always displayed, and then you can select the additional columns you wish to see in the box below.

Select one variable just by clicking on it to highlight it. Select multiple variables by clicking while holding the 'shift' or 'command' key

Finally, a dropdown at the bottom of the selection box allows you to choose a variable to sort the data by (in ascending order). Note that if the variable you select to sort by is not in the columns you have selected, the data will not be sorted.





interactive(children=(SelectMultiple(description='Columns:', index=(0,), layout=Layout(height='200px', width='…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Description**

This tab displays a map of the data.The first dropdown selects the variable that you would like to display in the bubbles. The second dropdown selects the variable that you would like to shade the counties by. The third dropdown allows you to select a color scale to shade the states with. Try a few and see which you like best.The last dropdown selects the state that you want to view in the map. You can only view one state at a time.

Finally, the map is interactive and you can use the buttons in the upper right corner to zoom in, zoom out, and pan your view of the map.These buttons will appear when your mouse is over the map.





interactive(children=(Dropdown(description='Lyme/Deer:', options=('Lyme Disease Cases per 100,000 Residents', …

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Description**

This tab displays the results of a PCA conducted on all variables.

It first shows a new scatter plot on the axes of PC1 and PC2 (the two most important principal components). All points are labeled with the county and state that they correspond to.

The dropdown allows you to select which principal component (PC) to display on the component plot below the PCA results. A component plot is a visualization of the contribution of each variable to the variation of the data, a higher value for a variable means that variable is more important in explaining variation.





interactive(children=(Dropdown(description='PC number:', options=(1, 2), value=1), Output()), _dom_classes=('w…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Description**

In this tab, you can produce k-means plots. These plots are scatter plots with points colored according to their k-means cluster. The points are also labeled with the county and state that they correspond to.

The first 2 dropdowns allow you to select variables of interest as X and Y.

The final dropdown allows you to select the number of groups to create (the value of k).





interactive(children=(Dropdown(description='X Variable:', options=('Percent Open Water', 'Percent Developed Op…

<IPython.core.display.Javascript object>

## <u>Credits</u>
**BioSat Team:** Andy Bean ('23), May Oo Khine ('23), Emma Nguyen ('25), David Guerra, Jay Garaycochea

**DIFUSE Project Manager:** Taylor Hickey ('23)

**DIFUSE PIs:** Prof. Petra Bonfert-Taylor (Thayer School), Prof. Laura Ray (Thayer School), Prof. Scott Pauls (Mathematics), Prof. Lori Loeb (Computer Science)

**Development Team:** Andy Bean compiled code used for the PCA and K-means components, and helped troubleshoot the map component. May Oo Khine compiled and authored code used for the Table view component and compiled code for the map component. Emma Nguyen compiled code used for the interactive map. Andy Bean and Emma Nguyen also created the associated assignment documents and Canvas quizzes for the module. All team members contributed to finding and cleaning the data set and shape files for use in this module.

*This Colab draws on code from the DIFUSE ENVS 3 module, credited below:*

> **ENVS 3 Team:** James Busch (Ph.D. Candidate), William Chen ('23), J.T. Erbaugh (NSF Postdoctoral Fellow), Richard Howarth (Professor of Environmental Studies)

> **ENVS 3 Development:** James Busch compiled and authored code for the web application, correlation matrix, and linear regression components. William Chen compiled and authored the code used for the interactive map.

---

*The DIFUSE project is supported by the National Science Foundation under grant no. IUSE - 1917002*

*Please acknowledge the ENVS 3 Team, the BioSat Team, and the Dartmouth DIFUSE program if you share or use this resource.*