<a href="https://colab.research.google.com/github/envirodatascience/final-project-insect-team/blob/main/temp_data_insect_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

FINAL PROJECT: Insect Group

In this Colab notebook, we summarize temperature data by state and year (2000–2024).

Data from NOAA: https://www.ncei.noaa.gov/cdo-web/datasets

Documentation: https://www.ncei.noaa.gov/data/global-summary-of-the-year/doc/GSOY_documentation.pdf

In [None]:
# import modules

import pandas as pd
import numpy as np

In [None]:
# Load the NOAA yearly summary export directly from the CDO order link.
# TODO: save to GitHub, read in from there
NOAA_ORDER_CSV_URL = 'https://www.ncei.noaa.gov/orders/cdo/3998324.csv'
df_temp = pd.read_csv(NOAA_ORDER_CSV_URL)

In [None]:
# preview the first rows of the raw download
df_temp.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DT00,DT32,DX32,DX70,DX90,EMNT,EMXT,HDSD,HTDD,TAVG,TMAX,TMIN
0,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2000,,,,,,,,,,,,
1,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2001,,,,,,,,,,,,
2,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2002,,,,,,,,,,,,
3,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2003,,,,,,,,,,,,
4,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2004,,,,,,,,,,,,


In [None]:
# list each column's dtype and non-null count to see where data is missing
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    504 non-null    object 
 1   NAME       504 non-null    object 
 2   LATITUDE   504 non-null    float64
 3   LONGITUDE  504 non-null    float64
 4   ELEVATION  504 non-null    float64
 5   DATE       504 non-null    int64  
 6   DT00       385 non-null    float64
 7   DT32       385 non-null    float64
 8   DX32       385 non-null    float64
 9   DX70       385 non-null    float64
 10  DX90       385 non-null    float64
 11  EMNT       385 non-null    float64
 12  EMXT       385 non-null    float64
 13  HDSD       368 non-null    float64
 14  HTDD       368 non-null    float64
 15  TAVG       382 non-null    float64
 16  TMAX       385 non-null    float64
 17  TMIN       385 non-null    float64
dtypes: float64(15), int64(1), object(2)
memory usage: 71.0+ KB


Data Documentation:


*   STATION: Weather station identifier (e.g., USC00069388)
*   DATE: Year
*   DX70: Number days with maximum temperature greater than 70F (21.1C)
*   DX90: Number days with maximum temperature greater than 90F (32.2C)
*   EMNT: Extreme minimum temperature
*   EMXT: Extreme maximum temperature
*   HTDD: Heating Degree Days
*   TAVG: Annual Average Temperature
*   TMAX: Average Maximum Temperature
*   TMIN: Average Minimum Temperature



In [None]:
# count rows that exactly repeat an earlier row across every column

is_repeat_row = df_temp.duplicated()
is_repeat_row.sum()

np.int64(0)

In [None]:
# list the distinct years in the DATE column (in order of first appearance)

years_present = df_temp['DATE'].unique()
years_present
# TODO: add a value counts line

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023,
       2024, 2009, 2010])

In [None]:
# summarize the selected temperature metrics for each station and year

df_temp.groupby(['STATION', 'DATE']).agg({
    # min_count=1 keeps DX90 as NaN when a station-year has no observations;
    # a plain 'sum' would report 0.0 days above 90F for those rows, which is
    # indistinguishable from a genuinely cool year
    'DX90': lambda days: days.sum(min_count=1),
    'EMNT': 'min',
    'EMXT': 'max',
    'TAVG': 'mean',
    'TMAX': 'mean',
    'TMIN': 'mean',
}).reset_index()

Unnamed: 0,STATION,DATE,DX90,EMNT,EMXT,TAVG,TMAX,TMIN
0,USC00060227,2000,4.0,-6.0,93.0,47.1,57.6,36.5
1,USC00060227,2001,9.0,-1.0,95.0,48.7,59.8,37.6
2,USC00060227,2002,22.0,,94.0,,59.8,
3,USC00060227,2003,7.0,-13.0,95.0,46.3,56.5,36.1
4,USC00060227,2004,0.0,-10.0,88.0,46.3,56.5,36.2
...,...,...,...,...,...,...,...,...
499,USW00094702,2020,11.0,11.0,95.0,55.3,62.9,47.7
500,USW00094702,2021,13.0,12.0,96.0,54.8,62.4,47.1
501,USW00094702,2022,11.0,6.0,94.0,53.7,61.7,45.7
502,USW00094702,2023,4.0,-4.0,93.0,54.8,62.5,47.1


In [None]:
# look for rows in which every column is missing

row_is_all_na = df_temp.isna().all(axis='columns')
df_temp[row_is_all_na]

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DT00,DT32,DX32,DX70,DX90,EMNT,EMXT,HDSD,HTDD,TAVG,TMAX,TMIN


In [None]:
# display rows where TAVG, TMIN, and TMAX are all missing for the year

yearly_temp_cols = ['TAVG', 'TMIN', 'TMAX']
df_temp[df_temp[yearly_temp_cols].isna().all(axis=1)]

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DT00,DT32,DX32,DX70,DX90,EMNT,EMXT,HDSD,HTDD,TAVG,TMAX,TMIN
0,USC00069388,"WEST THOMPSON LAKE, CT US",41.94420,-71.90310,109.7,2000,,,,,,,,,,,,
1,USC00069388,"WEST THOMPSON LAKE, CT US",41.94420,-71.90310,109.7,2001,,,,,,,,,,,,
2,USC00069388,"WEST THOMPSON LAKE, CT US",41.94420,-71.90310,109.7,2002,,,,,,,,,,,,
3,USC00069388,"WEST THOMPSON LAKE, CT US",41.94420,-71.90310,109.7,2003,,,,,,,,,,,,
4,USC00069388,"WEST THOMPSON LAKE, CT US",41.94420,-71.90310,109.7,2004,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,USC00064488,"MANSFIELD HOLLOW LAK, CT US",41.75694,-72.18556,76.2,2007,,,,,,,,,,,,
448,USW00054734,"DANBURY MUNICIPAL AIRPORT, CT US",41.37215,-73.48337,138.0,2001,,,,,,,,,,,,
472,USC00067958,"STAFFORDVILLE, CT US",41.99840,-72.26060,224.3,2002,,,,,,,,,,,,
497,USC00060808,"TRUMBULL, CT US",41.22764,-73.17445,30.5,2009,,,,,,,,,,,,


In [None]:
# drop the rows where TAVG, TMIN, and TMAX are all missing
# (a row with at least one of the three values present is kept)
# TODO: add justification for dropping NAs, other decisions
df_temp_clean = (
    df_temp
    .dropna(subset=['TAVG', 'TMIN', 'TMAX'], how='all')
    .reset_index(drop=True)
)

In [None]:
# check the cleaned dataframe: row count and per-column non-null counts

df_temp_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    388 non-null    object 
 1   NAME       388 non-null    object 
 2   LATITUDE   388 non-null    float64
 3   LONGITUDE  388 non-null    float64
 4   ELEVATION  388 non-null    float64
 5   DATE       388 non-null    int64  
 6   DT00       385 non-null    float64
 7   DT32       385 non-null    float64
 8   DX32       385 non-null    float64
 9   DX70       385 non-null    float64
 10  DX90       385 non-null    float64
 11  EMNT       385 non-null    float64
 12  EMXT       385 non-null    float64
 13  HDSD       349 non-null    float64
 14  HTDD       349 non-null    float64
 15  TAVG       382 non-null    float64
 16  TMAX       385 non-null    float64
 17  TMIN       385 non-null    float64
dtypes: float64(15), int64(1), object(2)
memory usage: 54.7+ KB


In [None]:
# preview the first rows of the cleaned data
df_temp_clean.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DT00,DT32,DX32,DX70,DX90,EMNT,EMXT,HDSD,HTDD,TAVG,TMAX,TMIN
0,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2005,15.0,145.0,32.0,134.0,11.0,-11.0,95.0,,,48.4,59.4,37.3
1,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2007,4.0,155.0,39.0,148.0,11.0,-1.0,92.0,,,48.0,59.1,37.0
2,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2008,0.0,154.0,27.0,124.0,7.0,2.0,95.0,6333.0,6333.0,48.6,59.4,37.8
3,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2011,12.0,144.0,33.0,127.0,3.0,-12.0,98.0,,,49.4,60.0,38.8
4,USC00069388,"WEST THOMPSON LAKE, CT US",41.9442,-71.9031,109.7,2012,0.0,143.0,10.0,138.0,6.0,3.0,95.0,5493.0,5493.0,50.7,61.1,40.2


In [None]:
# yearly means of the minimum, average, and maximum temperature columns,
# averaged across all stations (one row per year)

df_temp_clean.groupby('DATE', as_index=False)[['TMIN', 'TAVG', 'TMAX']].mean()

Unnamed: 0,DATE,TMIN,TAVG,TMAX
0,2000,39.730769,49.566667,59.425
1,2001,40.709091,51.254545,61.8
2,2002,41.791667,51.808333,61.646154
3,2003,39.715385,48.976923,58.230769
4,2004,39.923077,49.7,59.184615
5,2005,39.745455,49.572727,59.372727
6,2006,41.725,51.127273,60.409091
7,2007,39.841176,49.741176,59.664706
8,2008,40.4,50.03125,59.64375
9,2009,39.138462,48.438462,57.753846
