<a href="https://colab.research.google.com/github/envirodatascience/final-project-insect-team/blob/main/01_Precipitation_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1: Import modules, read in data

In [None]:
# Bring in pandas, then read the NOAA precipitation CSV from its URL into a data frame
import pandas as pd

url = 'https://www.ncei.noaa.gov/orders/cdo/3997103.csv'
df = pd.read_csv(filepath_or_buffer=url)

## 1.1 Look into the data

In [None]:
# Preview the first five rows to see what each record looks like
df.head(n=5)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP
0,USC00067002,"ROUND POND, CT US",41.3008,-73.5369,243.8,1949,38.41
1,USC00067002,"ROUND POND, CT US",41.3008,-73.5369,243.8,1950,47.65
2,USC00067002,"ROUND POND, CT US",41.3008,-73.5369,243.8,1951,54.05
3,USC00067002,"ROUND POND, CT US",41.3008,-73.5369,243.8,1952,60.71
4,USC00067002,"ROUND POND, CT US",41.3008,-73.5369,243.8,1953,49.63


In [None]:
# Summarise the frame: dtype and non-null count of every column, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3910 entries, 0 to 3909
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    3910 non-null   object 
 1   NAME       3910 non-null   object 
 2   LATITUDE   3910 non-null   float64
 3   LONGITUDE  3910 non-null   float64
 4   ELEVATION  3910 non-null   float64
 5   DATE       3910 non-null   int64  
 6   PRCP       3364 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 214.0+ KB


In [None]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(3910, 7)

In [None]:
# Labels of every column in the frame
df.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'PRCP',
       'PRCP_ATTRIBUTES'],
      dtype='object')

# Describing the Data


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP
count,3910.0,3910.0,3910.0,3910.0,3364.0
mean,41.586961,-72.837674,116.722532,1972.158568,48.063199
std,0.264447,0.494189,96.809515,33.756201,8.827999
min,41.062171,-73.7,0.9,1858.0,18.35
25%,41.35024,-73.2296,46.0,1952.0,42.01
50%,41.5833,-72.93333,91.4,1969.0,47.275
75%,41.7951,-72.39281,167.6,1999.0,53.7125
max,42.027433,-71.830421,438.9,2025.0,88.52


# Handle missing data


In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
STATION,0
NAME,0
LATITUDE,0
LONGITUDE,0
ELEVATION,0
DATE,0
PRCP,546


In [None]:
# Show every row that has a missing value in at least one column
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP
41,USC00067002,"ROUND POND, CT US",41.30080,-73.53690,243.8,1999,
74,USC00069783,"WOODBURY 3 W, CT US",41.56667,-73.26667,285.0,1965,
76,USC00067361,"DERBY SHELTON, CT US",41.31667,-73.08333,18.0,1887,
77,USC00067361,"DERBY SHELTON, CT US",41.31667,-73.08333,18.0,1888,
78,USC00067361,"DERBY SHELTON, CT US",41.31667,-73.08333,18.0,1889,
...,...,...,...,...,...,...,...
3841,USC00064767,"MIDDLETOWN 4 W, CT US",41.55000,-72.71667,112.5,1992,
3846,USC00064767,"MIDDLETOWN 4 W, CT US",41.55000,-72.71667,112.5,1997,
3848,USC00060961,"BULLS BRIDGE DAM, CT US",41.67630,-73.50860,79.2,1952,
3867,USC00060961,"BULLS BRIDGE DAM, CT US",41.67630,-73.50860,79.2,1971,


In [None]:
# Keep only the rows with a missing value, then report per year (DATE)
# how many non-null entries each remaining column has
df.loc[df.isna().any(axis='columns')].groupby(by='DATE').count()


Unnamed: 0_level_0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,PRCP
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1884,1,1,1,1,1,0
1885,2,2,2,2,2,0
1886,2,2,2,2,2,0
1887,2,2,2,2,2,0
1888,2,2,2,2,2,0
...,...,...,...,...,...,...
2021,3,3,3,3,3,0
2022,3,3,3,3,3,0
2023,2,2,2,2,2,0
2024,4,4,4,4,4,0


In [None]:
# Drop the rows that have no precipitation reading: the goal is a yearly average
# for CT, so a row without PRCP adds nothing.
# Only PRCP is checked (subset=) so a gap in any other column cannot discard a
# valid reading. The result is re-bound to df rather than mutated with
# inplace=True, which pandas discourages.
df = df.dropna(subset=['PRCP'])

In [None]:
# Give the station columns self-describing names:
#   STATION -> STATION_ID, NAME -> STATION_NAME
# rename() returns a new frame, which is re-bound to df instead of using inplace=True.
df = df.rename(columns={'STATION': 'STATION_ID', 'NAME': 'STATION_NAME'})

# Calculating average, minimum, and maximum precipitation per year


In [None]:
# Per-year precipitation summary: mean (PAVG), lowest (PMIN) and highest (PMAX)
# PRCP value among the rows that share each DATE; DATE comes back as a regular column
df_avg = (
    df.groupby('DATE')['PRCP']
      .agg(PAVG='mean', PMIN='min', PMAX='max')
      .reset_index()
)
df_avg.head(n=5)

Unnamed: 0,DATE,PAVG,PMIN,PMAX
0,1858,41.89,41.89,41.89
1,1859,53.31,53.31,53.31
2,1860,37.38,37.38,37.38
3,1861,47.03,47.03,47.03
4,1862,48.02,48.02,48.02


In [None]:
# Write the yearly summary to a CSV file in the runtime's working directory;
# index=False leaves the row index out of the file
df_avg.to_csv(path_or_buf='noaa_precip_yearly_ct.csv', index=False)