In [132]:
import json
from pathlib import Path

import pandas as pd

In [133]:
# Load the shared registry that maps logical file names to their locations on disk.
# The registry holds both input and output paths, hence the neutral handle name.
with open('ei_intermediate_file_paths.json', encoding='utf-8') as path_config_file:
    file_paths = json.load(path_config_file)

# Index directly instead of using .get(): a missing key then fails right here with a
# KeyError naming the key, rather than producing None that only surfaces later
# (read_csv(None) errors obscurely; to_csv(None) returns text and writes no file).
NOAA_INPUT_PATH = file_paths["raw_noaa.csv"]

NOAA_STAGE1_OUTPUT_PATH = file_paths["stage1_noaa_output.csv"]

In [134]:
# Read the raw NOAA export into a DataFrame; later cells inspect and filter it
noaa_df = pd.read_csv(filepath_or_buffer=NOAA_INPUT_PATH)

In [135]:
# Preview the first five rows of the raw frame as plain text
preview_rows = noaa_df.head(n=5)
print(preview_rows)

       STATION                                  NAME  LATITUDE  LONGITUDE  \
0  USW00013893  MEMPHIS INTERNATIONAL AIRPORT, TN US  35.05639   -89.9864   
1  USW00013893  MEMPHIS INTERNATIONAL AIRPORT, TN US  35.05639   -89.9864   
2  USW00013893  MEMPHIS INTERNATIONAL AIRPORT, TN US  35.05639   -89.9864   
3  USW00013893  MEMPHIS INTERNATIONAL AIRPORT, TN US  35.05639   -89.9864   
4  USW00013893  MEMPHIS INTERNATIONAL AIRPORT, TN US  35.05639   -89.9864   

   ELEVATION     DATE  AWND  CDSD  DP01  DP10  ...  HDSD   PRCP  SNOW  TAVG  \
0       76.6  2009-09   6.3  2023    16    11  ...     1   8.60   0.0  75.7   
1       76.6  2009-10   7.6  2044    14     8  ...   155  10.55   0.0  60.7   
2       76.6  2009-11   6.0  2047     6     2  ...   454   1.37   0.0  55.1   
3       76.6  2009-12   7.8  2047     9     6  ...  1200   5.13   0.0  40.9   
4       76.6  2010-01   8.1     0     9     7  ...  2065   3.93   1.4  37.1   

   TMAX  TMIN   WDF2   WDF5  WSF2  WSF5  
0  83.4  68.0   80.0

In [136]:
# Collect the structural facts about the raw frame up front:
# column labels, per-column dtypes, and the row count
columns = noaa_df.columns
data_types = noaa_df.dtypes
num_rows = noaa_df.shape[0]

# Report them in a fixed order: names, then dtypes, then size
print("Column names:\n", columns)
print("Data types of columns:\n", data_types)
print("Number of rows:", num_rows)

Column names:
 Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'CDSD', 'DP01', 'DP10', 'DX32', 'DX90', 'EMNT', 'EMXP', 'EMXT', 'HDSD',
       'PRCP', 'SNOW', 'TAVG', 'TMAX', 'TMIN', 'WDF2', 'WDF5', 'WSF2', 'WSF5'],
      dtype='object')
Data types of columns:
 STATION       object
NAME          object
LATITUDE     float64
LONGITUDE    float64
ELEVATION    float64
DATE          object
AWND         float64
CDSD           int64
DP01           int64
DP10           int64
DX32           int64
DX90           int64
EMNT           int64
EMXP         float64
EMXT           int64
HDSD           int64
PRCP         float64
SNOW         float64
TAVG         float64
TMAX         float64
TMIN         float64
WDF2         float64
WDF5         float64
WSF2         float64
WSF5         float64
dtype: object
Number of rows: 181


In [137]:
# Count the missing entries in every column (isna is the same check as isnull)
null_values = noaa_df.isna().sum()
print("Null values per column:\n", null_values)

Null values per column:
 STATION      0
NAME         0
LATITUDE     0
LONGITUDE    0
ELEVATION    0
DATE         0
AWND         1
CDSD         0
DP01         0
DP10         0
DX32         0
DX90         0
EMNT         0
EMXP         0
EMXT         0
HDSD         0
PRCP         0
SNOW         0
TAVG         0
TMAX         0
TMIN         0
WDF2         1
WDF5         3
WSF2         1
WSF5         3
dtype: int64


In [138]:
# Count rows that are exact copies of an earlier row
duplicates = noaa_df.duplicated().sum()
print("Number of duplicate rows:", duplicates)

# Convert DATE to datetimes in place (later cells use the converted column);
# values that cannot be parsed are coerced to NaT and counted here
noaa_df['DATE'] = pd.to_datetime(noaa_df['DATE'], errors='coerce')
num_nat_values = noaa_df['DATE'].isna().sum()
print("Number of rows labeled as NaT:", num_nat_values)

# Month coverage check: every month start from Sep 2009 through Sep 2024 ...
expected_months = pd.date_range(start='2009-09-01', end='2024-09-30', freq='MS')
# ... versus the distinct months present in the data, normalised to month start
observed_months = (
    noaa_df['DATE']
    .dropna()
    .dt.to_period('M')
    .drop_duplicates()
    .dt.to_timestamp()
)
missing_months = expected_months.difference(observed_months)
print("Missing months:\n", missing_months)

Number of duplicate rows: 0
Number of rows labeled as NaT: 0
Missing months:
 DatetimeIndex([], dtype='datetime64[ns]', freq='MS')


## Filter for only required columns

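No duplicate rows, unparseable dates, or missing months were found. The only null values are in AWND, WDF2, WDF5, WSF2 and WSF5, none of which are kept below, so no data cleaning is required. We just need to filter down to the columns needed for follow-on analysis; every month in the file (Sep 2009 through Sep 2024) is retained.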

In [139]:
# Keep only the columns that the follow-on analysis needs
filtered_columns = ['NAME', 'DATE', 'TAVG', 'TMAX', 'TMIN', 'PRCP']

# Label-based selection: all rows, just the columns listed above
filtered_noaa_df = noaa_df.loc[:, filtered_columns]
print("Filtered DataFrame:\n", filtered_noaa_df.head(n=12))

Filtered DataFrame:
                                     NAME       DATE  TAVG  TMAX  TMIN   PRCP
0   MEMPHIS INTERNATIONAL AIRPORT, TN US 2009-09-01  75.7  83.4  68.0   8.60
1   MEMPHIS INTERNATIONAL AIRPORT, TN US 2009-10-01  60.7  69.0  52.5  10.55
2   MEMPHIS INTERNATIONAL AIRPORT, TN US 2009-11-01  55.1  65.1  45.1   1.37
3   MEMPHIS INTERNATIONAL AIRPORT, TN US 2009-12-01  40.9  48.3  33.5   5.13
4   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-01-01  37.1  45.2  29.0   3.93
5   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-02-01  37.4  45.2  29.6   3.87
6   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-03-01  52.6  62.5  42.8   3.62
7   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-04-01  66.0  76.4  55.5   7.02
8   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-05-01  74.2  83.2  65.1   9.26
9   MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-06-01  84.6  93.6  75.6   0.31
10  MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-07-01  84.9  93.8  75.9   6.30
11  MEMPHIS INTERNATIONAL AIRPORT, TN US 2010-08-01  86

In [140]:
# Descriptive statistics for the filtered data. DATE is dropped explicitly first;
# describe() is called with its defaults on what remains.
without_date = filtered_noaa_df.drop('DATE', axis=1)
filtered_summary_stats = without_date.describe()
print("Summary statistics:\n", filtered_summary_stats)

Summary statistics:
              TAVG        TMAX        TMIN        PRCP
count  181.000000  181.000000  181.000000  181.000000
mean    63.958564   73.319890   54.594475    4.725967
std     14.983628   15.046622   14.982793    2.812537
min     36.100000   44.700000   24.700000    0.120000
25%     49.800000   59.900000   40.600000    2.600000
50%     64.800000   74.600000   54.800000    4.220000
75%     78.400000   87.500000   69.500000    6.300000
max     87.200000   97.100000   77.300000   16.200000


In [141]:
# Make sure the destination folder exists before writing: to_csv does not create
# missing parent directories. Path() also raises a TypeError for a None path, where
# to_csv(None) would instead return the CSV text and write nothing.
output_dir = Path(NOAA_STAGE1_OUTPUT_PATH).parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the filtered frame without the positional index column
filtered_noaa_df.to_csv(NOAA_STAGE1_OUTPUT_PATH, index=False)

print("Filtered NOAA data saved to:", NOAA_STAGE1_OUTPUT_PATH)

Filtered NOAA data saved to: intermediate/stage1-output/stage1_noaa_output.csv
