In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import hvplot.pandas

### Data Preprocessing

In [2]:
# Load the merged weather/energy CSV into a DataFrame and preview the first rows
model_data = Path("Resources/weather_energy_merged.csv")
model_df = pd.read_csv(filepath_or_buffer=model_data)
model_df.head(n=5)

Unnamed: 0,state,st_abb,date,tmin (f),tmax (f),tavg (f),ppt,coal (kt),natural gas (kMcf),petroleum coke (kt),petroleum liquids (kb)
0,Alabama,AL,2001-01-01,29.1524,51.8234,40.487,119.501,3076,7802,0,340
1,Arkansas,AR,2001-01-01,26.6612,45.7412,36.2012,88.598,1320,2885,0,222
2,Arizona,AZ,2001-01-01,28.2722,51.9206,40.0964,41.955,1773,7222,0,268
3,California,CA,2001-01-01,31.7372,52.7198,42.2276,87.828,154,106635,94,625
4,Colorado,CO,2001-01-01,11.2748,37.4252,24.35,25.485,1777,6538,0,52


In [3]:
# Review the info: column dtypes and non-null counts.
# In the output below, the four fuel columns show up with object dtype
# rather than a numeric dtype.
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13248 entries, 0 to 13247
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   13248 non-null  object 
 1   st_abb                  13248 non-null  object 
 2   date                    13248 non-null  object 
 3   tmin (f)                13248 non-null  float64
 4   tmax (f)                13248 non-null  float64
 5   tavg (f)                13248 non-null  float64
 6   ppt                     13248 non-null  float64
 7   coal (kt)               13248 non-null  object 
 8   natural gas (kMcf)      13248 non-null  object 
 9   petroleum coke (kt)     13248 non-null  object 
 10  petroleum liquids (kb)  13248 non-null  object 
dtypes: float64(4), object(7)
memory usage: 1.1+ MB


In [4]:
# Build the model-ready feature table from the raw merged data.
# NOTE: this cell consumes the 'date', 'st_abb' and 'state' columns of
# model_df, so the read-csv cell must be re-run before this cell can be
# executed a second time.

# Fuel columns that need to be converted from object dtype to numeric
fuel_columns = [
    'coal (kt)',
    'natural gas (kMcf)',
    'petroleum coke (kt)',
    'petroleum liquids (kb)',
]

# Parse the date once, keep its year and month as numeric features, then
# drop the raw date and the state abbreviation
parsed_dates = pd.to_datetime(model_df['date'])
model_df = (
    model_df
    .assign(year=parsed_dates.dt.year, month=parsed_dates.dt.month)
    .drop(columns=['date', 'st_abb'])
)

# Convert the fuel columns to numeric; entries that cannot be parsed become NaN
model_df[fuel_columns] = model_df[fuel_columns].apply(pd.to_numeric, errors='coerce')

# One-hot encode the state. An empty prefix/separator names each dummy column
# after the state itself, so no prefix has to be stripped from the column
# names afterwards. drop_first=True omits the first state category.
model_df = pd.get_dummies(
    model_df,
    columns=['state'],
    prefix='',
    prefix_sep='',
    drop_first=True,
)

# Fill in null values with 0.
# NOTE(review): this also turns every fuel value coerced to NaN above into 0;
# confirm that 0 is the intended stand-in for unparseable entries.
model_df = model_df.fillna(0)

# Display the first few rows of the DataFrame
model_df.head()

Unnamed: 0,tmin (f),tmax (f),tavg (f),ppt,coal (kt),natural gas (kMcf),petroleum coke (kt),petroleum liquids (kb),year,month,...,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,29.1524,51.8234,40.487,119.501,3076.0,7802.0,0.0,340.0,2001,1,...,False,False,False,False,False,False,False,False,False,False
1,26.6612,45.7412,36.2012,88.598,1320.0,2885.0,0.0,222.0,2001,1,...,False,False,False,False,False,False,False,False,False,False
2,28.2722,51.9206,40.0964,41.955,1773.0,7222.0,0.0,268.0,2001,1,...,False,False,False,False,False,False,False,False,False,False
3,31.7372,52.7198,42.2276,87.828,154.0,106635.0,94.0,625.0,2001,1,...,False,False,False,False,False,False,False,False,False,False
4,11.2748,37.4252,24.35,25.485,1777.0,6538.0,0.0,52.0,2001,1,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Write the preprocessed table to disk, leaving the row index out of the file
output_path = Path('Resources/weather_energy_preprocessed.csv')
model_df.to_csv(output_path, index=False)