In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect

# Load the variables from the .env file; the database username and password
# are looked up from the environment further down in this notebook.
load_dotenv()

True

### States Abbreviation Table

In [2]:
# State name / abbreviation lookup CSV (adjust the path for your checkout)
file_to_load = "Resources/StatetoAbbrev.csv"

# Read the lookup file into a DataFrame and preview the first rows
states_data = pd.read_csv(filepath_or_buffer=file_to_load)
states_data.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [3]:
# Surrogate key for each state: the row index shifted by one
one_based_ids = states_data.index + 1
states_data['State_id'] = one_based_ids
states_data.head()

Unnamed: 0,State,Abbrev,Code,State_id
0,Alabama,Ala.,AL,1
1,Alaska,Alaska,AK,2
2,Arizona,Ariz.,AZ,3
3,Arkansas,Ark.,AR,4
4,California,Calif.,CA,5


### Renewable Energy Potential

In [28]:
# Renewable energy technical potential CSV (adjust the path for your checkout)
file_to_load = "Resources/usretechnicalpotential.csv"

# Read the file into a DataFrame, then list its columns one per line
renewable_energy_data = pd.read_csv(filepath_or_buffer=file_to_load)
for column_name in renewable_energy_data.columns:
    print(column_name)

Unnamed: 0
urbanUtilityScalePV_GWh
urbanUtilityScalePV_GW
urbanUtilityScalePV_km2
ruralUtilityScalePV_GWh
ruralUtilityScalePV_GW
ruralUtilityScalePV_km2
rooftopPV_GWh
rooftopPV_GW
CSP_GWh
CSP_GW
CSP_km2
onshoreWind_GWh
onshoreWind_GW
onshoreWind_km2
offshoreWind_GWh
offshoreWind_GW
offshoreWind_km2
biopowerSolid_GWh
biopowerSolid_GW
biopowerSolid_BDT
biopowerGaseous_GWh
biopowerGaseous_GW
biopowerGaseous_Tonnes-CH4
geothermalHydrothermal_GWh
geothermalHydrothermal_GW
EGSGeothermal_GWh
EGSGeothermal_GW
hydropower_GWh
hydropower_GW
hydropower_countOfSites


In [29]:
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; label it 'State'.
renewable_energy_data.rename(columns={'Unnamed: 0': 'State'}, inplace=True)

# Two-column frame: the state name plus onshore wind potential under the name 'Wind_Gwh'.
renewable_energy_df = (
    renewable_energy_data[['State', 'onshoreWind_GWh']]
    .rename(columns={'onshoreWind_GWh': 'Wind_Gwh'})
)

In [30]:
# Label the unnamed first column as 'State' (a no-op if it was already renamed).
renewable_energy_data.rename(columns={'Unnamed: 0': 'State'}, inplace=True)

# Keep only the GWh figure for each technology, alongside the state name.
gwh_columns = [
    'urbanUtilityScalePV_GWh',
    'ruralUtilityScalePV_GWh',
    'rooftopPV_GWh',
    'CSP_GWh',
    'onshoreWind_GWh',
    'offshoreWind_GWh',
    'biopowerSolid_GWh',
    'biopowerGaseous_GWh',
    'geothermalHydrothermal_GWh',
    'EGSGeothermal_GWh',
    'hydropower_GWh',
]
renewable_potential = renewable_energy_data[['State'] + gwh_columns]
renewable_potential.head()

Unnamed: 0,State,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,Alabama,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,Alaska,166,8282976,,0,1373433,,513,61,15437,,23675
2,Arizona,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303
3,Arkansas,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093
4,California,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023


In [31]:
# Attach State_id (and the other lookup columns) to each row of renewable
# potential by joining on the full state name. An outer join keeps rows that
# appear on only one side.
combined_df = states_data.merge(renewable_potential, on='State', how='outer')
combined_df.head()

Unnamed: 0,State,Abbrev,Code,State_id,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,Alabama,Ala.,AL,1,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,Alaska,Alaska,AK,2,166,8282976,,0,1373433,,513,61,15437,,23675
2,Arizona,Ariz.,AZ,3,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303
3,Arkansas,Ark.,AR,4,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093
4,California,Calif.,CA,5,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023


In [32]:
# Row count of the merged frame
m = combined_df.shape[0]
print(m)

51


In [33]:
# Drop the descriptive state columns. State_id stays as an ordinary column
# (not the index) because the load step writes with index=False.
renewable_df = combined_df.drop(['State', 'Abbrev', 'Code'], axis=1)
renewable_df.head()

Unnamed: 0,State_id,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,1,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,2,166,8282976,,0,1373433,,513,61,15437,,23675
2,3,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303
3,4,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093
4,5,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023


In [34]:
# List the columns that will be written to the database, one per line
for column_name in renewable_df.columns:
    print(column_name)

State_id
urbanUtilityScalePV_GWh
ruralUtilityScalePV_GWh
rooftopPV_GWh
CSP_GWh
onshoreWind_GWh
offshoreWind_GWh
biopowerSolid_GWh
biopowerGaseous_GWh
geothermalHydrothermal_GWh
EGSGeothermal_GWh
hydropower_GWh


### Total Net Electricity Generation
The cleaned data will be loaded into the 'Electricity_Production' table.

In [10]:
# Total net electricity generation state rankings CSV (adjust the path for your checkout)
file_to_load = "Resources/Total Net Electricity Generation-StateRankings.csv"

# Read the state rankings file into a DataFrame and preview the first rows
total_electricity_data = pd.read_csv(filepath_or_buffer=file_to_load)
total_electricity_data.head()

Unnamed: 0,Rank,State,"Total Net Electricity Generation, thousand MWh",Note: Rankings are based on the full source data values.
0,1,TX,38524,
1,2,FL,20787,
2,3,PA,16521,
3,4,CA,15465,
4,5,IL,12997,


In [21]:
# Remove the footnote column and give the generation column a short name
# (the source header states the unit as thousand MWh).
total_electricity = (
    total_electricity_data
    .drop(columns='Note: Rankings are based on the full source data values.')
    .rename(columns={'Total Net Electricity Generation, thousand MWh': 'Total_Net_GWh'})
)
total_electricity.head()

Unnamed: 0,Rank,State,Total_Net_GWh
0,1,TX,38524
1,2,FL,20787
2,3,PA,16521
3,4,CA,15465
4,5,IL,12997


In [22]:
# Join on the two-letter code: 'Code' in the lookup table matches 'State' in
# the electricity file. Both frames have a 'State' column, so pandas suffixes
# them as State_x / State_y.
combined_df = states_data.merge(
    total_electricity,
    left_on='Code',
    right_on='State',
    how='outer',
)
combined_df.head()

Unnamed: 0,State_x,Abbrev,Code,State_id,Rank,State_y,Total_Net_GWh
0,Alabama,Ala.,AL,1,7,AL,10575
1,Alaska,Alaska,AK,2,48,AK,476
2,Arizona,Ariz.,AZ,3,10,AZ,9350
3,Arkansas,Ark.,AR,4,31,AR,3532
4,California,Calif.,CA,5,4,CA,15465


In [23]:
# Sanity check on the merge: print every row whose lookup code differs from
# the code that came from the electricity file. Nothing is printed when all
# rows agree.
n = len(combined_df.index)
code_differs = combined_df['Code'] != combined_df['State_y']
for i, lookup_code, source_code in combined_df.loc[code_differs, ['Code', 'State_y']].itertuples(name=None):
    print(i, lookup_code, source_code)

In [25]:
# Keep only the columns that are loaded into the database. State_id stays as
# an ordinary column because the load step writes with index=False.
electricity_columns = ['State_id', 'Rank', 'Total_Net_GWh']
total_electricity_df = combined_df.loc[:, electricity_columns]
total_electricity_df.head()

Unnamed: 0,State_id,Rank,Total_Net_GWh
0,1,7,10575
1,2,48,476
2,3,10,9350
3,4,31,3532
4,5,4,15465


### Load Tables

In [15]:
# Connection settings for the PostgreSQL database. The username and password
# are read from the environment (populated from the .env file), so that file
# must exist on your machine for this notebook to run.
host, port = 'localhost', 5432
dbname = 'ETL_Project'

username = os.getenv('USER_NAME')
pwd = os.getenv('PASSWORD')


In [16]:
# Build the SQLAlchemy engine for the "ETL_Project" PostgreSQL database.
# Fail fast with a clear message when the credentials are missing; otherwise
# the literal text "None" would be embedded in the connection URL.
if username is None or pwd is None:
    raise RuntimeError("USER_NAME and PASSWORD must be set in the environment (.env file)")

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# password cannot be mistaken for URL delimiters. The values in .env must be
# the raw, un-encoded strings.
connection_string = f"{quote(username, safe='')}:{quote(pwd, safe='')}@{host}:{port}/{dbname}"
engine = create_engine(f'postgresql://{connection_string}')

In [17]:
# List the tables that already exist in the database. Engine.table_names() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the
# supported way to get the same list.
inspect(engine).get_table_names()

['Total_Energy_Consumed',
 'Total_Energy_Production',
 'Coal_Production',
 'Crudeoil_Production',
 'NaturalGas_Production',
 'Fossil_Fuel_Production',
 'Fossil_Fuel_Estimates',
 'Renewal_Energy_Estimates',
 'Renewal_Energy_Potential',
 'Electricity_Production',
 'State_Abb']

In [18]:
# Disabled alternative: a direct psycopg2 connection. The snippet is wrapped in
# a string literal, so none of it executes; the cell only echoes the string.
# NOTE(review): `sqlio` is not imported anywhere in the visible notebook, so
# this snippet would need an extra import before it could be re-enabled.
"""
conn = psycopg2.connect("host='{}' port={} dbname='{}' user={} password={}".format(host, port, dbname, username, pwd))
sql = "SQL statement"
dat = sqlio.read_sql_query(sql, conn)"""


'\nconn = psycopg2.connect("host=\'{}\' port={} dbname=\'{}\' user={} password={}".format(host, port, dbname, username, pwd))\nsql = "SQL statement"\ndat = sqlio.read_sql_query(sql, conn)'

In [38]:
# Append the state lookup rows to the 'State_Abb' table.
# index=False keeps the DataFrame index out of the table.
states_data.to_sql('State_Abb', engine, if_exists='append', index=False)

In [None]:
# Append the renewable potential rows to the 'Renewable_Energy_Potential' table.
# NOTE(review): the table listing shown earlier in this notebook contains
# 'Renewal_Energy_Potential', not 'Renewable_Energy_Potential' - confirm which
# name the schema is meant to use.
renewable_df.to_sql('Renewable_Energy_Potential', engine, if_exists='append', index=False)

In [None]:
# Append the total net electricity rows to the 'Electricity_Production' table.
# index=False keeps the DataFrame index out of the table.
total_electricity_df.to_sql('Electricity_Production', engine, if_exists='append', index=False)

### Confirm data has been added by querying the State_Abb table

In [40]:
# Read the table back and preview the first rows to confirm the load
pd.read_sql_query(
    sql='select * from "State_Abb"',
    con=engine,
).head()

Unnamed: 0,State_id,State,Abbrev,Code
0,1,Alabama,Ala.,AL
1,2,Alaska,Alaska,AK
2,3,Arizona,Ariz.,AZ
3,4,Arkansas,Ark.,AR
4,5,California,Calif.,CA


### Confirm data has been added by querying the Renewable_Energy_Potential table

In [41]:
# Read the table back and preview the first rows to confirm the load
pd.read_sql_query(
    sql='select * from "Renewable_Energy_Potential"',
    con=engine,
).head()

Unnamed: 0,State_id,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,1,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,2,166,8282976,,0,1373433,,513,61,15437,,23675
2,3,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303
3,4,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093
4,5,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023
