In [1]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect

# Load the variables defined in the local .env file into the process
# environment; the database user name and password are read from there
# (via os.environ) when the connection settings are built further down.
load_dotenv()

True

### State Abbreviation Table

In [2]:
# Path of the state name / abbreviation lookup CSV (relative to the notebook).
file_to_load = "Resources/StatetoAbbrev.csv"

# Read the lookup table into a DataFrame and preview its first rows.
states_data = pd.read_csv(filepath_or_buffer=file_to_load)
states_data.head(5)

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [3]:
# Add a 1-based numeric identifier per state, derived from the row index;
# later cells use State_id as the key of the per-state tables.
states_data = states_data.assign(State_id=states_data.index + 1)
states_data.head()

Unnamed: 0,State,Abbrev,Code,State_id
0,Alabama,Ala.,AL,1
1,Alaska,Alaska,AK,2
2,Arizona,Ariz.,AZ,3
3,Arkansas,Ark.,AR,4
4,California,Calif.,CA,5


### Renewable Energy Potential

In [4]:
# Path of the renewable-energy technical-potential CSV.
file_to_load = "Resources/usretechnicalpotential.csv"

# Load the CSV into a DataFrame, then list its column labels so the
# biopower columns needed below can be identified.
renewable_energy_data = pd.read_csv(filepath_or_buffer=file_to_load)
renewable_energy_data.columns

Index(['Unnamed: 0', 'urbanUtilityScalePV_GWh', 'urbanUtilityScalePV_GW',
       'urbanUtilityScalePV_km2', 'ruralUtilityScalePV_GWh',
       'ruralUtilityScalePV_GW', 'ruralUtilityScalePV_km2', 'rooftopPV_GWh',
       'rooftopPV_GW', 'CSP_GWh', 'CSP_GW', 'CSP_km2', 'onshoreWind_GWh',
       'onshoreWind_GW', 'onshoreWind_km2', 'offshoreWind_GWh',
       'offshoreWind_GW', 'offshoreWind_km2', 'biopowerSolid_GWh',
       'biopowerSolid_GW', 'biopowerSolid_BDT', 'biopowerGaseous_GWh',
       'biopowerGaseous_GW', 'biopowerGaseous_Tonnes-CH4',
       'geothermalHydrothermal_GWh', 'geothermalHydrothermal_GW',
       'EGSGeothermal_GWh', 'EGSGeothermal_GW', 'hydropower_GWh',
       'hydropower_GW', 'hydropower_countOfSites'],
      dtype='object')

In [5]:
# The first CSV column was loaded under the placeholder label 'Unnamed: 0';
# it holds the state name (see the preview), so give it a proper label.
renewable_energy_data = renewable_energy_data.rename(columns={'Unnamed: 0': 'State'})

# Keep only the state name and the two biopower generation columns.
bio_power_columns = ['State', 'biopowerSolid_GWh', 'biopowerGaseous_GWh']
bio_power_potential = renewable_energy_data.loc[:, bio_power_columns]
bio_power_potential.head()

Unnamed: 0,State,biopowerSolid_GWh,biopowerGaseous_GWh
0,Alabama,11193,1533
1,Alaska,513,61
2,Arizona,1087,837
3,Arkansas,14381,1063
4,California,12408,15510


In [6]:
# Attach the state lookup columns (including State_id) to the biopower
# figures by matching on the full state name. An outer join keeps rows
# that appear in only one of the two frames.
combined_df = states_data.merge(bio_power_potential, how='outer', on='State')
combined_df.head()

Unnamed: 0,State,Abbrev,Code,State_id,biopowerSolid_GWh,biopowerGaseous_GWh
0,Alabama,Ala.,AL,1,11193,1533
1,Alaska,Alaska,AK,2,513,61
2,Arizona,Ariz.,AZ,3,1087,837
3,Arkansas,Ark.,AR,4,14381,1063
4,California,Calif.,CA,5,12408,15510


In [7]:
# Sanity check: number of rows produced by the outer merge above.
m = combined_df.shape[0]
print(m)

51


In [8]:
# Final biopower table: the two generation columns keyed by State_id.
# Selecting and indexing in one chained expression avoids mutating an
# intermediate selection in place.
bio_power = (
    combined_df[['State_id', 'biopowerSolid_GWh', 'biopowerGaseous_GWh']]
    .set_index('State_id')
)
bio_power.head()

Unnamed: 0_level_0,biopowerSolid_GWh,biopowerGaseous_GWh
State_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11193,1533
2,513,61
3,1087,837
4,14381,1063
5,12408,15510


### Total Net Electricity Generation

In [9]:
# Path of the total net electricity generation (state rankings) CSV.
file_to_load = "Resources/Total Net Electricity Generation-StateRankings.csv"

# Read the state rankings file into a DataFrame and preview its first rows.
total_electricity_data = pd.read_csv(filepath_or_buffer=file_to_load)
total_electricity_data.head(5)

Unnamed: 0,Rank,State,"Total Net Electricity Generation, thousand MWh",Note: Rankings are based on the full source data values.
0,1,TX,38524,
1,2,FL,20787,
2,3,PA,16521,
3,4,CA,15465,
4,5,IL,12997,


In [10]:
# Drop the footnote column and shorten the generation column's label.
# The source unit is "thousand MWh", which is the same quantity as GWh.
total_electricity = (
    total_electricity_data
    .drop(columns='Note: Rankings are based on the full source data values.')
    .rename(columns={'Total Net Electricity Generation, thousand MWh': 'Total_Net(GWh)'})
)
total_electricity.head()

Unnamed: 0,Rank,State,Total_Net(GWh)
0,1,TX,38524
1,2,FL,20787
2,3,PA,16521
3,4,CA,15465
4,5,IL,12997


In [11]:
# Join the state lookup to the electricity rankings. The rankings file
# identifies states by two-letter code in its 'State' column, so match it
# against the lookup's 'Code'. Both frames have a 'State' column, hence the
# State_x / State_y suffixes in the result.
combined_df = states_data.merge(
    total_electricity,
    how='outer',
    left_on='Code',
    right_on='State',
)
combined_df.head()

Unnamed: 0,State_x,Abbrev,Code,State_id,Rank,State_y,Total_Net(GWh)
0,Alabama,Ala.,AL,1,7,AL,10575
1,Alaska,Alaska,AK,2,48,AK,476
2,Arizona,Ariz.,AZ,3,10,AZ,9350
3,Arkansas,Ark.,AR,4,31,AR,3532
4,California,Calif.,CA,5,4,CA,15465


In [12]:
# Consistency check on the outer merge: print every row whose lookup code
# ('Code') differs from the code that came from the rankings file
# ('State_y'). No output means every row found its match.
code_differs = combined_df['Code'] != combined_df['State_y']
mismatched = combined_df.loc[code_differs, ['Code', 'State_y']]
for i, code, merged_code in mismatched.itertuples(name=None):
    print(i, code, merged_code)

In [13]:
# Final electricity table: rank and total net generation keyed by State_id.
# Selecting and indexing in one chained expression avoids mutating an
# intermediate selection in place.
total_electricity_df = (
    combined_df[['State_id', 'Rank', 'Total_Net(GWh)']]
    .set_index('State_id')
)
total_electricity_df.head()

Unnamed: 0_level_0,Rank,Total_Net(GWh)
State_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7,10575
2,48,476
3,10,9350
4,31,3532
5,4,15465


### Load Tables

In [14]:
# Connection settings for the PostgreSQL database. The user name and password
# are not stored in the notebook: they are read from the environment (see the
# .env file), so you must supply your own to run this on your machine.

host = 'localhost'
port = 5432
dbname = 'ETL_Project'
username = os.getenv('USER_NAME')
pwd = os.getenv('PASSWORD')


In [15]:
# Set up the SQLAlchemy engine for the "ETL_Project" database.

# Fail early with a clear message when the credentials were not loaded;
# otherwise the literal text "None" would be interpolated into the URL and
# the failure would only surface later as a confusing authentication error.
if username is None or pwd is None:
    raise RuntimeError(
        "USER_NAME and PASSWORD must be set in the environment (.env file) "
        "before connecting to the database"
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password are not misread as URL delimiters.
connection_string = f'{quote_plus(username)}:{quote_plus(pwd)}@{host}:{port}/{dbname}'
engine = create_engine(f'postgresql://{connection_string}')

In [16]:
# List the tables currently present in the database. Engine.table_names() is
# deprecated since SQLAlchemy 1.4 and removed in 2.0; the Inspector API is the
# supported equivalent.
inspect(engine).get_table_names()

['State_Abb', 'Total_Net_Electricity', 'Bio_Power_Potential']

In [89]:
# Disabled alternative: a direct psycopg2 connection plus a SQL read, kept
# inside a bare string literal so that none of it executes (the cell only
# echoes the string back as its output).
# NOTE(review): the snippet refers to `sqlio`, which is not imported in the
# import cell of this notebook — add the import before re-enabling it, or
# delete this cell if it is no longer needed.
"""
conn = psycopg2.connect("host='{}' port={} dbname='{}' user={} password={}".format(host, port, dbname, username, pwd))
sql = "SQL statement"
dat = sqlio.read_sql_query(sql, conn)"""


'\nconn = psycopg2.connect("host=\'{}\' port={} dbname=\'{}\' user={} password={}".format(host, port, dbname, username, pwd))\nsql = "SQL statement"\ndat = sqlio.read_sql_query(sql, conn)'

In [90]:
# Append data to the PostgreSQL tables through the SQLAlchemy engine. The
# first two loads are commented out, so running this cell only writes to
# 'Bio_Power_Potential'.
# NOTE(review): if_exists='append' adds rows on every run, so re-running this
# cell inserts the same rows again — confirm that is acceptable.
# NOTE(review): this writes the full renewable_energy_data frame (every
# renewable-potential column, keyed by state name), not the bio_power frame
# prepared earlier (State_id plus the two biopower columns) — confirm which
# one the 'Bio_Power_Potential' table schema expects.
# NOTE(review): total_electricity_df carries State_id as its index, so with
# index=False the disabled load below would not write State_id — verify
# before re-enabling it.
#states_data.to_sql(name='State_Abb', con=engine, if_exists='append', index=False)
#total_electricity_df.to_sql(name='Total_Net_Electricity', con=engine, if_exists='append', index=False)
renewable_energy_data.to_sql(name='Bio_Power_Potential', con=engine, if_exists='append', index=False)