In [1]:
import pandas as pd

"""
A. Read the energy data
"""
energy = pd.read_csv("../dataset/energy/organised_Gen.csv")

# Discard the aggregate rows: producer type 'Total Electric Power Industry'
# and energy source 'Total'. Only the detail rows are kept.
is_total_producer = energy['TYPE OF PRODUCER'] == 'Total Electric Power Industry'
is_total_source = energy['ENERGY SOURCE'] == 'Total'
energy = energy.loc[~(is_total_producer | is_total_source)]

"""
B. Find Ohio(OH) data records from 2017/01 to 2021/12
"""
# range(2017, 2022) covers the years 2017 through 2021 inclusive.
in_study_years = energy['YEAR'].isin(range(2017, 2022))
is_ohio = energy['STATE'] == 'OH'
energy_oh = energy.loc[in_study_years & is_ohio].copy()

energy_oh

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
361063,1387,2017,1,OH,"Combined Heat and Power, Industrial Power",Coal,865.0
361064,1388,2017,1,OH,"Combined Heat and Power, Industrial Power",Natural Gas,17511.0
361065,1389,2017,1,OH,"Combined Heat and Power, Industrial Power",Other Gases,13276.0
361066,1390,2017,1,OH,"Combined Heat and Power, Industrial Power",Other,945.0
361067,1391,2017,1,OH,"Combined Heat and Power, Industrial Power",Petroleum,56.0
...,...,...,...,...,...,...,...
485518,24752,2021,12,OH,"Electric Generators, Electric Utilities",Other,-50.0
485519,24753,2021,12,OH,"Electric Generators, Electric Utilities",Petroleum,939.0
485520,24754,2021,12,OH,"Electric Generators, Electric Utilities",Solar Thermal and Photovoltaic,474.0
485521,24755,2021,12,OH,"Electric Generators, Electric Utilities",Other Biomass,0.0


In [2]:
"""
C. For each year, calculate the sum of electricity GENERATION per ENERGY SOURCE.
(Note that 'total' is not a type of ENERGY SOURCE)
"""
# Group the Ohio records by (YEAR, ENERGY SOURCE), then add up the generation
# column within each group; the result is a Series with a two-level index.
by_year_and_source = energy_oh.groupby(['YEAR', 'ENERGY SOURCE'])
year_energy_oh = by_year_and_source['GENERATION (Megawatthours)'].sum()

year_energy_oh

YEAR  ENERGY SOURCE                 
2017  Coal                              68343636.0
      Hydroelectric Conventional          277349.0
      Natural Gas                       28799515.0
      Nuclear                           17687789.0
      Other                                -1540.0
      Other Biomass                       469134.0
      Other Gases                         785851.0
      Petroleum                          1241105.0
      Solar Thermal and Photovoltaic      105181.0
      Wind                               1588561.0
      Wood and Wood Derived Fuels         257787.0
2018  Coal                              58726961.0
      Hydroelectric Conventional          244019.0
      Natural Gas                       44214505.0
      Nuclear                           18315007.0
      Other                                 5431.0
      Other Biomass                       472597.0
      Other Gases                         786890.0
      Petroleum                          1315

In [3]:
"""
D. Select the top 3 energy sources by generation for every year.
"""

# Walk the years actually present in year_energy_oh (ascending) instead of a
# hard-coded range, so a missing year cannot raise KeyError.
# Iterating (source, generation) pairs keeps each value attached to its own
# label; looking the label up again by value would report the same source
# twice whenever two sources have equal generation in a year.
top3_records = [
    (year, source, generation)
    for year, yearly in year_energy_oh.groupby(level='YEAR')
    for source, generation in yearly.droplevel('YEAR').nlargest(3).items()
]

# One row per (year, source), ordered by year and then by descending generation.
top3_year_energy_oh = pd.DataFrame(
    top3_records,
    columns=['YEAR', 'ENERGY SOURCE', 'GENERATION (Megawatthours)'],
)

top3_year_energy_oh

Unnamed: 0,YEAR,ENERGY SOURCE,GENERATION (Megawatthours)
0,2017,Coal,68343636.0
1,2017,Natural Gas,28799515.0
2,2017,Nuclear,17687789.0
3,2018,Coal,58726961.0
4,2018,Natural Gas,44214505.0
5,2018,Nuclear,18315007.0
6,2019,Natural Gas,51325455.0
7,2019,Coal,46764527.0
8,2019,Nuclear,17010561.0
9,2020,Natural Gas,52381593.0


In [4]:
"""
E. Output a CSV file which should contain 3 columns: YEAR, ENERGY SOURCE, GENERATION
"""
# index=False: the frame's RangeIndex carries no information, and writing it
# would add a fourth, unnamed column to a file specified to have three.
# The generation column keeps its full 'GENERATION (Megawatthours)' header.
top3_year_energy_oh.to_csv('../output/energy/top3_year_energy_oh.csv', index=False)

top3_year_energy_oh

Unnamed: 0,YEAR,ENERGY SOURCE,GENERATION (Megawatthours)
0,2017,Coal,68343636.0
1,2017,Natural Gas,28799515.0
2,2017,Nuclear,17687789.0
3,2018,Coal,58726961.0
4,2018,Natural Gas,44214505.0
5,2018,Nuclear,18315007.0
6,2019,Natural Gas,51325455.0
7,2019,Coal,46764527.0
8,2019,Nuclear,17010561.0
9,2020,Natural Gas,52381593.0


In [5]:
"""
F. From the Ohio data (described in B), calculate the yearly sum of total GENERATION.
"""
# Group the Ohio records by YEAR and add up generation across all remaining
# rows of each year; the result is a Series indexed by YEAR.
by_year = energy_oh.groupby(by=['YEAR'])
energy_oh_year = by_year['GENERATION (Megawatthours)'].sum()

energy_oh_year

YEAR
2017    119554368.0
2018    126184608.0
2019    120001123.0
2020    120992735.0
2021    123335613.0
Name: GENERATION (Megawatthours), dtype: float64

In [6]:
"""
G. Output a CSV file which should contain 2 columns: YEAR, TOTAL_GENERATION
"""
# The Series is named 'GENERATION (Megawatthours)'; rename it for the export so
# the CSV header matches the required TOTAL_GENERATION column. reset_index()
# turns the YEAR index into an ordinary column, and index=False then writes
# exactly the two columns YEAR and TOTAL_GENERATION.
# energy_oh_year itself is left unchanged (rename returns a new Series).
(
    energy_oh_year
    .rename('TOTAL_GENERATION')
    .reset_index()
    .to_csv('../output/energy/energy_oh_year.csv', index=False)
)

energy_oh_year

YEAR
2017    119554368.0
2018    126184608.0
2019    120001123.0
2020    120992735.0
2021    123335613.0
Name: GENERATION (Megawatthours), dtype: float64

In [7]:
"""
H. From the Ohio data again, calculate the summation of total GENERATION based on TYPE OF PRODUCER.
"""
# Group the Ohio records by producer type and add up generation over every
# year and energy source in the selection; the result is a Series indexed by
# TYPE OF PRODUCER.
by_producer = energy_oh.groupby(by=['TYPE OF PRODUCER'])
total_gen_producer = by_producer['GENERATION (Megawatthours)'].sum()

total_gen_producer

TYPE OF PRODUCER
Combined Heat and Power, Commercial Power             1223287.0
Combined Heat and Power, Electric Power               5971736.0
Combined Heat and Power, Industrial Power             3512833.0
Electric Generators, Electric Utilities              78841367.0
Electric Generators, Independent Power Producers    520519224.0
Name: GENERATION (Megawatthours), dtype: float64

In [8]:
"""
I. Output a CSV file which should contain 2 columns: TYPE OF PRODUCER, GENERATION
"""
# Writing the Series emits its index (TYPE OF PRODUCER) followed by its values
# (GENERATION (Megawatthours)), one line per producer type.
total_gen_producer.to_csv(path_or_buf='../output/energy/type_producer_generation.csv')