In [1]:
import pandas as pd

"""
A. Read the energy data
""" 
energy = pd.read_csv("../dataset/energy/organised_Gen.csv")

"""
B. Find Ohio(OH) data records from 2017/01 to 2021/12
"""
energy_oh = energy.loc[(energy['YEAR'].isin(list(range(2017, 2022)))) & 
           (energy['STATE'] == 'OH')].copy()
energy_oh.drop(energy_oh.loc[energy_oh['ENERGY SOURCE'] == "Total"].index, inplace=True)

energy_oh

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
361051,1375,2017,1,OH,Total Electric Power Industry,Coal,5989876.0
361052,1376,2017,1,OH,Total Electric Power Industry,Hydroelectric Conventional,27109.0
361053,1377,2017,1,OH,Total Electric Power Industry,Natural Gas,2234568.0
361054,1378,2017,1,OH,Total Electric Power Industry,Nuclear,1567839.0
361055,1379,2017,1,OH,Total Electric Power Industry,Other Gases,60208.0
...,...,...,...,...,...,...,...
485518,24752,2021,12,OH,"Electric Generators, Electric Utilities",Other,-50.0
485519,24753,2021,12,OH,"Electric Generators, Electric Utilities",Petroleum,939.0
485520,24754,2021,12,OH,"Electric Generators, Electric Utilities",Solar Thermal and Photovoltaic,474.0
485521,24755,2021,12,OH,"Electric Generators, Electric Utilities",Other Biomass,0.0


In [2]:
"""
C. According to energy source, calculate the yearly sumof electricity GENERATION based on ENERGY SOURCE.
(Note that 'total' is not a type of ENERGY SOURCE) 
"""
energy.drop(energy.loc[energy['ENERGY SOURCE'] == "Total"].index, inplace=True)

year_energy = energy.groupby(['YEAR','ENERGY SOURCE'])['GENERATION (Megawatthours)'].sum()

for year in energy['YEAR'].unique():
    print(year)
    print(year_energy[year])
    print()

2001
ENERGY SOURCE
Coal                              7.615824e+09
Geothermal                        5.496200e+07
Hydroelectric Conventional        8.678442e+08
Natural Gas                       2.556516e+09
Nuclear                           3.075305e+09
Other                             4.762539e+07
Other Biomass                     5.819260e+07
Other Gases                       3.615789e+07
Petroleum                         4.995209e+08
Pumped Storage                   -3.529378e+07
Solar Thermal and Photovoltaic    2.171020e+06
Wind                              2.694933e+07
Wood and Wood Derived Fuels       1.407996e+08
Name: GENERATION (Megawatthours), dtype: float64

2002
ENERGY SOURCE
Coal                              7.732521e+09
Geothermal                        5.796524e+07
Hydroelectric Conventional        1.057315e+09
Natural Gas                       2.764023e+09
Nuclear                           3.120256e+09
Other                             5.410764e+07
Other Biomass      

In [3]:
"""
D. Select the top 3 energy sources generation for every year.
"""
for year in energy['YEAR'].unique():
    print(year)
    print(year_energy[year].nlargest(3))
    print()

2001
ENERGY SOURCE
Coal           7.615824e+09
Nuclear        3.075305e+09
Natural Gas    2.556516e+09
Name: GENERATION (Megawatthours), dtype: float64

2002
ENERGY SOURCE
Coal           7.732521e+09
Nuclear        3.120256e+09
Natural Gas    2.764023e+09
Name: GENERATION (Megawatthours), dtype: float64

2003
ENERGY SOURCE
Coal           7.894947e+09
Nuclear        3.054931e+09
Natural Gas    2.599630e+09
Name: GENERATION (Megawatthours), dtype: float64

2004
ENERGY SOURCE
Coal           7.913202e+09
Nuclear        3.154114e+09
Natural Gas    2.840400e+09
Name: GENERATION (Megawatthours), dtype: float64

2005
ENERGY SOURCE
Coal           8.051492e+09
Nuclear        3.127945e+09
Natural Gas    3.043841e+09
Name: GENERATION (Megawatthours), dtype: float64

2006
ENERGY SOURCE
Coal           7.962045e+09
Natural Gas    3.265763e+09
Nuclear        3.148875e+09
Name: GENERATION (Megawatthours), dtype: float64

2007
ENERGY SOURCE
Coal           8.065822e+09
Natural Gas    3.586359e+09
Nuclear

In [4]:
"""
E. Output a CSV file which should contain 3 columns: YEAR, ENERGY SOURCE, GENERATION
"""
header = ["YEAR", "ENERGY SOURCE", "GENERATION (Megawatthours)"]
energy.to_csv('../output/energy/year_energy_source_generation.csv', columns = header)

energy

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
0,0,2001,1,AK,Total Electric Power Industry,Coal,46903.0
1,1,2001,1,AK,Total Electric Power Industry,Petroleum,71085.0
2,2,2001,1,AK,Total Electric Power Industry,Natural Gas,367521.0
3,3,2001,1,AK,Total Electric Power Industry,Hydroelectric Conventional,104549.0
4,4,2001,1,AK,Total Electric Power Industry,Wind,87.0
...,...,...,...,...,...,...,...
496769,10581,2022,5,WY,"Electric Generators, Electric Utilities",Coal,2071403.0
496770,10582,2022,5,WY,"Electric Generators, Electric Utilities",Hydroelectric Conventional,96790.0
496771,10583,2022,5,WY,"Electric Generators, Electric Utilities",Natural Gas,91570.0
496772,10584,2022,5,WY,"Electric Generators, Electric Utilities",Petroleum,1812.0


In [5]:
"""
F. From the Ohio data (described in B), calculate the yearly sum of total GENERATION.
"""
energy_oh.groupby(['YEAR'])['GENERATION (Megawatthours)'].sum()

YEAR
2017    239108735.0
2018    252369220.0
2019    240002251.0
2020    241985469.0
2021    246671221.0
Name: GENERATION (Megawatthours), dtype: float64

In [6]:
"""
G. Output a CSV file which should contain 2 columns: YEAR, TOTAL_GENERATION
"""
year_total_generation = energy.groupby(['YEAR'])['GENERATION (Megawatthours)'].sum()

year_total_generation.to_csv('../output/energy/year_total_generation.csv')

year_total_generation

YEAR
2001    1.494657e+10
2002    1.543381e+10
2003    1.553274e+10
2004    1.588222e+10
2005    1.622169e+10
2006    1.625881e+10
2007    1.662698e+10
2008    1.647755e+10
2009    1.580132e+10
2010    1.650024e+10
2011    1.640262e+10
2012    1.619106e+10
2013    1.626386e+10
2014    1.637174e+10
2015    1.631040e+10
2016    1.630670e+10
2017    1.613708e+10
2018    1.671311e+10
2019    1.650753e+10
2020    1.602807e+10
2021    1.646216e+10
2022    6.722146e+09
Name: GENERATION (Megawatthours), dtype: float64

In [7]:
"""
H. From the Ohio data again, calculate the summation of total GENERATION based on TYPE OF PRODUCER.
"""
energy_oh.groupby(['TYPE OF PRODUCER'])['GENERATION (Megawatthours)'].sum()

TYPE OF PRODUCER
Combined Heat and Power, Commercial Power             1223287.0
Combined Heat and Power, Electric Power               5971736.0
Combined Heat and Power, Industrial Power             3512833.0
Electric Generators, Electric Utilities              78841367.0
Electric Generators, Independent Power Producers    520519224.0
Total Electric Power Industry                       610068449.0
Name: GENERATION (Megawatthours), dtype: float64

In [8]:
"""
I. Output a CSV file which should contain 2 columns: TYPE OF PRODUCER, GENERATION
"""
type_producer_generation = energy.groupby(['TYPE OF PRODUCER'])['GENERATION (Megawatthours)'].sum()
type_producer_generation = type_producer_generation.drop('Total Electric Power Industry')

type_producer_generation.to_csv('../output/energy/type_producer_generation.csv')

type_producer_generation

TYPE OF PRODUCER
Combined Heat and Power, Commercial Power           4.472786e+08
Combined Heat and Power, Electric Power             6.866598e+09
Combined Heat and Power, Industrial Power           6.225653e+09
Electric Generators, Electric Utilities             1.025074e+11
Electric Generators, Independent Power Producers    5.700226e+10
Name: GENERATION (Megawatthours), dtype: float64