In [1]:
## Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os

## Feature: Solar consumption

In [2]:
## Source: https://www.eia.gov/totalenergy/data/browser/?tbl=T10.05#/?f=M
filename = "MER_T10_05.csv"
filepath = os.path.join("..","data",filename)
eiadf = pd.read_csv(filepath)
print("Original count: ",len(eiadf))
# Filter rows not to be used
values = eiadf.loc[(eiadf['Description'] == "Solar Energy Consumption") & 
                   (eiadf['Value'] != "Not Available")]
print("Solar consumption count is : ",len(values))
# Remove the data for month 13 (total year consumption)
v = values[['Value', 'YYYYMM']]            # Warning msg!!
v["dates"] = v['YYYYMM'].astype(str)       # make a copy to a string type
v = v.loc[~v['dates'].str.contains('13')]  # remove rows tha contain 13 in the string
v = v.drop(columns=["dates"])              # remove column added
print("After cleanup count is : ",len(v))
v.to_csv("../data/monthdata.csv", index=False)
v.head()

Original count:  6732
Solar consumption count is :  445
After cleanup count is :  399


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,Value,YYYYMM
6287,-0.001,198401
6288,0.001,198402
6289,0.002,198403
6290,0.003,198404
6291,0.007,198405


## Feature: Temperature

In [3]:
filename = "TemperatureIndex(REDTI).csv"
filepath = os.path.join("..","data",filename)

temps = pd.read_csv(filepath,index_col=False)
print("Original count: ", len(temps))
t = temps.loc[temps['YEAR']>=1984]
print("Temperature counts: ", len(t))
t.head()

Original count:  1482
Temperature counts:  414


Unnamed: 0,YEAR,REDTI,DEGREE DAYS,MONTH,old,YYYYMM
89,1984,61.6,998.5,1,19841,198401
90,1985,73.5,1056.2,1,19851,198501
91,1986,32.9,858.5,1,19861,198601
92,1987,46.0,922.3,1,19871,198701
93,1988,62.5,1002.7,1,19881,198801


In [4]:
## extra 15 rows of temperature ????

## Feature: Price

In [5]:
## Source: http://www.ecdms.energy.ca.gov
filename = "MER_T09_08.csv"
filepath = os.path.join("..","data",filename)

prices = pd.read_csv(filepath,index_col=False)
print("Original count: ",len(prices))
residential = prices.loc[(prices['Description']== "Average Retail Price of Electricity, Residential") & 
                         (prices['Value'] != "Not Available")]
r = residential.drop(columns=['MSN','Column_Order','Description','Unit'])
print(" residential price count: ", len(r))
r["dates"] = r['YYYYMM'].astype(str)
#remove rows tha contain 13 in the string
r = r.loc[~r['dates'].str.contains('13')]
r = r.drop(columns=["dates"])
print(" after cleanup count: ", len(r))
r.head()

Original count:  3390
 residential price count:  487
 after cleanup count:  417


Unnamed: 0,YYYYMM,Value
22,197607,3.9
23,197608,3.7
24,197609,3.8
25,197610,3.9
26,197611,3.8


## Adding features = X

In [6]:
# Merge values and price 
merge1 = v.merge(r, on="YYYYMM", how="inner")
merge1 = merge1.rename(columns={"Value_x": "Value", "Value_y": "Price"})
merge1.head()

Unnamed: 0,Value,YYYYMM,Price
0,3.132,199001,7.17
1,3.395,199002,7.48
2,4.685,199003,7.57
3,5.16,199004,7.69
4,5.863,199005,7.96


In [7]:
# Merge Temperature


In [8]:
merge2 =merge1.merge(t,on="YYYYMM", how="inner")

In [9]:
merge2 = merge2[['YYYYMM','Value','Price','REDTI']]

In [10]:
merge2.head()

Unnamed: 0,YYYYMM,Value,Price,REDTI
0,199001,3.132,7.17,8.8
1,199002,3.395,7.48,22.7
2,199003,4.685,7.57,29.8
3,199004,5.16,7.69,26.8
4,199005,5.863,7.96,53.9


In [11]:
len(merge2)

327

#### Process dates: months into features

In [12]:
merge2["dates"] = pd.to_datetime(merge2["YYYYMM"], format="%Y%m")

In [13]:
merge2.head()

Unnamed: 0,YYYYMM,Value,Price,REDTI,dates
0,199001,3.132,7.17,8.8,1990-01-01
1,199002,3.395,7.48,22.7,1990-02-01
2,199003,4.685,7.57,29.8,1990-03-01
3,199004,5.16,7.69,26.8,1990-04-01
4,199005,5.863,7.96,53.9,1990-05-01


In [14]:
merge2['year'] = merge2['dates'].dt.year
merge2['month'] = merge2['dates'].dt.month
merge2.head()

Unnamed: 0,YYYYMM,Value,Price,REDTI,dates,year,month
0,199001,3.132,7.17,8.8,1990-01-01,1990,1
1,199002,3.395,7.48,22.7,1990-02-01,1990,2
2,199003,4.685,7.57,29.8,1990-03-01,1990,3
3,199004,5.16,7.69,26.8,1990-04-01,1990,4
4,199005,5.863,7.96,53.9,1990-05-01,1990,5


In [15]:
data = merge2[['Value', 'year', 'month', 'Price','REDTI']]

In [16]:
data.head()

Unnamed: 0,Value,year,month,Price,REDTI
0,3.132,1990,1,7.17,8.8
1,3.395,1990,2,7.48,22.7
2,4.685,1990,3,7.57,29.8
3,5.16,1990,4,7.69,26.8
4,5.863,1990,5,7.96,53.9


In [17]:
datacoded = data.copy()

X = pd.get_dummies(datacoded, columns=["month"])
X.head()

Unnamed: 0,Value,year,Price,REDTI,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,3.132,1990,7.17,8.8,1,0,0,0,0,0,0,0,0,0,0,0
1,3.395,1990,7.48,22.7,0,1,0,0,0,0,0,0,0,0,0,0
2,4.685,1990,7.57,29.8,0,0,1,0,0,0,0,0,0,0,0,0
3,5.16,1990,7.69,26.8,0,0,0,1,0,0,0,0,0,0,0,0
4,5.863,1990,7.96,53.9,0,0,0,0,1,0,0,0,0,0,0,0


In [18]:
len(X)

327

In [19]:
X.to_csv("../data/features.csv")