In [8]:
# Load Libraries
library(MASS)
library(dplyr)
library(reshape2)

# Paths are relative to the directory of this file.  If you need absolute paths, change
# these variables or set your working directory to the directory of this file.
# PATH_VARS[1] is the script that gets source()'d to load the EIA data,
# PATH_VARS[2] is the default climate data file read by load_weather_data().
ENRGENICS_IMPORTEIA_PATH = "../ENRgenics_ImportEIA/ENRgenics_ImportEIA.r"
ENRGENICS_CLIMATE_DATA_PATH = "../data/climdiv-tmpcst-v1.0.0-20160605"
PATH_VARS = c(ENRGENICS_IMPORTEIA_PATH, ENRGENICS_CLIMATE_DATA_PATH)

# Legend Variable
# Maps the 3-character state code found at the start of each climate record
# to a state name.  Codes missing from this table are reported as "Other" by stfind().
# see ftp://ftp.ncdc.noaa.gov/pub/data/cirs/climdiv/state-readme.txt
statecodelegend = c(
  "001"="Alabama",       "030"="New York",
  "002"="Arizona",       "031"="North Carolina",
  "003"="Arkansas",      "032"="North Dakota",
  "004"="California",    "033"="Ohio",
  "005"="Colorado",      "034"="Oklahoma",
  "006"="Connecticut",   "035"="Oregon",
  "007"="Delaware",      "036"="Pennsylvania",
  "008"="Florida",       "037"="Rhode Island",
  "009"="Georgia",       "038"="South Carolina",
  "010"="Idaho",         "039"="South Dakota",
  "011"="Illinois",      "040"="Tennessee",
  "012"="Indiana",       "041"="Texas",
  "013"="Iowa",          "042"="Utah",
  "014"="Kansas",        "043"="Vermont",
  "015"="Kentucky",      "044"="Virginia",
  "016"="Louisiana",     "045"="Washington",
  "017"="Maine",         "046"="West Virginia",
  "018"="Maryland",      "047"="Wisconsin",
  "019"="Massachusetts", "048"="Wyoming",
  "020"="Michigan",      "050"="Alaska",
  "021"="Minnesota",
  "022"="Mississippi",
  "023"="Missouri",
  "024"="Montana",
  "025"="Nebraska",
  "026"="Nevada",
  "027"="New Hampshire",
  "028"="New Jersey",
  "029"="New Mexico",
  "049"="Hawaii"
)

# Helper Functions
# Split a record identifier into its fixed-width fields.
# Returns c(statecode, divnum, elcode, year), taken from character
# positions 1-3, 4, 5-6 and 7-10 of x respectively.
splitel <- function(x) {
  field_start <- c(1, 4, 5, 7)
  field_stop <- c(3, 4, 6, 10)
  pieces <- lapply(seq_along(field_start), function(i) {
    substr(x, field_start[i], field_stop[i])
  })
  unlist(pieces)
}
# Look up the state name for one or more state codes.
#
# Args:
#   x: state code(s), matched against names(statecodelegend), e.g. "001".
# Returns:
#   An unnamed character vector the same length as x holding the state name,
#   or "Other" where the code is not in statecodelegend (including NA).
#   A single code gives a single string, exactly as before; vectors and
#   zero-length input are now handled instead of failing in `if`.
stfind <- function(x) {
  idx <- match(x, names(statecodelegend))
  r <- unname(statecodelegend[idx])
  # unmatched codes index as NA; report them with the fallback label
  r[is.na(idx)] <- "Other"
  return(r)
}


# Load the climate data file and reshape it to one row per record and month.
#
# Args:
#   file: path to a whitespace-delimited file whose first column is the
#         10-character record identifier decoded by splitel() and whose
#         remaining columns are the monthly values.  Defaults to PATH_VARS[2].
# Returns:
#   A data frame with columns col1 (raw identifier), statecode, divnum, elcode,
#   Year (integer), Month (integer, 1 = first value column), temp (numeric)
#   and statename (from stfind(), "Other" for unknown codes).
load_weather_data <- function(file = PATH_VARS[2]) {
    # read everything as character so the leading zeros of the identifier survive
    dat = read.table(file, colClasses = "character")

    # decode the identifier in column 1 into its four fields (columns X1..X4)
    codeparts = lapply(dat[, 1], splitel)
    codeparts = data.frame(matrix(unlist(codeparts), nrow = length(codeparts), byrow = TRUE),
                           stringsAsFactors = FALSE)
    dat = cbind(dat, codeparts)
    # V1, V2, ... -> "1", "2", ... so the value column names can be turned into numbers
    colnames(dat) <- gsub("V", "", names(dat))

    # keep the identifier and its decoded fields as ids; every other column is a monthly value
    idcols = c(names(dat)[1], names(codeparts))
    dat_long = reshape2::melt(dat, id.vars = idcols)
    # value columns start at position 2, so subtract 1 to get the month number
    dat_long$variable = as.integer(as.character(dat_long$variable)) - 1L
    colnames(dat_long) = c("col1", "statecode", "divnum", "elcode", "Year", "Month", "temp")

    # Derive the state name from the long table's own statecode column rather than
    # recycling a vector built from the wide table, so it cannot depend on melt's row order.
    codes = unique(dat_long$statecode)
    lookup = vapply(codes, stfind, character(1))
    dat_long$statename = unname(lookup[dat_long$statecode])

    # NOTE(review): the NOAA readme reportedly marks missing months with -99.90;
    # such values are kept as-is here -- confirm whether they should become NA.
    dat_long$temp = as.numeric(dat_long$temp)
    dat_long$Year = as.integer(dat_long$Year)
    return(dat_long)
}

# Build a per-state lookup table from R's built-in state datasets.
#
# Returns:
#   A data frame with one row per state (row names are the state names) holding
#   the numeric state.x77 columns (Population, Income, Illiteracy, `Life Exp`,
#   Murder, `HS Grad`, Frost, Area) plus two character columns:
#   State (abbreviation, from state.abb) and statename (from state.name).
load_state_data <- function() {
    # Convert the numeric matrix directly; cbind()-ing the character vectors onto it
    # first would coerce every statistic to text and force a parse back to numbers.
    st1 = as.data.frame(state.x77, stringsAsFactors = FALSE)
    st1$State = state.abb
    st1$statename = state.name
    return(st1)
}

# Attach the columns of df2 to df1 with a left join.
# No join keys are given, so left_join() joins on the column names the two
# data frames have in common and reports which ones it used.
add_data <- function(df1, df2) {
    joined <- left_join(df1, df2)
    return(joined)
}

# Load the EIA data from `file` and enrich it with weather and state statistics.
#
# Args:
#   file: path handed to load_eia_data().
# Returns:
#   The EIA data frame left-joined with the monthly temperature and the
#   per-state statistics.
load_eia_data_with_all_others <- function(file) {
    # load_eia_data() is not defined in this file; presumably the sourced script provides it
    source(PATH_VARS[1])
    eiadata <- load_eia_data(file)

    # weather first, then state statistics joined onto it
    weather <- load_weather_data()
    states <- load_state_data()
    otherdata <- add_data(weather, states)

    # only these columns are carried over to the EIA data
    keep <- c("Month", "Year", "State", "statename", "temp",
              "Population", "Income", "Illiteracy", "Life Exp",
              "Murder", "HS Grad", "Frost", "Area")
    alldata <- add_data(eiadata, otherdata[, keep])
    return(alldata)
}


In [9]:
# Sample usage: how to source this file from another quantlet.
# The paths below are relative and assume the calling quantlet lives in a
# sibling folder of this one.  If that is not the case, use absolute paths
# or set the working directory accordingly.

# load data from Quantlet
#ENRGENICS_ADDOTHER_PATH = "../ENRgenics_AddOther/ENRgenics_AddOther.r"
#source(ENRGENICS_ADDOTHER_PATH)

# start here to directly load data from this file for testing
# paths for ImportEIA and climate Data
#ENRGENICS_IMPORTEIA_PATH = "../ENRgenics_ImportEIA/ENRgenics_ImportEIA.r"
#ENRGENICS_CLIMATE_DATA_PATH = "../data/climdiv-tmpcst-v1.0.0-20160605"
#PATH_VARS = c(ENRGENICS_IMPORTEIA_PATH, ENRGENICS_CLIMATE_DATA_PATH)
# location of EIA data file
#EIA_DATA_PATH = "../data/sales_revenue.csv.0"
#file = EIA_DATA_PATH
#df = load_eia_data_with_all_others(file)
#head(df)

Joining by: "statename"
Joining by: c("Year", "Month", "State")


Unnamed: 0,Year,Month,State,DataStatus,Date,Cat,Revenue,Sales,Customers,Price,ID,statename,temp,Population,Income,Illiteracy,Life Exp,Murder,HS Grad,Frost,Area
1,1990,1,AK,Final,631195200,RESIDENTIAL,17477,181752,0,9.62,1,Alaska,-1.5,365,6315,1.5,69.31,11.3,66.7,152,566432
2,1990,1,AL,Final,631195200,RESIDENTIAL,123332,2070093,0,5.96,2,Alabama,49.8,3615,3624,2.1,69.05,15.1,41.3,20,50708
3,1990,1,AR,Final,631195200,RESIDENTIAL,72506,1026320,0,7.06,3,Arkansas,45.6,2110,3378,1.9,70.66,10.1,39.9,65,51945
4,1990,1,AZ,Final,631195200,RESIDENTIAL,109332,1396499,0,7.83,4,Arizona,40.8,2212,4530,1.8,70.55,7.8,58.1,15,113417
5,1990,1,CA,Final,631195200,RESIDENTIAL,597161,6168009,0,9.68,5,California,43.1,21198,5114,1.1,71.71,10.3,62.6,20,156361
6,1990,1,CO,Final,631195200,RESIDENTIAL,71325,1046805,0,6.81,6,Colorado,26.4,2541,4884,0.7,72.06,6.8,63.9,166,103766
