In [98]:
# Point R at the project directory so that the relative paths used later in
# this notebook ('Helpers/...', 'Further_Preprocess_Analysis/Data/') resolve.
# NOTE(review): this absolute path is machine-specific; adjust it when running
# the notebook on another machine.
setwd('C:/Users/iceca/Documents/Earthquake_Damage_Predictor/')
# Attach the tidyverse packages (dplyr's select() is used for the column
# filtering further down).
library(tidyverse)


In [99]:
# Load the preprocessed training and validation splits. Each loader is called
# once and its result unpacked, instead of being invoked a second time just to
# read the other element: element 1 is kept as the data, element 2 as the labels.
loadPr <- modules::use('Helpers/Load_Preprocessed.R')
trainParts <- loadPr$loadTrain()
train <- trainParts[[1]]
trainLab <- trainParts[[2]]

valParts <- loadPr$loadVal()
val <- valParts[[1]]
valLab <- valParts[[2]]

# The test set comes from the raw-data helper module rather than the
# preprocessed one.
loadRaw <- modules::use('Helpers/Load_Raw_Data.R')
test <- loadRaw$testVal()

In [112]:
# Drop the column named X from `data` and hand the table back.
# A table without an X column is returned unchanged.
removeId <- function(data) {
    data[["X"]] <- NULL

    # building_id is deliberately kept: it may be needed later to join the
    # data values with the data labels.
    # data[["building_id"]] <- NULL

    data
}
# Merge the plan_configuration levels a, c, f, m, n, o and s into a single
# "other" level; every remaining level is left as it is.
removeLevelsPlan <- function(data) {
    mergedLevels <- c("a", "c", "f", "m", "n", "o", "s")
    # Named vector of the form c(a = "other", c = "other", ...) as expected by
    # plyr::revalue (names are the old levels, values the new level).
    levelMap <- stats::setNames(rep("other", length(mergedLevels)), mergedLevels)
    data$plan_configuration <- plyr::revalue(data$plan_configuration, levelMap)
    data
}
# Clean the five datasets and write them as CSV files under
# 'Further_Preprocess_Analysis/Data/'.
#
# Arguments:
#   tr, val       training / validation feature tables
#   trLab, valLab training / validation label tables
#   test          test feature table
#   name          prefix of the output files (<name>_train.csv, <name>_val.csv, ...)
#
# The X column is removed from the training and validation tables; the test
# table is left untouched by removeId because its ids are needed for submission.
# When the feature tables carry a plan_configuration column, its levels are
# merged with removeLevelsPlan in the training, validation and test tables.
saveData <- function(tr, trLab, val, valLab, test, name) {
    tr <- removeId(tr)
    trLab <- removeId(trLab) # labels have no plan_configuration, so removeLevelsPlan is not called
    val <- removeId(val)
    valLab <- removeId(valLab) # labels have no plan_configuration, so removeLevelsPlan is not called
    # do not remove the ids of the test set, they are needed for submission

    # Test against the column NAMES. `%in% tr` would compare against the
    # columns' contents, so the merge below would never run.
    # The training set is used for the check, but any feature table would do.
    if ("plan_configuration" %in% names(tr)) {
        tr <- removeLevelsPlan(tr)
        val <- removeLevelsPlan(val)
        test <- removeLevelsPlan(test)
    }

    rootDir <- 'Further_Preprocess_Analysis/Data/'
    write.csv(tr, paste0(rootDir, name, '_train.csv'))
    write.csv(trLab, paste0(rootDir, name, '_train_lab.csv'))
    write.csv(val, paste0(rootDir, name, '_val.csv'))
    write.csv(valLab, paste0(rootDir, name, '_val_lab.csv'))
    # Write the test table itself (previously the validation labels were
    # written into the *_test.csv file by mistake).
    write.csv(test, paste0(rootDir, name, '_test.csv'))
}

In [113]:
# Show the training columns, then save the datasets without any extra feature
# engineering, using the 'original' file prefix.
names(train)
saveData(tr = train, trLab = trainLab,
         val = val, valLab = valLab,
         test = test, name = 'original')

### Making a dataset with more features

In [114]:
# Add seven aggregated has_superstructure_* indicator columns to `data`.
# Each new column is the element-wise OR of the existing superstructure
# indicators that share a material, so the new columns are logical vectors.
# Returns `data` with the extra columns appended; existing columns are untouched.
featureEngineerAdd <- function(data) {
    # plant-based materials
    data$has_superstructure_tree <-
        data$has_superstructure_bamboo |
        data$has_superstructure_timber
    # anything bound with mortar (mud or cement, stone or brick)
    data$has_superstructure_mortar <-
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_cement_mortar_stone |
        data$has_superstructure_mud_mortar_brick |
        data$has_superstructure_cement_mortar_brick
    # Both cement-mortar variants. The second operand used to be
    # has_superstructure_timber, which is not a cement indicator.
    data$has_superstructure_cement <-
        data$has_superstructure_cement_mortar_stone |
        data$has_superstructure_cement_mortar_brick
    data$has_superstructure_brick <-
        data$has_superstructure_mud_mortar_brick |
        data$has_superstructure_cement_mortar_brick
    data$has_superstructure_mud <-
        data$has_superstructure_adobe_mud |
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_mud_mortar_brick
    data$has_superstructure_concrete <-
        data$has_superstructure_rc_non_engineered |
        data$has_superstructure_rc_engineered
    data$has_superstructure_stone <-
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_stone_flag |
        data$has_superstructure_cement_mortar_stone
    data
}

In [115]:
# Save the datasets with the extra featureEngineerAdd columns under the
# 'expanded' file prefix; the label tables are passed through unchanged.
saveData(tr = featureEngineerAdd(train), trLab = trainLab,
         val = featureEngineerAdd(val), valLab = valLab,
         test = featureEngineerAdd(test), name = 'expanded')

### Making a dataset with fewer features and levels

In [116]:
# Build a reduced version of `data`: merge some levels of foundation_type and
# roof_type into "other", then keep only a fixed list of 24 columns.
# Returns the filtered table; selection fails if any listed column is missing.
featureEngineerRemove <- function(data) {
    newLevel <- "other"
    # according to the analysis, combine levels that were strikingly similar
    data$foundation_type <- plyr::revalue(data$foundation_type,
                                          c("u" = newLevel, "w" = newLevel))
    data$roof_type <- plyr::revalue(data$roof_type,
                                    c("n" = newLevel, "q" = newLevel))

    # select() is namespace-qualified like plyr::revalue above, and its result
    # is returned as the last expression instead of being piped into return().
    dplyr::select(
        data,
        # first, all binary/categorical features that were found to be
        # effective in the analysis stage
        foundation_type, roof_type, ground_floor_type, other_floor_type,
        legal_ownership_status,
        has_superstructure_stone_flag, has_superstructure_cement_mortar_stone,
        has_superstructure_cement_mortar_brick,
        has_superstructure_rc_non_engineered, has_superstructure_rc_engineered,
        has_secondary_use_hotel, has_secondary_use_rental,
        has_secondary_use_institution, has_secondary_use_school,
        has_secondary_use_industry, has_secondary_use_gov_office,
        # then all other features that were considered relevant according to
        # research conducted on the internet
        geo_level_1_id, geo_level_2_id, geo_level_3_id, count_floors_pre_eq,
        age, area_percentage, height_percentage, land_surface_condition
    )
}

In [None]:
# Save the datasets reduced by featureEngineerRemove under the 'filtered' file
# prefix; the label tables are passed through unchanged.
saveData(tr = featureEngineerRemove(train), trLab = trainLab,
         val = featureEngineerRemove(val), valLab = valLab,
         test = featureEngineerRemove(test), name = 'filtered')