In [1]:
# NOTE(review): absolute, machine-specific project root. The relative paths
# used further down ('Helpers/...', 'Further_Preprocess_Analysis/Data/') are
# resolved against this directory, so adjust it when running elsewhere.
setwd('C:/Users/iceca/Documents/Earthquake_Damage_Predictor/')
# Attaches the tidyverse packages; dplyr's select() and %>% are used below.
library(tidyverse)


Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1       v purrr   0.3.4  
v tibble  2.1.1       v dplyr   0.8.0.1
v tidyr   0.8.3       v stringr 1.4.0  
v readr   1.3.1       v forcats 0.4.0  
"package 'purrr' was built under R version 3.6.3"-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [2]:
# Load the preprocessed train/validation splits and the raw test set.
loadPr <- modules::use('Helpers/Load_Preprocessed.R')

# Call each loader exactly once and unpack its result, instead of invoking it
# separately per element: this avoids loading every split twice and guarantees
# that features and labels come from the same call.
# NOTE(review): element 1 is used as the feature table and element 2 as the
# label table -- confirm against Helpers/Load_Preprocessed.R.
trainParts <- loadPr$loadTrain()
train <- trainParts[[1]]
trainLab <- trainParts[[2]]

valParts <- loadPr$loadVal()
val <- valParts[[1]]
valLab <- valParts[[2]]

loadRaw <- modules::use('Helpers/Load_Raw_Data.R')
test <- loadRaw$testVal()

In [3]:
# Drop the `X` column from `data` (a no-op when the column is absent) and
# return the resulting table.
removeId <- function(data) {
    data[["X"]] <- NULL
    data
}
# Merge the plan_configuration levels a, c, f, m, n, o and s into a single
# "other" level; every remaining level is left untouched.
# Returns `data` with the recoded plan_configuration column.
removeLevelsPlan <- function(data) {
    mergedLevels <- c("a", "c", "f", "m", "n", "o", "s")
    # Named vector in the old-name -> new-name form plyr::revalue expects.
    recodeMap <- setNames(rep("other", length(mergedLevels)), mergedLevels)
    data$plan_configuration <- plyr::revalue(data$plan_configuration, recodeMap)
    data
}
# Convert the damage_grade column of `data` to a factor and return the table.
labelToFactor <- function(data) {
    gradeFactor <- as.factor(data$damage_grade)
    data$damage_grade <- gradeFactor
    data
}
# Clean the five tables and write them to CSV files named
# <rootDir><prefix>_train.csv, _train_lab.csv, _val.csv, _val_lab.csv and
# _test.csv.
#
# Arguments:
#   tr, val, test  feature tables for the train / validation / test sets
#   trLab, valLab  label tables containing a damage_grade column
#   prefix         file-name prefix identifying the dataset variant
#   rootDir        output directory, including the trailing separator;
#                  created (recursively) when it does not exist yet
#
# Called for its side effect of writing files; returns NULL invisibly.
saveData <- function(tr, trLab, val, valLab, test, prefix,
                     rootDir = 'Further_Preprocess_Analysis/Data/') {
    tr <- removeId(tr)
    # Labels have no plan_configuration column, so removeLevelsPlan is not
    # applied to them. Converting damage_grade to a factor makes write.csv
    # emit the grades as quoted strings.
    trLab <- labelToFactor(removeId(trLab))
    val <- removeId(val)
    valLab <- labelToFactor(removeId(valLab))
    # removeId only drops the X column; the building_id column of the test
    # set is kept because it is needed for the submission.
    test <- removeId(test)
    # The training set decides whether the column exists; any of the feature
    # tables could be used for this check instead.
    if ("plan_configuration" %in% names(tr)) {
        tr <- removeLevelsPlan(tr)
        val <- removeLevelsPlan(val)
        test <- removeLevelsPlan(test)
    }
    # write.csv cannot create missing directories, so make sure the target
    # exists before the first write.
    if (!dir.exists(rootDir)) {
        dir.create(rootDir, recursive = TRUE)
    }
    writeOne <- function(table, suffix) {
        write.csv(table, paste0(rootDir, prefix, suffix))
    }
    writeOne(tr, '_train.csv')
    writeOne(trLab, '_train_lab.csv')
    writeOne(val, '_val.csv')
    writeOne(valLab, '_val_lab.csv')
    writeOne(test, '_test.csv')
    invisible(NULL)
}

In [17]:
# Write the unmodified feature set under the 'original' prefix.
saveData(tr = train, trLab = trainLab,
         val = val, valLab = valLab,
         test = test, prefix = 'original')

### Making a dataset with more features

In [18]:
# Add seven aggregate has_superstructure_* columns to `data`, each the
# element-wise OR of existing has_superstructure_* material flags.
#
# `data` must already contain the bamboo, timber, adobe_mud, stone_flag,
# mud_mortar_stone, cement_mortar_stone, mud_mortar_brick,
# cement_mortar_brick, rc_non_engineered and rc_engineered flags.
# Returns `data` with the new logical columns appended.
featureEngineerAdd <- function(data) {
    data$has_superstructure_tree <-
        data$has_superstructure_bamboo | data$has_superstructure_timber
    data$has_superstructure_mortar <-
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_cement_mortar_stone |
        data$has_superstructure_mud_mortar_brick |
        data$has_superstructure_cement_mortar_brick
    # Cement is present in both cement-mortar variants. The previous version
    # OR-ed cement_mortar_stone with the timber flag, which marked timber
    # buildings as cement and missed cement-mortar brick buildings.
    data$has_superstructure_cement <-
        data$has_superstructure_cement_mortar_stone |
        data$has_superstructure_cement_mortar_brick
    data$has_superstructure_brick <-
        data$has_superstructure_mud_mortar_brick |
        data$has_superstructure_cement_mortar_brick
    data$has_superstructure_mud <-
        data$has_superstructure_adobe_mud |
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_mud_mortar_brick
    data$has_superstructure_concrete <-
        data$has_superstructure_rc_non_engineered |
        data$has_superstructure_rc_engineered
    data$has_superstructure_stone <-
        data$has_superstructure_mud_mortar_stone |
        data$has_superstructure_stone_flag |
        data$has_superstructure_cement_mortar_stone
    data
}

In [19]:
# Build the feature-augmented variant of every split, then write it under
# the 'expanded' prefix.
expandedTrain <- featureEngineerAdd(train)
expandedVal <- featureEngineerAdd(val)
expandedTest <- featureEngineerAdd(test)
saveData(expandedTrain, trainLab, expandedVal, valLab, expandedTest, 'expanded')

### Making a dataset with fewer features and levels

In [13]:
# Build the reduced dataset: merge near-identical levels of foundation_type
# and roof_type into "other", then keep only the selected columns.
# Returns the reduced table; `data` must contain every column listed below.
featureEngineerRemove <- function(data) {
    mergedLevel <- "other"
    # According to the analysis, these levels were strikingly similar.
    data$foundation_type <- plyr::revalue(
        data$foundation_type, c("u" = mergedLevel, "w" = mergedLevel))
    data$roof_type <- plyr::revalue(
        data$roof_type, c("n" = mergedLevel, "q" = mergedLevel))
    reduced <- select(
        data,
        # Binary/categorical features found to be effective in the analysis
        # stage.
        foundation_type, roof_type, ground_floor_type, other_floor_type,
        legal_ownership_status,
        has_superstructure_stone_flag,
        has_superstructure_cement_mortar_stone,
        has_superstructure_cement_mortar_brick,
        has_superstructure_mud_mortar_stone,
        has_superstructure_rc_non_engineered,
        has_superstructure_rc_engineered,
        has_superstructure_timber,
        has_secondary_use, has_secondary_use_hotel, has_secondary_use_rental,
        has_secondary_use_institution, has_secondary_use_industry,
        plan_configuration,
        # Other features considered relevant according to research conducted
        # on the internet.
        geo_level_1_id, geo_level_2_id, geo_level_3_id,
        count_floors_pre_eq, age, area_percentage, height_percentage,
        land_surface_condition, count_families, building_id
    )
    return(reduced)
}

In [14]:
# Build the reduced variant of every split, then write it under the
# 'filtered' prefix.
filteredTrain <- featureEngineerRemove(train)
filteredVal <- featureEngineerRemove(val)
filteredTest <- featureEngineerRemove(test)
saveData(filteredTrain, trainLab, filteredVal, valLab, filteredTest, 'filtered')