Skip to content

Commit

Permalink
deloitte-western-australia-rental-prices
Browse files Browse the repository at this point in the history
  • Loading branch information
gtesei committed Dec 6, 2015
1 parent 33ac505 commit 0843da6
Show file tree
Hide file tree
Showing 16 changed files with 4,352 additions and 0 deletions.
171 changes: 171 additions & 0 deletions competitions/deloitte-western-australia-rental-prices/Merge_Paul.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
library(data.table)
library(xgboost)
library(fastfurious)
library(Hmisc)

### FUNCS
# Root mean squared logarithmic error between predictions and observations.
#   pred: numeric vector of predictions; any negative entry is replaced by 1.5
#         before the logarithm is taken.
#   obs:  numeric vector of observed values.
# Returns sqrt(sum((log(pred + 1) - log(obs + 1))^2) / length(pred)).
RMSLE <- function(pred, obs) {
  n_negative <- sum(pred < 0)
  if (n_negative > 0) {
    # log(pred + 1) is undefined for pred <= -1, so negatives get a fixed value.
    pred[pred < 0] <- 1.5
  }
  log_diff <- log(pred + 1) - log(obs + 1)
  sqrt(sum(log_diff^2) / length(pred))
}

### FAST-FURIOUS
# Path registry for this competition, built with the ff.* helpers of the
# fastfurious package.
# NOTE(review): presumably each `sub_path` is resolved relative to the base
# path and `createDir = TRUE` creates the directory when it is missing --
# confirm against the fastfurious documentation.
ff.setBasePath(path = "/Users/gino/kaggle/fast-furious/gitHub/fast-furious/")
ff.bindPath(
  type = "data",
  sub_path = "dataset/deloitte-western-australia-rental-prices/data"
)
ff.bindPath(
  type = "code",
  sub_path = "competitions/deloitte-western-australia-rental-prices"
)
ff.bindPath(
  type = "elab",
  sub_path = "dataset/deloitte-western-australia-rental-prices/elab",
  createDir = TRUE
)

# First ensemble: output directories.
ff.bindPath(
  type = "ensemble_1",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/ensemble_1",
  createDir = TRUE
) ## out
ff.bindPath(
  type = "best_tune_1",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/best_tune_1",
  createDir = TRUE
) ## out
ff.bindPath(
  type = "submission_1",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/pred_ensemble_1",
  createDir = TRUE
) ## out

# Second ensemble: output directories.
ff.bindPath(
  type = "ensemble_2",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/ensemble_2",
  createDir = TRUE
) ## out
ff.bindPath(
  type = "best_tune_2",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/best_tune_2",
  createDir = TRUE
) ## out
ff.bindPath(
  type = "submission_2",
  sub_path = "dataset/deloitte-western-australia-rental-prices/ensembles/pred_ensemble_2",
  createDir = TRUE
) ## out


#############################################

# File names of the two submissions to merge; both are read from the
# directory bound to "elab".
id_1 <- "submit36.csv"
id_2 <- "avg_Nov2_15.csv"
# id_2 <- "layer1_dataProcNAs4_ytranflog_modxgbTreeGTJ_eta0.02_max_depth9_tuneTRUE.csv"


# fread() returns a data.table; convert to a plain data.frame so the rest of
# the script can use base-R indexing.
pred_1 <- as.data.frame(
  fread(paste0(ff.getPath("elab"), id_1), stringsAsFactors = FALSE)
)
pred_2 <- as.data.frame(
  fread(paste0(ff.getPath("elab"), id_2), stringsAsFactors = FALSE)
)
# pred_2 <- as.data.frame(
#   fread(paste0(ff.getPath("submission_1"), id_2), stringsAsFactors = FALSE)
# )


### Compare the distributions of the two prediction columns.
# describe() prints a summary of each column (presumably Hmisc::describe,
# since Hmisc is the attached package providing a function of that name).
describe(pred_1$REN_BASE_RENT)
describe(pred_2$REN_BASE_RENT)
###
# hist() draws each histogram; the returned objects are kept so that both can
# be re-plotted together below.
p1 <- hist(pred_1$REN_BASE_RENT) # centered at 4
p2 <- hist(pred_2$REN_BASE_RENT) # centered at 6
# Overlay on a shared x-range with semi-transparent fills (alpha = 1/4):
# p2 in blue first, then p1 in red added onto the same plot.
plot( p2, col=rgb(0,0,1,1/4) ,xlim = c(0,30000)) # first histogram
plot( p1, col=rgb(1,0,0,1/4), xlim = c(0,30000) , add=T) # second

## Sweep an absolute-difference threshold between the two submissions.
## For each MAX_DELTA, rows where |pred_1 - pred_2| <= MAX_DELTA take pred_2's
## value (the rest keep pred_1), and the RMSLE of that mix against pred_1 is
## recorded. Note the column is named `rmse` but stores an RMSLE.
delta_distribution <- data.frame(
  max_delta = seq(from = 50, to = 1500, by = 10),
  rmse = NA
)

# The absolute difference does not depend on the threshold: compute it once.
delta <- abs(pred_1$REN_BASE_RENT - pred_2$REN_BASE_RENT)
n_pred <- length(pred_1$REN_BASE_RENT)

for (i in seq_along(delta_distribution$max_delta)) {
  MAX_DELTA <- delta_distribution$max_delta[i]

  delta_idx <- which(delta <= MAX_DELTA)

  pred_test <- pred_1$REN_BASE_RENT
  pred_test[delta_idx] <- pred_2$REN_BASE_RENT[delta_idx]

  rmsle_overlap <- RMSLE(pred = pred_test, obs = pred_1$REN_BASE_RENT)

  cat(
    ">>> MAX_DELTA:", MAX_DELTA,
    "--> overlapping rate: ", length(delta_idx) / n_pred,
    "--> RMSLE:", rmsle_overlap, "\n"
  )
  # Store by loop index rather than by matching the floating-point threshold.
  delta_distribution$rmse[i] <- rmsle_overlap
}

plot(x = delta_distribution$max_delta,y = delta_distribution$rmse , type = "l")

# RMSLE when every row is taken from pred_2 (printed at top level).
RMSLE(pred = pred_2$REN_BASE_RENT , obs = pred_1$REN_BASE_RENT) ## 0.5028865
## >>> MAX_DELTA: 130 --> overlapping rate: 0.504817 --> RMSLE: 0.1280602

#### Same sweep with a relative threshold.
#### For each MAX_DELTA in [0, 1], rows where |pred_1 - pred_2| / pred_1 is at
#### most MAX_DELTA take pred_2's value (the rest keep pred_1), and the RMSLE
#### of that mix against pred_1 is stored in `err_perc`.
delta_distribution_perc <- data.frame(
  max_delta = seq(from = 0, to = 1, length.out = 400),
  err_perc = NA
)

# The relative difference does not depend on the threshold: compute it once.
# Rows that evaluate to NaN (0 / 0) are dropped by which() below.
delta <- abs(
  (pred_1$REN_BASE_RENT - pred_2$REN_BASE_RENT) / pred_1$REN_BASE_RENT
)
n_pred <- length(pred_1$REN_BASE_RENT)

for (i in seq_along(delta_distribution_perc$max_delta)) {
  MAX_DELTA <- delta_distribution_perc$max_delta[i]

  delta_idx <- which(delta <= MAX_DELTA)

  pred_test <- pred_1$REN_BASE_RENT
  pred_test[delta_idx] <- pred_2$REN_BASE_RENT[delta_idx]

  rmsle_overlap <- RMSLE(pred = pred_test, obs = pred_1$REN_BASE_RENT)

  cat(
    ">>> MAX_DELTA:", MAX_DELTA,
    "--> overlapping rate: ", length(delta_idx) / n_pred,
    "--> RMSLE:", rmsle_overlap, "\n"
  )
  # Write into `err_perc`, the column this data frame actually has and the
  # one plotted below. Writing to `$rmse` here left `err_perc` all NA, which
  # makes the plot fail.
  delta_distribution_perc$err_perc[i] <- rmsle_overlap
}

plot(x = delta_distribution_perc$max_delta,y = delta_distribution_perc$err_perc , type = "l")

## >>> MAX_DELTA: 0.2982456 --> overlapping rate: 0.5010099 --> RMSLE: 0.1230421

### Blend submission: where the two predictions differ by at most MAX_DELTA in
### relative terms, use 0.6 * pred_1 + 0.4 * pred_2; elsewhere keep pred_1.
MAX_DELTA <- 0.2982456
delta <- abs(
  (pred_1$REN_BASE_RENT - pred_2$REN_BASE_RENT) / pred_1$REN_BASE_RENT
)
delta_idx <- which(delta <= MAX_DELTA)

# Not referenced again in the visible script; kept for interactive inspection.
pred_1_overlap <- pred_1$REN_BASE_RENT[delta_idx]
pred_2_overlap <- pred_2$REN_BASE_RENT[delta_idx]

pred_test <- pred_1$REN_BASE_RENT
pred_test[delta_idx] <- 0.6 * pred_1$REN_BASE_RENT[delta_idx] +
  0.4 * pred_2$REN_BASE_RENT[delta_idx]

## pred
# Validate the vector that is actually written out. These checks previously
# named `pred_avg`, which is never defined, so the script stopped here.
stopifnot(sum(is.na(pred_test)) == 0)
stopifnot(sum(pred_test == Inf) == 0)
submission <- pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(
  submission,
  quote = FALSE,
  file = paste0(ff.getPath("submission_1"), "merge_60_40_perc_err029.csv"),
  row.names = FALSE
)

#### Summaries of both inputs and of the blended result.
describe(pred_1$REN_BASE_RENT)
describe(pred_2$REN_BASE_RENT)
describe(submission$REN_BASE_RENT)

#### pred with 13 outlier
# Start from pred_1 and take pred_2's value wherever pred_2 exceeds 3100.
pred_test <- pred_1$REN_BASE_RENT
is_outlier <- pred_2$REN_BASE_RENT > 3100
pred_test[is_outlier] <- pred_2$REN_BASE_RENT[is_outlier]

## pred
# Refuse to write a submission containing NA or Inf.
stopifnot(sum(is.na(pred_test)) == 0)
stopifnot(sum(pred_test == Inf) == 0)
submission <- pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(
  submission,
  file = paste0(ff.getPath("submission_1"), "merge_Paul_13_outlier.csv"),
  quote = FALSE,
  row.names = FALSE
)

### max delta between 0.3 and 0.4
# Blend 0.6 * pred_1 + 0.4 * pred_2 only on rows whose relative difference
# lies in (0.3, 0.4]; every other row keeps pred_1.
delta <- abs(
  (pred_1$REN_BASE_RENT - pred_2$REN_BASE_RENT) / pred_1$REN_BASE_RENT
)
delta_idx <- which(delta > 0.3 & delta <= 0.4)

pred_1_overlap <- pred_1$REN_BASE_RENT[delta_idx]
pred_2_overlap <- pred_2$REN_BASE_RENT[delta_idx]

pred_test <- pred_1$REN_BASE_RENT
pred_test[delta_idx] <- 0.6 * pred_1$REN_BASE_RENT[delta_idx] +
  0.4 * pred_2$REN_BASE_RENT[delta_idx]

## pred
# Refuse to write a submission containing NA or Inf.
stopifnot(sum(is.na(pred_test)) == 0)
stopifnot(sum(pred_test == Inf) == 0)
submission <- pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(
  submission,
  file = paste0(ff.getPath("submission_1"), "merge_60_40_perc_err03_04.csv"),
  quote = FALSE,
  row.names = FALSE
)


#### Pearson correlation test between the two prediction vectors.
cor.test(x = pred_1$REN_BASE_RENT, y = pred_2$REN_BASE_RENT, method = "pearson")




139 changes: 139 additions & 0 deletions competitions/deloitte-western-australia-rental-prices/TODO
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
######### DATA ANALYSIS

- train.csv
Each row is a rental property, with a REN_ID, a VE_NUMBER, and the rental price per week (REN_BASE_RENT).
2 NAs values - cut

REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH
794528 3016221 2012-09-29 450 NA
832143 7877 2014-02-21 400 NA


A given VE_NUMBER can have more than one REN_ID (even with the same REN_DATE_EFF_FROM)

REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH
61205 1946378 2001-07-20 85 219069
61350 2116475 2001-07-20 95 219069
62697 3748629 2001-07-20 100 219069
68522 5162316 2001-07-30 75 219069
83675 1243826 2001-09-26 100 219069
83948 2853259 2001-09-26 80 219069
89142 2624206 2001-10-19 100 219069

REN_LEASE_LENGTH is "" (i.e. NA) for a fraction 0.9915789 (~99.2%) of rows --> remove

**NOTICE**
> sum(unique(train$VE_NUMBER) %in% test$VE_NUMBER)
[1] 71318

REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH
104198 3879849 2001-12-11 130 720779
166368 4712586 2002-05-31 135 720779
207121 5013937 2002-09-26 140 720779
231589 3573781 2002-12-10 135 720779
420922 1628649 2004-02-04 145 720779

> test[test$VE_NUMBER==720779,]
REN_ID REN_DATE_EFF_FROM VE_NUMBER REN_LEASE_LENGTH
47632 933694 2013-06-28 720779


- test.csv
same as train (without the response variable REN_BASE_RENT) and no NAs

############################################################################
############# LAND ###############################
############################################################################

- land
[ LAN_LDS_NUMBER ]: 0.9982159 -- NAs 1173872
[ LAN_LDS_NUMBER_ID_TYPE3 ]: 0.9876987 -- NAs 1161504
[ LAN_LDS_NUMBER_IS_RURAL ]: 0.4417664 -- NAs 519504
[ LAN_DATE_SUBDIVISION_LGA ]: 0.8037952 -- NAs 945239
[ LAN_DATE_SUBDIVISION_WAPC ]: 0.03576367 -- NAs 42057
[ LAN_SKETCH_ID ]: 0.9832725 -- NAs 1156299
[ LAN_ID1_LOT_NO ]: 0.01416533 -- NAs 16658
[ LAN_ID1_PART_LOT ]: 0.9918884 -- NAs 1166431
[ LAN_ID2_LOT ]: 0.9982772 -- NAs 1173944
[ LAN_ID2_PART_LOT ]: 0.9999269 -- NAs 1175884
[ LAN_ID3_PART_LOT ]: 0.9989855 -- NAs 1174777
[ LAN_DATE_SURVEY_STRATA ]: 0.9572634 -- NAs 1125713
[ LAN_DATE_LEASE_EXPIRY ]: 0.9863058 -- NAs 1159866
[ LAN_DATE_LEASE_FROM ]: 0.9918076 -- NAs 1166336
[ LAN_STR_ID_HAS_CORNER ]: 0.9590823 -- NAs 1127852
[ LLG_DATE_EFF_FROM ]: 6.802895e-06 -- NAs 8
[ SUB_POSTCODE ]: 1.700724e-06 -- NAs 2
[ URT_DATE_EFF_FROM ]: 2.551085e-06 -- NAs 3

- land_valuation_key
1694 VEs in land_valuation_key are associated to more than one LAN_ID
There is no evident rule behind this, judging from a comparison of two LAN_IDs associated with the same VE:
most of the fields are equal. >>> 100% pure noise >>> cut

- land_admin_areas
0.8265898 is the fraction of LAND_IDs in land that don't occur in land_admin_areas --> cut table??

- land_pins
3.097041e-05 (9) is the fraction of LAND_IDs in land that don't occur in land_pins

- land_restrictions
0.9996593 is the fraction of LAND_IDs in land that don't occur in land_restrictions --> cut ??

- land_urban
0.3122368 is the fraction of LAND_IDs in land that don't occur in land_urban --> cut ??

- land_zonings
0.156139 is the fraction of LAND_IDs in land that don't occur in land_zonings

############################################################################
############# VALUATION ENTITIES ###############################
############################################################################

- valuation_entities
all VE_NUMBER occur in train / test set

- valuation_entities_classifications
4 VE_NUMBER don't occur in test set

- valuation_entities_details
107 VE_NUMBER don't occur in train (95) / test set (15) --> ??



######### TODO

- once the best data to use has been identified, try a log transform on Ytrain
- once the best data to use has been identified, try eta 0.01

layer1_dataProcNAs4_modxgbTreeGTJ_eta0.02_max_depth9_tuneTRUE.csv: 17999 rounds (0.283003) - with pred


layer1_dataProcbase_ytranflog_modxgbTreeGTJ_eta0.02_max_depth6_tuneTRUE.csv: 53997 rounds (log 0.027037) - without pred
> mean(predTrain)
[1] 5.414936
> mean(predTest)
[1] 6.12215
>
>
>
> if (!is.null(modelList[[m]]$ytranf)) {
+ cat(">>> applying inverse tranf. of ",modelList[[m]]$ytranf," to ensembling predictions ... \n")
+ predTrain <- exp(predTrain)
+ predTest <- exp(predTest)
+ }
>>> applying inverse tranf. of log to ensembling predictions ...
> mean(predTest)
[1] 488.2577
> mean(predTrain)
[1] 276.1616

### top submission on Nov 20 -- avg_Nov2_15.csv (0.21465)

>> inside ff.xgb:: stopping [ 20998 =early.stop < nrounds= 21000 ] [perf.xg= 0.024911 ] ...

demographics.csv
demographics_key.csv
distances.csv



0 comments on commit 0843da6

Please sign in to comment.