-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
deloitte-western-australia-rental-prices
- Loading branch information
Showing
16 changed files
with
4,352 additions
and
0 deletions.
There are no files selected for viewing
171 changes: 171 additions & 0 deletions
171
competitions/deloitte-western-australia-rental-prices/Merge_Paul.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
library(data.table) | ||
library(xgboost) | ||
library(fastfurious) | ||
library(Hmisc) | ||
|
||
### FUNCS | ||
# Root Mean Squared Logarithmic Error between predictions and observations.
#
# pred: numeric vector of predicted values.
# obs:  numeric vector of observed (reference) values, same length as pred.
#
# Returns a single numeric: sqrt(sum((log(pred + 1) - log(obs + 1))^2) / length(pred)).
# Negative predictions are replaced by the constant 1.5 before taking logs.
RMSLE = function(pred, obs) {
  if (any(pred < 0)) {
    # only non-negative predictions are kept as they are
    pred = ifelse(pred >= 0, pred, 1.5)
  }
  squared_log_error = (log(pred + 1) - log(obs + 1))^2
  rmsle = sqrt(sum(squared_log_error) / length(pred))
  return(rmsle)
}
|
||
### FAST-FURIOUS | ||
## Register the directories this script reads from / writes to.
## NOTE(review): the base path is an absolute path on one machine -- adjust before running elsewhere.
ff.setBasePath(path = '/Users/gino/kaggle/fast-furious/gitHub/fast-furious/')
ff.bindPath(type = 'data' , sub_path = 'dataset/deloitte-western-australia-rental-prices/data')
ff.bindPath(type = 'code' , sub_path = 'competitions/deloitte-western-australia-rental-prices')
ff.bindPath(type = 'elab' , sub_path = 'dataset/deloitte-western-australia-rental-prices/elab' , createDir = TRUE)

## first ensemble layer
ff.bindPath(type = 'ensemble_1' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/ensemble_1',createDir = TRUE) ## out
ff.bindPath(type = 'best_tune_1' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/best_tune_1',createDir = TRUE) ## out
ff.bindPath(type = 'submission_1' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/pred_ensemble_1',createDir = TRUE) ## out

## second ensemble layer
ff.bindPath(type = 'ensemble_2' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/ensemble_2',createDir = TRUE) ## out
ff.bindPath(type = 'best_tune_2' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/best_tune_2',createDir = TRUE) ## out
ff.bindPath(type = 'submission_2' , sub_path = 'dataset/deloitte-western-australia-rental-prices/ensembles/pred_ensemble_2',createDir = TRUE) ## out
|
||
|
||
############################################# | ||
|
||
## The two prediction files to merge; both are read from the 'elab' directory.
id_1 = "submit36.csv"
id_2 = "avg_Nov2_15.csv"
#id_2 = "layer1_dataProcNAs4_ytranflog_modxgbTreeGTJ_eta0.02_max_depth9_tuneTRUE.csv"

pred_1 = as.data.frame( fread(paste0(ff.getPath("elab") , id_1) , stringsAsFactors = FALSE))
pred_2 = as.data.frame( fread(paste0(ff.getPath("elab") , id_2) , stringsAsFactors = FALSE))
#pred_2 = as.data.frame( fread(paste0(ff.getPath("submission_1") , id_2) , stringsAsFactors = FALSE))

### summary statistics of the predicted rents
describe(pred_1$REN_BASE_RENT)
describe(pred_2$REN_BASE_RENT)

### overlay the two histograms on the same axes (semi-transparent fills)
p1 <- hist(pred_1$REN_BASE_RENT) # histogram of pred_1
p2 <- hist(pred_2$REN_BASE_RENT) # histogram of pred_2
plot( p2, col=rgb(0,0,1,1/4) ,xlim = c(0,30000)) # pred_2 in blue, drawn first
plot( p1, col=rgb(1,0,0,1/4), xlim = c(0,30000) , add=TRUE) # pred_1 in red, added on top
|
||
## | ||
## Sweep an absolute-difference threshold: for each MAX_DELTA, take pred_2 wherever
## |pred_1 - pred_2| <= MAX_DELTA (pred_1 elsewhere) and record the RMSLE of that
## merged vector measured against pred_1.
delta_distribution = data.frame(max_delta = seq(from = 50,to = 1500,by = 10) , rmse = NA)

# the absolute difference does not depend on the threshold, so compute it once
delta = abs(pred_1$REN_BASE_RENT-pred_2$REN_BASE_RENT)

for (i in seq_along(delta_distribution$max_delta) ) {
  MAX_DELTA = delta_distribution$max_delta[i]
  delta_idx = which(delta<=MAX_DELTA)

  pred_test = pred_1$REN_BASE_RENT
  pred_test[delta_idx] = pred_2$REN_BASE_RENT[delta_idx]

  rmsle_overlap = RMSLE(pred=pred_test, obs=pred_1$REN_BASE_RENT)

  cat(">>> MAX_DELTA:",MAX_DELTA,"--> overlapping rate: ",length(delta_idx)/length(pred_1$REN_BASE_RENT),"--> RMSLE:",rmsle_overlap,"\n")
  # store by loop index rather than matching the threshold value with ==
  delta_distribution$rmse[i] = rmsle_overlap
}

plot(x = delta_distribution$max_delta,y = delta_distribution$rmse , type = "l")

RMSLE(pred = pred_2$REN_BASE_RENT , obs = pred_1$REN_BASE_RENT) ## 0.5028865
## >>> MAX_DELTA: 130 --> overlapping rate: 0.504817 --> RMSLE: 0.1280602
|
||
#### | ||
## Same sweep as above, but on the relative difference |pred_1 - pred_2| / pred_1:
## for each MAX_DELTA in [0, 1], take pred_2 wherever the relative difference is
## <= MAX_DELTA (pred_1 elsewhere) and record the RMSLE measured against pred_1.
delta_distribution_perc = data.frame(max_delta = seq(from = 0,to = 1,length.out = 400) , err_perc = NA)

# the relative difference does not depend on the threshold, so compute it once
delta = abs((pred_1$REN_BASE_RENT-pred_2$REN_BASE_RENT)/pred_1$REN_BASE_RENT)

for (i in seq_along(delta_distribution_perc$max_delta) ) {
  MAX_DELTA = delta_distribution_perc$max_delta[i]
  delta_idx = which(delta<=MAX_DELTA)

  pred_test = pred_1$REN_BASE_RENT
  pred_test[delta_idx] = pred_2$REN_BASE_RENT[delta_idx]

  rmsle_overlap = RMSLE(pred=pred_test, obs=pred_1$REN_BASE_RENT)

  cat(">>> MAX_DELTA:",MAX_DELTA,"--> overlapping rate: ",length(delta_idx)/length(pred_1$REN_BASE_RENT),"--> RMSLE:",rmsle_overlap,"\n")
  # write into err_perc, the column this data frame declares and the plot below reads
  delta_distribution_perc$err_perc[i] = rmsle_overlap
}

plot(x = delta_distribution_perc$max_delta,y = delta_distribution_perc$err_perc , type = "l")

## >>> MAX_DELTA: 0.2982456 --> overlapping rate: 0.5010099 --> RMSLE: 0.1230421
|
||
### | ||
### Blend 60% pred_1 / 40% pred_2 on the rows whose relative difference is
### at most MAX_DELTA; every other row keeps pred_1.
MAX_DELTA = 0.2982456
delta = abs((pred_1$REN_BASE_RENT-pred_2$REN_BASE_RENT)/pred_1$REN_BASE_RENT)
delta_idx = which(delta<=MAX_DELTA)

pred_test = pred_1$REN_BASE_RENT
pred_test[delta_idx] = 0.6 * pred_1$REN_BASE_RENT[delta_idx] + 0.4 * pred_2$REN_BASE_RENT[delta_idx]

## pred
# validate the vector that is actually written out
stopifnot(sum(is.na(pred_test))==0)
stopifnot(sum(pred_test==Inf)==0)
submission = pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(submission,
          quote=FALSE,
          file=paste0(ff.getPath("submission_1") , "merge_60_40_perc_err029.csv") ,
          row.names=FALSE)

#### compare the two inputs with the merged result
describe(pred_1$REN_BASE_RENT)
describe(pred_2$REN_BASE_RENT)
describe(submission$REN_BASE_RENT)
|
||
#### pred with 13 outlier
## Keep pred_1 everywhere except the rows where pred_2 predicts more than 3100,
## which take the pred_2 value. NOTE(review): presumably 13 rows exceed 3100 -- confirm.
is_outlier = pred_2$REN_BASE_RENT>3100
pred_test = pred_1$REN_BASE_RENT
pred_test[is_outlier] = pred_2$REN_BASE_RENT[is_outlier]

## pred
stopifnot(sum(is.na(pred_test))==0)
stopifnot(sum(pred_test==Inf)==0)
submission = pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(submission,
          quote=FALSE,
          file=paste0(ff.getPath("submission_1") , "merge_Paul_13_outlier.csv") ,
          row.names=FALSE)
|
||
### max delta between 0.3 and 0.4
## Blend 60% pred_1 / 40% pred_2 only on the rows whose relative difference
## lies in (0.3, 0.4]; every other row keeps pred_1.
delta = abs((pred_1$REN_BASE_RENT-pred_2$REN_BASE_RENT)/pred_1$REN_BASE_RENT)
delta_idx = which(delta<=0.4 & delta>0.3)

pred_test = pred_1$REN_BASE_RENT
pred_test[delta_idx] = 0.6 * pred_1$REN_BASE_RENT[delta_idx] + 0.4 * pred_2$REN_BASE_RENT[delta_idx]

## pred
stopifnot(sum(is.na(pred_test))==0)
stopifnot(sum(pred_test==Inf)==0)
submission = pred_1
submission$REN_BASE_RENT <- pred_test
print(head(submission))
write.csv(submission,
          quote=FALSE,
          file=paste0(ff.getPath("submission_1") , "merge_60_40_perc_err03_04.csv") ,
          row.names=FALSE)

#### Pearson correlation between the two prediction vectors
cor.test(x = pred_1$REN_BASE_RENT,y = pred_2$REN_BASE_RENT,method = "pearson")
|
||
|
||
|
||
|
139 changes: 139 additions & 0 deletions
139
competitions/deloitte-western-australia-rental-prices/TODO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
######### DATA ANALYSIS | ||
|
||
- train.csv | ||
Each row is a rental property, with a REN_ID, a VE_NUMBER, and the rental price per week (REN_BASE_RENT). | ||
2 NAs values - cut | ||
|
||
REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH | ||
794528 3016221 2012-09-29 450 NA | ||
832143 7877 2014-02-21 400 NA | ||
|
||
|
||
there can be more than one REN_ID for a given VE_NUMBER (even with the same REN_DATE_EFF_FROM) | ||
|
||
REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH | ||
61205 1946378 2001-07-20 85 219069 | ||
61350 2116475 2001-07-20 95 219069 | ||
62697 3748629 2001-07-20 100 219069 | ||
68522 5162316 2001-07-30 75 219069 | ||
83675 1243826 2001-09-26 100 219069 | ||
83948 2853259 2001-09-26 80 219069 | ||
89142 2624206 2001-10-19 100 219069 | ||
|
||
REN_LEASE_LENGTH is 0.9915789% "" (i.e. NAs) --> remove | ||
|
||
**NOTICE** | ||
> sum(unique(train$VE_NUMBER) %in% test$VE_NUMBER) | ||
[1] 71318 | ||
|
||
REN_ID REN_DATE_EFF_FROM REN_BASE_RENT VE_NUMBER REN_LEASE_LENGTH | ||
104198 3879849 2001-12-11 130 720779 | ||
166368 4712586 2002-05-31 135 720779 | ||
207121 5013937 2002-09-26 140 720779 | ||
231589 3573781 2002-12-10 135 720779 | ||
420922 1628649 2004-02-04 145 720779 | ||
|
||
> test[test$VE_NUMBER==720779,] | ||
REN_ID REN_DATE_EFF_FROM VE_NUMBER REN_LEASE_LENGTH | ||
47632 933694 2013-06-28 720779 | ||
|
||
|
||
- test.csv | ||
same as train (without the response variable REN_BASE_RENT) and no NAs | ||
|
||
############################################################################ | ||
############# LAND ############################### | ||
############################################################################ | ||
|
||
- land | ||
[ LAN_LDS_NUMBER ]: 0.9982159 -- NAs 1173872 | ||
[ LAN_LDS_NUMBER_ID_TYPE3 ]: 0.9876987 -- NAs 1161504 | ||
[ LAN_LDS_NUMBER_IS_RURAL ]: 0.4417664 -- NAs 519504 | ||
[ LAN_DATE_SUBDIVISION_LGA ]: 0.8037952 -- NAs 945239 | ||
[ LAN_DATE_SUBDIVISION_WAPC ]: 0.03576367 -- NAs 42057 | ||
[ LAN_SKETCH_ID ]: 0.9832725 -- NAs 1156299 | ||
[ LAN_ID1_LOT_NO ]: 0.01416533 -- NAs 16658 | ||
[ LAN_ID1_PART_LOT ]: 0.9918884 -- NAs 1166431 | ||
[ LAN_ID2_LOT ]: 0.9982772 -- NAs 1173944 | ||
[ LAN_ID2_PART_LOT ]: 0.9999269 -- NAs 1175884 | ||
[ LAN_ID3_PART_LOT ]: 0.9989855 -- NAs 1174777 | ||
[ LAN_DATE_SURVEY_STRATA ]: 0.9572634 -- NAs 1125713 | ||
[ LAN_DATE_LEASE_EXPIRY ]: 0.9863058 -- NAs 1159866 | ||
[ LAN_DATE_LEASE_FROM ]: 0.9918076 -- NAs 1166336 | ||
[ LAN_STR_ID_HAS_CORNER ]: 0.9590823 -- NAs 1127852 | ||
[ LLG_DATE_EFF_FROM ]: 6.802895e-06 -- NAs 8 | ||
[ SUB_POSTCODE ]: 1.700724e-06 -- NAs 2 | ||
[ URT_DATE_EFF_FROM ]: 2.551085e-06 -- NAs 3 | ||
|
||
- land_valuation_key | ||
1694 VEs in land_valuation_key are associated with more than one LAN_ID, | ||
but with no apparent rule: comparing two LAN_IDs associated with the same VE, | ||
most of the fields are equal. >>> 100% pure noise >>> cut | ||
|
||
- land_admin_areas | ||
0.8265898 is the fraction of LAND_IDs in land that don't occur in land_admin_areas --> cut table?? | ||
|
||
- land_pins | ||
3.097041e-05 (9 rows) is the fraction of LAND_IDs in land that don't occur in land_pins | ||
|
||
- land_restrictions | ||
0.9996593 is the fraction of LAND_IDs in land that don't occur in land_restrictions --> cut ?? | ||
|
||
- land_urban | ||
0.3122368 is the fraction of LAND_IDs in land that don't occur in land_urban --> cut ?? | ||
|
||
- land_zonings | ||
0.156139 is the fraction of LAND_IDs in land that don't occur in land_zonings | ||
|
||
############################################################################ | ||
############# VALUATION ENTITIES ############################### | ||
############################################################################ | ||
|
||
- valuation_entities | ||
all VE_NUMBER occur in train / test set | ||
|
||
- valuation_entities_classifications | ||
4 VE_NUMBER don't occur in test set | ||
|
||
- valuation_entities_details | ||
107 VE_NUMBER don't occur in train (95) / test set (15) --> ?? | ||
|
||
|
||
|
||
######### TODO | ||
|
||
- once the best data to use has been identified, try a log transformation of Ytrain | ||
- once the best data to use has been identified, try eta 0.01 | ||
|
||
layer1_dataProcNAs4_modxgbTreeGTJ_eta0.02_max_depth9_tuneTRUE.csv: 17999 rounds (0.283003) - with pred | ||
|
||
|
||
layer1_dataProcbase_ytranflog_modxgbTreeGTJ_eta0.02_max_depth6_tuneTRUE.csv: 53997 rounds (log 0.027037) - without pred | ||
> mean(predTrain) | ||
[1] 5.414936 | ||
> mean(predTest) | ||
[1] 6.12215 | ||
> | ||
> | ||
> | ||
> if (!is.null(modelList[[m]]$ytranf)) { | ||
+ cat(">>> applying inverse tranf. of ",modelList[[m]]$ytranf," to ensembling predictions ... \n") | ||
+ predTrain <- exp(predTrain) | ||
+ predTest <- exp(predTest) | ||
+ } | ||
>>> applying inverse tranf. of log to ensembling predictions ... | ||
> mean(predTest) | ||
[1] 488.2577 | ||
> mean(predTrain) | ||
[1] 276.1616 | ||
|
||
### top submission on Nov 20 -- avg_Nov2_15.csv (0.21465) | ||
|
||
>> inside ff.xgb:: stopping [ 20998 =early.stop < nrounds= 21000 ] [perf.xg= 0.024911 ] ... | ||
|
||
demographics.csv | ||
demographics_key.csv | ||
distances.csv | ||
|
||
|
||
|
Oops, something went wrong.