Skip to content

Commit

Permalink
coupon-purchase-prediction
Browse files Browse the repository at this point in the history
  • Loading branch information
gtesei committed Oct 5, 2015
1 parent 20ac1a8 commit 476b7f4
Show file tree
Hide file tree
Showing 16 changed files with 2,954 additions and 0 deletions.
272 changes: 272 additions & 0 deletions competitions/coupon-purchase-prediction/Exploratory_1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
library(fastfurious)
library(data.table)
library(plyr)

################# FAST-FURIOUS
## Register the project root and the sub-folders this script reads from / writes to.
ff.setBasePath(path = "/Users/gino/kaggle/fast-furious/gitHub/fast-furious/")
ff.bindPath(type = "data", sub_path = "dataset/coupon-purchase-prediction/data")
ff.bindPath(type = "elab", sub_path = "dataset/coupon-purchase-prediction/elab", createDir = TRUE)
ff.bindPath(type = "process", sub_path = "data_process")

################# DATA
## Each CSV is read with fread() and converted to a plain data.frame.
## File names are appended directly to the bound 'data' path (no separator added).

## User table.
## NOTE(review): `train_enc` receives the same table as `users`; it is not used
## anywhere else in the visible script - confirm whether it is still needed.
users <- train_enc <- as.data.frame(fread(paste0(ff.getPath("data"), "user_list.csv")))

## Coupon master data (train / test)
coupon_list_train <- as.data.frame(fread(paste0(ff.getPath("data"), "coupon_list_train.csv")))
coupon_list_test <- as.data.frame(fread(paste0(ff.getPath("data"), "coupon_list_test.csv")))

## Purchase log (train only)
coupon_detail_train <- as.data.frame(fread(paste0(ff.getPath("data"), "coupon_detail_train.csv")))

## Coupon listing areas (train / test)
coupon_area_train <- as.data.frame(fread(paste0(ff.getPath("data"), "coupon_area_train.csv")))
coupon_area_test <- as.data.frame(fread(paste0(ff.getPath("data"), "coupon_area_test.csv")))

###############
## Inner join: one row per purchase, enriched with the attributes of the purchased coupon.
coupon_sales_train <- merge(
  x = coupon_list_train,
  y = coupon_detail_train,
  by = "COUPON_ID_hash",
  all = FALSE
)

## Purchases that do NOT happen strictly before the end of the display period
with(coupon_sales_train, sum(!(I_DATE < DISPEND)))
## [1] 4
with(coupon_sales_train, coupon_sales_train[which(!(I_DATE < DISPEND)), ])
## out of range by a matter of minutes
# DISPEND I_DATE
# 2011-12-03 12:00:00 2011-12-03 12:00:01
# 2011-08-13 13:00:00 2011-08-13 18:26:53
# 2011-08-13 13:00:00 2011-08-13 13:06:20
# 2011-07-17 12:00:00 2011-07-17 12:00:01

## let's check on the test set
with(coupon_list_test, sum(DISPEND < as.Date("2012-06-24")))
# [1] 0

with(coupon_list_test, sum(DISPFROM > as.Date("2012-06-30")))
# [1] 0

#### >>>> so all the test-set coupons are active during the test period

## Purchases that do NOT happen strictly after the start of the display period
with(coupon_sales_train, sum(!(I_DATE > DISPFROM)))
## [1] 0

## Same checks against the validity period (these columns contain NAs)
with(coupon_sales_train, sum(!(I_DATE > VALIDFROM), na.rm = TRUE))
# [1] 97123
with(coupon_sales_train, sum(!(I_DATE < VALIDEND), na.rm = TRUE))
# [1] 1

#### >>> it follows that a coupon is sold at a time within [coupon_sales$DISPFROM , coupon_sales$DISPEND]

#### >>> moreover, since VALIDFROM and VALIDEND have about 65,000 NAs each --> they are discarded


#### how many coupons did an average user buy in a week?

## The training set spans the dates 2011-07-01 to 2012-06-23
## The test set spans the week after the end of the training set, 2012-06-24 to 2012-06-30

## First purchase day in the training set (time of day is dropped by as.Date)
day_1 <- as.Date(min(coupon_detail_train$I_DATE))
# "2011-07-01 00:10:42"

## Last purchase day in the training set
day_max_train <- as.Date(max(coupon_detail_train$I_DATE))
# "2012-06-23 23:54:47"

## Last day of the test week
day_last <- as.Date("2012-06-30")

day_last - day_max_train
# Time difference of 7 days

day_last - day_1
# Time difference of 365 days

## Number of 7-day weeks spanned by the training set.
## This must use the same floor-based formula as the per-purchase week number
## below: with 358 days between first and last purchase, 1 + floor(358 / 7) = 52,
## whereas the previous 1 + ceiling(358 / 7) = 53 was one more than the largest
## week number actually assigned (and than the 52 weekly columns aggregated below).
weeks <- 1 + floor(as.numeric(day_max_train - day_1) / 7)
# [1] 52

## Example: week number (1-based, counted in 7-day blocks from day_1) of one day
a_day <- as.Date("2011-07-01 18:32:55")
week_number_of_a_day <- 1 + floor(as.numeric(a_day - day_1) / 7)
# [1] 1

## Week number of every purchase
coupon_sales_train$I_DATE_WN <- 1 + floor(as.numeric(as.Date(coupon_sales_train$I_DATE) - day_1) / 7)

## Encode the week number with the fast-furious helper; the code below relies on
## the result exposing one column per week named I_DATE_WN_<week>.
l <- ff.encodeCategoricalFeature(
  data.train = coupon_sales_train$I_DATE_WN,
  data.test = coupon_sales_train$I_DATE_WN,
  asNumericSequence = TRUE,
  colname.prefix = "I_DATE_WN"
)

coupon_sales_train <- cbind(coupon_sales_train, l$traindata)

## One row per user: number of coupons bought in each week
week_cols <- paste0("I_DATE_WN_", seq_len(weeks))
clust <- ddply(coupon_sales_train, .(USER_ID_hash), function(x) {
  # column-wise totals keep the I_DATE_WN_<week> names
  colSums(x[, week_cols, drop = FALSE])
})

## check
coupon_sales_train[coupon_sales_train$USER_ID_hash == "0000b53e182165208887ba65c079fc21", ] ## ok
coupon_sales_train[coupon_sales_train$USER_ID_hash == "00035b86e6884589ec8d28fbf2fe7757", ] ## ok
coupon_sales_train[coupon_sales_train$USER_ID_hash == "000cc06982785a19e2a2fdb40b1c9d59", ] ## ok

## Largest number of coupons bought by a single user in a single week
max(apply(X = clust[2:ncol(clust)], MARGIN = 2, FUN = max))
# [1] 31

## Week number (1-based, 7-day blocks counted from day_1) of a calendar day.
## wn() is called by the plot annotations but is not defined in this script, so it
## is defined here unless a function with that name is already available.
if (!exists("wn", mode = "function")) {
  wn <- function(day) 1 + floor(as.numeric(as.Date(day) - day_1) / 7)
}

## Draw one bar per week, annotate three reference dates and overlay the
## 5-week moving sum of the series.
##   values    : named numeric vector, one value per week (week 1 first)
##   main      : plot title
##   label_y   : y positions of the labels, named christmas / thanksgiving / june30
##   label_col : colour of the labels
## Returns (invisibly) the x coordinates of the bar midpoints.
plot_weekly_sales <- function(values, main, label_y, label_col) {
  n_weeks <- length(values)
  bar_mid <- barplot(values,
                     legend.text = FALSE,
                     main = main,
                     col = terrain.colors(n_weeks),
                     beside = TRUE,
                     xlab = "Week",
                     ylab = "Sales (units)")

  # barplot() does not place bar k at x = k: translate a week number into the
  # x coordinate of its bar (extrapolating for weeks past the last bar, such
  # as the test week) so that each label sits over the week it refers to.
  bar_step <- if (n_weeks > 1) bar_mid[2] - bar_mid[1] else 1
  week_x <- function(day) bar_mid[1] + (wn(day = day) - 1) * bar_step

  text(week_x("2011-12-25"), label_y[["christmas"]], "Christmas", cex = 0.8, col = label_col)
  text(week_x("2011-11-26"), label_y[["thanksgiving"]], "Thanksgiving day", cex = 0.8, col = label_col)
  text(week_x("2012-06-30"), label_y[["june30"]], "June 30th", cex = 0.8, col = label_col)

  # Overlay drawn on its own (hidden) axes, so its scale differs from the bars'.
  # stats::filter is named explicitly so that a filter() from another attached
  # package cannot be picked up instead.
  par(new = TRUE)
  plot(x = seq_len(n_weeks), y = stats::filter(values, rep(1, 5)),
       type = "l", col = "blue", lty = 6, lwd = 2, axes = FALSE, xlab = "", ylab = "")

  invisible(bar_mid)
}

## Max coupons sales per user/week
cat(">>> Coupons max sales per user/week ...\n")
# names are taken from the number of weekly columns actually present in clust
as <- setNames(vapply(clust[-1], max, numeric(1)), seq_len(ncol(clust) - 1))
plot_weekly_sales(as,
                  main = "Max coupons sales per user by week",
                  label_y = c(christmas = 25, thanksgiving = 28, june30 = 25),
                  label_col = "purple")

## Average coupons sales per user/week
cat(">>> Coupons average sales per user/week ...\n")
as <- setNames(vapply(clust[-1], mean, numeric(1)), seq_len(ncol(clust) - 1))
plot_weekly_sales(as,
                  main = "Average coupons sales per user (at least 1 purchase) by week",
                  label_y = c(christmas = 0.2, thanksgiving = 0.23, june30 = 0.23),
                  label_col = "brown")

########
## Size of the user table
dim(users)
# [1] 22873 6

### almost all users bought at least 1 coupon
length(unique(coupon_detail_train[["USER_ID_hash"]]))
# 22782

## cluster by user: total purchases and number of distinct coupons bought
c2 <- ddply(coupon_detail_train, .(USER_ID_hash), function(purchases) {
  c(
    tot = nrow(purchases),
    uniq_coupon = length(unique(purchases[["COUPON_ID_hash"]]))
  )
})


## on average each user bought 7.4 coupons
mean(c2[["tot"]])
# 7.417962

## of which 6.9 (almost all) are different from each other
mean(c2[["uniq_coupon"]])
# 6.976253

###### coupon_area_train , coupon_area_test
## number of listing-area rows per test coupon
c3 <- ddply(coupon_area_test, .(COUPON_ID_hash), function(areas) c(num = nrow(areas)))

### on average there are 6.9 coupon listing areas for each coupon in the test set
mean(c3[["num"]])
# 6.983871

# there are not many prefectures
length(unique(coupon_area_train[["PREF_NAME"]]))
# 47

## nor many small areas
length(unique(coupon_area_train[["SMALL_AREA_NAME"]]))
# 55


####
## Prefectures that appear both in the user table and in the coupon list (ken_name)
intersect(unique(users[["PREF_NAME"]]), coupon_list_train[["ken_name"]])
# [1] "東京都" "愛知県" "神奈川県" "広島県" "埼玉県" "奈良県" "石川県" "大阪府" "熊本県" "福岡県" "北海道" "京都府" "秋田県"
# [14] "千葉県" "長崎県" "兵庫県" "沖縄県" "三重県" "茨城県" "鹿児島県" "宮城県" "静岡県" "和歌山県" "長野県" "岡山県" "栃木県"
# [27] "滋賀県" "富山県" "佐賀県" "宮崎県" "岩手県" "新潟県" "大分県" "山口県" "岐阜県" "群馬県" "福島県" "愛媛県" "香川県"
# [40] "山梨県" "高知県" "島根県" "徳島県" "福井県" "青森県" "山形県" "鳥取県"

## Prefectures that appear both in the user table and in the coupon listing areas
intersect(unique(users[["PREF_NAME"]]), coupon_area_train[["PREF_NAME"]])
# [1] "東京都" "愛知県" "神奈川県" "広島県" "埼玉県" "奈良県" "石川県" "大阪府" "熊本県" "福岡県" "北海道" "京都府" "秋田県"
# [14] "千葉県" "長崎県" "兵庫県" "沖縄県" "三重県" "茨城県" "鹿児島県" "宮城県" "静岡県" "和歌山県" "長野県" "岡山県" "栃木県"
# [27] "滋賀県" "富山県" "佐賀県" "宮崎県" "岩手県" "新潟県" "大分県" "山口県" "岐阜県" "群馬県" "福島県" "愛媛県" "香川県"
# [40] "山梨県" "高知県" "島根県" "徳島県" "福井県" "青森県" "山形県" "鳥取県"

## Small areas that appear both in the coupon list and in the purchase log
intersect(
  unique(coupon_list_train[["small_area_name"]]),
  unique(coupon_detail_train[["SMALL_AREA_NAME"]])
)
# [1] "兵庫" "銀座・新橋・東京・上野" "恵比寿・目黒・品川" "渋谷・青山・自由が丘"
# [5] "新宿・高田馬場・中野・吉祥寺" "群馬" "愛知" "山形"
# [9] "赤坂・六本木・麻布" "川崎・湘南・箱根他" "埼玉" "横浜"
# [13] "栃木" "広島" "池袋・神楽坂・赤羽" "三重"
# [17] "岐阜" "静岡" "キタ" "ミナミ他"
# [21] "滋賀" "京都" "北海道" "石川"
# [25] "長野" "千葉" "和歌山" "鹿児島"
# [29] "佐賀" "長崎" "福岡" "大分"
# [33] "宮崎" "沖縄" "立川・町田・八王子他" "岩手"
# [37] "富山" "島根" "山口" "奈良"
# [41] "福島" "青森" "宮城" "茨城"
# [45] "秋田" "岡山" "愛媛" "熊本"
# [49] "香川" "徳島" "高知" "福井"
# [53] "新潟" "鳥取" "山梨"

## Small areas that appear both in the coupon listing areas and in the purchase log
intersect(
  unique(coupon_area_train[["SMALL_AREA_NAME"]]),
  unique(coupon_detail_train[["SMALL_AREA_NAME"]])
)
# [1] "埼玉" "千葉" "新宿・高田馬場・中野・吉祥寺" "京都"
# [5] "恵比寿・目黒・品川" "銀座・新橋・東京・上野" "愛知" "川崎・湘南・箱根他"
# [9] "北海道" "福岡" "栃木" "ミナミ他"
# [13] "渋谷・青山・自由が丘" "池袋・神楽坂・赤羽" "赤坂・六本木・麻布" "横浜"
# [17] "宮城" "福島" "大分" "高知"
# [21] "立川・町田・八王子他" "広島" "新潟" "岡山"
# [25] "愛媛" "香川" "キタ" "徳島"
# [29] "兵庫" "岐阜" "宮崎" "長崎"
# [33] "山梨" "石川" "山口" "富山"
# [37] "山形" "秋田" "鳥取" "奈良"
# [41] "鹿児島" "三重" "熊本" "長野"
# [45] "滋賀" "静岡" "青森" "茨城"
# [49] "群馬" "福井" "和歌山" "沖縄"
# [53] "佐賀" "島根" "岩手"


### note that in about 30% of the cases the prefecture (PREF_NAME) is missing in users
with(users, sum(PREF_NAME == "") / length(PREF_NAME))
## [1] 0.3172299

##### >>> ok, let's check whether the purchases made by a user (who is registered in a
##### given prefecture) match the prefecture of the purchase
## purchases enriched with the attributes of the buyer
sales_users <- merge(x = users, y = coupon_detail_train, by = "USER_ID_hash", all = FALSE)
## ... and, where the (small area, coupon) pair is listed, with the listing prefecture
sales_users_ext <- merge(
  x = sales_users,
  y = coupon_area_train,
  by = c("SMALL_AREA_NAME", "COUPON_ID_hash"),
  all.x = TRUE,
  all.y = FALSE
)

### there are 209 cases (=0.1%) of transactions that happened outside the expected geographic areas
lapply(sales_users_ext, function(column) sum(is.na(column)))
# $PREF_NAME.y
# [1] 209

## per user: number of distinct small areas / listing prefectures among the purchases
clust_user <- ddply(sales_users_ext, .(USER_ID_hash), function(user_rows) {
  c(
    num_small_area = length(unique(user_rows[["SMALL_AREA_NAME"]])),
    num_pref_area = length(unique(user_rows[["PREF_NAME.y"]]))
  )
})

mean(clust_user[["num_small_area"]], na.rm = TRUE)
# 3.463656

###
mean(clust_user[["num_pref_area"]], na.rm = TRUE)
# 2.527566
max(clust_user[["num_pref_area"]], na.rm = TRUE)
# 24


## in 68% of the cases the prefecture of the transaction != the prefecture of the user
## it may mean that Japanese people travel a lot for shopping, or that the data is dirty, or both
## >> in any case the user's prefecture does not look strongly correlated with the area where the transaction happens !!
with(sales_users_ext, sum(PREF_NAME.y != PREF_NAME.x, na.rm = TRUE))
# 115594

## let's see whether the prefecture in coupon_list is more correlated
sales_users_ext2 <- merge(
  x = sales_users,
  y = coupon_list_train,
  by = c("COUPON_ID_hash"),
  all.x = TRUE,
  all.y = FALSE
)


## in 75% of the cases the user's prefecture has nothing to do with the prefecture of the coupon (list) where transactions took place
with(sales_users_ext2, sum(PREF_NAME != ken_name))
# 128282

## let's see whether the coupon information and the transactions agree more closely
sales_coupon <- merge(x = coupon_list_train, y = coupon_detail_train, by = "COUPON_ID_hash", all = FALSE)

with(sales_coupon, sum(small_area_name != SMALL_AREA_NAME) / length(small_area_name))
# 0.4620228


27 changes: 27 additions & 0 deletions competitions/coupon-purchase-prediction/TODO.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

Vector components:
------------------------------------------------------------

LIST.CAPSULE_TEXT_x (1=<x<=25)
LIST.GENRE_NAME_x (1=<x<=13)

LIST.LARGE_AREA_NAME_x (1=<x<9)
AREA.SMALL_AREA_NAME_x (1=<x<=55)
AREA.PREF_NAME_x (1=<x<=47) <<<< remove because it is redundant with SMALL_AREA_NAME,
except for 3 cases (PREF_NAME=2,4,9) where the same PREF_NAME
is associated with more than one SMALL_AREA_NAME

LIST.BID_CATALOG_PRICE_x (1=<x<=5)
LIST.BID_PRICE_RATE_x (1=<x<=3)

LIST.WDISPPERIOD_x (0=<x<=6) <<<<< removed as component vector



************ TODO ************

-




41 changes: 41 additions & 0 deletions competitions/coupon-purchase-prediction/WSPeriod_exploratory.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
library(fastfurious)
library(data.table)
library(plyr)
library(Hmisc)

### FUNCS

### FAST-FURIOUS
## Register the project root plus the folders holding the code and the raw data.
ff.setBasePath(path = "/Users/gino/kaggle/fast-furious/gitHub/fast-furious/")
ff.bindPath(type = "code", sub_path = "competitions/coupon-purchase-prediction")
ff.bindPath(type = "data", sub_path = "dataset/coupon-purchase-prediction/data")

### GLOBAL CONFIG
debug <- FALSE

### DATA & COUPON VECTORS
## make_coupon_vector.R is sourced before the elab folders are bound (order kept as is).
## NOTE(review): getLabels(), coupon_list_train.meta and coupon_detail_train.meta are
## not defined in this script - presumably they come from the sourced file; confirm.
source(paste0(ff.getPath("code"), "make_coupon_vector.R"))
ff.bindPath(type = "elab", sub_path = "dataset/coupon-purchase-prediction/elab", createDir = TRUE)
ff.bindPath(type = "elab_train", sub_path = "dataset/coupon-purchase-prediction/elab/train", createDir = TRUE)
ff.bindPath(type = "elab_labels", sub_path = "dataset/coupon-purchase-prediction/elab/labels", createDir = TRUE)
ff.bindPath(type = "elab_pred", sub_path = "dataset/coupon-purchase-prediction/elab/pred", createDir = TRUE)


### PROCESSING
## Week under analysis and the coupons returned for it by getLabels()
wn <- 1
coupons <- getLabels(week_number = wn)$coupons

## Coupon attributes and purchase rows restricted to those coupons
coupons_red <- coupon_list_train.meta[
  which(coupon_list_train.meta[["COUPON_ID_hash"]] %in% coupons),
]
trans <- coupon_detail_train.meta[
  which(coupon_detail_train.meta[["COUPON_ID_hash"]] %in% coupons),
]

## Attach the coupon attributes to every purchase (left join on the coupon id)
trans_ext <- merge(
  x = trans,
  y = coupon_list_train.meta,
  by = "COUPON_ID_hash",
  all.x = TRUE,
  all.y = FALSE
)
## the joined table must not contain any missing value
stopifnot(sum(is.na(trans_ext)) == 0)

## WDISPPERIOD distribution: per purchase (top) vs per coupon (bottom)
par(mfrow = c(2, 1))
hist(trans_ext[["WDISPPERIOD"]])
hist(coupons_red[["WDISPPERIOD"]])

describe(trans_ext[["WDISPPERIOD"]])
describe(coupons_red[["WDISPPERIOD"]])





0 comments on commit 476b7f4

Please sign in to comment.